# 4. Embeddings
## Making TF-IDF Movie Recommendations with LLMs

In [8]:
import json
import os
import re

import numpy as np
import pandas as pd
import requests
from IPython.display import Markdown, display, Code

In [4]:
# The API key is read from the LLM_API_KEY environment variable so that the
# secret is never stored in the notebook source or its saved outputs.
# NOTE(review): a literal key used to be written on this line; treat that key as
# compromised and rotate it.
api_key = os.environ.get("LLM_API_KEY", "")
if not api_key:
    print("LLM_API_KEY is not set; export it before calling the LLM API.")

In [5]:
def check_balance(api_key, timeout=30):
    """Query the credit endpoint for the given API key.

    Args:
        api_key: Value sent verbatim in the ``Authorization`` header.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the kernel indefinitely.

    Returns:
        The decoded JSON body when the server answers with HTTP 200; otherwise a
        dict holding the ``statusCode`` and the raw ``body`` bytes of the
        response.
    """
    url = "https://llm.api.ai8.io/credit"
    headers = {'Authorization': api_key}
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        # Errors are reported as data rather than raised.
        return {"statusCode": response.status_code, "body": response.content}
    return json.loads(response.content)

In [7]:
# Client for the LLM endpoint used to generate the TF-IDF and Word2Vec code
def get_resp_oai(input_text, model, timeout=300):
    """Send one user message to the LLM endpoint and return the formatted reply.

    The request is authorised with the notebook-level ``api_key`` variable, so
    that variable must be defined before this function is called.

    Args:
        input_text: Text sent as the ``user`` message.
        model: Model identifier placed in the request payload.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the kernel indefinitely.

    Returns:
        On HTTP 200, the string produced by ``extract_message_oai`` for the
        decoded response. On any other status, a dict with the ``statusCode``
        and raw ``body`` bytes; callers that concatenate the result with a
        string must check for this case.
    """
    url = "https://llm.api.ai8.io/query_llm"
    data = {
        # Specify the model that you want to use
        "model": model,
        "messages": [
                    {"role": "system", "content": "You act as a highly intelligent recommendation system. Your job is to analyse the user and movie data and rank the top 20 candidate movies to the user is most likely to watch next based on their historical data provided and the last movie the user has watched."},
                    {"role": "user", "content": input_text}
        ]
    }
    headers = {'Authorization': api_key}
    response = requests.post(url, json=data, headers=headers, timeout=timeout)
    if response.status_code != 200:
        # Errors are reported as data rather than raised.
        return {"statusCode": response.status_code, "body": response.content}
    return extract_message_oai(json.loads(response.content))

def extract_message_oai(response_data):
    """Pull the first choice's message text out of a chat response.

    Args:
        response_data: Decoded JSON response (a dict). The text is looked up at
            ``choices[0].message.content``.

    Returns:
        The message text wrapped between two Markdown horizontal rules. If the
        response has no choices, no message, or a null content, the wrapped
        text is empty instead of an exception being raised.
    """
    choices = response_data.get("choices") or []
    # Indexing [0] on an empty list would raise IndexError, so guard it.
    first_choice = choices[0] if choices else {}
    message_content = (first_choice.get("message") or {}).get("content") or ""
    # format the extracted message as markdown
    return "---\n\n" + message_content + "\n\n---"

In [8]:
# Load the final merged dataframe from the CSV in the working directory and
# preview its first rows.
# NOTE(review): the preview shows 'Unnamed: 0' / 'Unnamed: 0.1' columns, which
# look like row indices saved into the CSV -- confirm whether they should be dropped.
final_movies = pd.read_csv('final_merged_df.csv')
final_movies.head()

Unnamed: 0.1,Unnamed: 0,director_name,duration,actor_2_name,genres_x,actor_1_name,movie_title,actor_3_name,movie_imdb_link,language,...,Genre_Western,duration_category,rating_category,budget_category,revenue_category,budget_revenue_ratio,log_budget,log_revenue,log_title_year,sqr_root_duration
0,0,James Cameron,178.0,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,Wes Studi,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,English,...,0,165-180 Minutes,7-8,Greater than 200 Million,Greater than 1.8 Billion,0.085009,19.283571,21.748578,7.60589,13.341664
1,1,Gore Verbinski,169.0,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean At Worlds End,Jack Davenport,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,English,...,0,165-180 Minutes,7-8,Greater than 200 Million,800-1000 Million,0.312176,19.519293,20.683485,7.604894,13.0
2,2,Sam Mendes,148.0,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,Stephanie Sigman,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,English,...,0,135-150 Minutes,6-7,Greater than 200 Million,800-1000 Million,0.278197,19.316769,20.596199,7.608871,12.165525
3,3,Christopher Nolan,164.0,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,Joseph Gordon-Levitt,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,English,...,0,150-165 Minutes,8-9,Greater than 200 Million,1-1.20 Billion,0.230429,19.336971,20.80479,7.607381,12.806248
4,4,Andrew Stanton,132.0,Samantha Morton,Action|Adventure|Sci-Fi,Daryl Sabara,John Carter,Polly Walker,http://www.imdb.com/title/tt0401729/?ref_=fn_t...,English,...,0,120-135 Minutes,6-7,Greater than 200 Million,200-400 Million,0.915046,19.376192,19.464974,7.607381,11.489125


In [9]:
# Check the structure of the dataframe for prompting: list the column names
# (a similar list is hard-coded in the prompt text built by define_prompt()).
final_movies.columns

Index(['Unnamed: 0', 'director_name', 'duration', 'actor_2_name', 'genres_x',
       'actor_1_name', 'movie_title', 'actor_3_name', 'movie_imdb_link',
       'language', 'country', 'title_year', 'imdb_score', 'budget',
       'original_title', 'revenue', 'tagline', 'Genre_Action',
       'Genre_Adventure', 'Genre_Animation', 'Genre_Biography', 'Genre_Comedy',
       'Genre_Crime', 'Genre_Documentary', 'Genre_Drama', 'Genre_Family',
       'Genre_Fantasy', 'Genre_Film-Noir', 'Genre_History', 'Genre_Horror',
       'Genre_Music', 'Genre_Musical', 'Genre_Mystery', 'Genre_News',
       'Genre_Romance', 'Genre_Sci-Fi', 'Genre_Sport', 'Genre_Thriller',
       'Genre_War', 'Genre_Western', 'duration_category', 'rating_category',
       'budget_category', 'revenue_category', 'budget_revenue_ratio',
       'log_budget', 'log_revenue', 'log_title_year', 'sqr_root_duration'],
      dtype='object')

### Part I: Get TF-IDF and Word2Vec embeddings

In [53]:
def define_prompt():
    """Return the fixed prompt asking the LLM for TF-IDF and Word2Vec embedding code.

    The prompt is a hard-coded literal. It embeds the column list, five sample
    rows as JSON and a ``DataFrame.info()`` summary of the movie dataset;
    nothing is read from the live dataframe, so the text must be updated by
    hand if the dataset changes.

    Returns:
        The prompt as a single multi-line string.
    """
    prompt = """
    **Task**: Create embeddings for a movie dataset using Term Frequency-Inverse Document Frequency (TF-IDF) and Word2Vec (W2V) approaches. You must consider the appropriate data preprocessing steps. 
    
    **Dataset Format**: The dataset is read in a CSV format with rows for each movie and columns including movie title, budget, imdb_score and other columns. These are all the name columns included in the dataset: ['director_name', 'duration', 'actor_2_name', 'genres_x', 'actor_1_name',
       'movie_title', 'actor_3_name', 'movie_imdb_link', 'language', 'country',
       'title_year', 'imdb_score', 'budget', 'original_title', 'revenue',
       'tagline', 'Genre_Action', 'Genre_Adventure', 'Genre_Animation',
       'Genre_Biography', 'Genre_Comedy', 'Genre_Crime', 'Genre_Documentary',
       'Genre_Drama', 'Genre_Family', 'Genre_Fantasy', 'Genre_Film-Noir',
       'Genre_History', 'Genre_Horror', 'Genre_Music', 'Genre_Musical',
       'Genre_Mystery', 'Genre_News', 'Genre_Romance', 'Genre_Sci-Fi',
       'Genre_Sport', 'Genre_Thriller', 'Genre_War', 'Genre_Western',
       'duration_category', 'rating_category', 'budget_category',
       'revenue_category', 'budget_revenue_ratio', 'log_budget', 'log_revenue',
       'log_title_year', 'sqr_root_duration'].

        This is the head of the dataset in json format, for your reference: Dataset Head = [{"index":0,"director_name":"James Cameron","duration":"178.0","actor_2_name":"Joel David Moore","genres_x":"Action|Adventure|Fantasy|Sci-Fi","actor_1_name":"CCH Pounder","movie_title":"Avatar","actor_3_name":"Wes Studi","movie_imdb_link":"http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1","language":"English","country":"USA","title_year":"2009.0","imdb_score":7.9,"budget":237000000,"original_title":"Avatar","revenue":"2787965087.0","tagline":"Enter the World of Pandora.","Genre_Action":1,"Genre_Adventure":1,"Genre_Animation":0,"Genre_Biography":0},{"index":1,"director_name":"Gore Verbinski","duration":"169.0","actor_2_name":"Orlando Bloom","genres_x":"Action|Adventure|Fantasy","actor_1_name":"Johnny Depp","movie_title":"Pirates of the Caribbean At Worlds End","actor_3_name":"Jack Davenport","movie_imdb_link":"http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1","language":"English","country":"USA","title_year":"2007.0","imdb_score":7.1,"budget":300000000,"original_title":"Pirates of the Caribbean At Worlds End","revenue":"961000000.0","tagline":"At the end of the world, the adventure begins.","Genre_Action":1,"Genre_Adventure":1,"Genre_Animation":0,"Genre_Biography":0},{"index":2,"director_name":"Sam Mendes","duration":"148.0","actor_2_name":"Rory Kinnear","genres_x":"Action|Adventure|Thriller","actor_1_name":"Christoph Waltz","movie_title":"Spectre","actor_3_name":"Stephanie Sigman","movie_imdb_link":"http://www.imdb.com/title/tt2379713/?ref_=fn_tt_tt_1","language":"English","country":"UK","title_year":"2015.0","imdb_score":6.8,"budget":245000000,"original_title":"Spectre","revenue":"880674609.0","tagline":"A Plan No One Escapes","Genre_Action":1,"Genre_Adventure":1,"Genre_Animation":0,"Genre_Biography":0},{"index":3,"director_name":"Christopher Nolan","duration":"164.0","actor_2_name":"Christian Bale","genres_x":"Action|Thriller","actor_1_name":"Tom 
Hardy","movie_title":"The Dark Knight Rises","actor_3_name":"Joseph Gordon-Levitt","movie_imdb_link":"http://www.imdb.com/title/tt1345836/?ref_=fn_tt_tt_1","language":"English","country":"USA","title_year":"2012.0","imdb_score":8.5,"budget":250000000,"original_title":"The Dark Knight Rises","revenue":"1084939099.0","tagline":"The Legend Ends","Genre_Action":1,"Genre_Adventure":0,"Genre_Animation":0,"Genre_Biography":0},{"index":4,"director_name":"Andrew Stanton","duration":"132.0","actor_2_name":"Samantha Morton","genres_x":"Action|Adventure|Sci-Fi","actor_1_name":"Daryl Sabara","movie_title":"John Carter","actor_3_name":"Polly Walker","movie_imdb_link":"http://www.imdb.com/title/tt0401729/?ref_=fn_tt_tt_1","language":"English","country":"USA","title_year":"2012.0","imdb_score":6.6,"budget":260000000,"original_title":"John Carter","revenue":"284139100.0","tagline":"Lost in our world, found in another.","Genre_Action":1,"Genre_Adventure":1,"Genre_Animation":0,"Genre_Biography":0}]
        
        More information about the dataset (we get this when executing movies_dataset_name.info()): <class 'pandas.core.frame.DataFrame'>
        Index: 4144 entries, 0 to 4161
        Data columns (total 48 columns):
         #   Column                Non-Null Count  Dtype   
        ---  ------                --------------  -----   
         0   director_name         4144 non-null   object  
         1   duration              4142 non-null   float64 
         2   actor_2_name          4140 non-null   object  
         3   genres_x              4144 non-null   object  
         4   actor_1_name          4141 non-null   object  
         5   movie_title           4144 non-null   object  
         6   actor_3_name          4135 non-null   object  
         7   movie_imdb_link       4144 non-null   object  
         8   language              4137 non-null   object  
         9   country               4144 non-null   object  
         10  title_year            4144 non-null   float64 
         11  imdb_score            4144 non-null   float64
         12  budget                4144 non-null   int64   
         13  original_title        4144 non-null   object  
         14  revenue               4144 non-null   float64 
         15  tagline               3584 non-null   object  
         16  Genre_Action          4144 non-null   int64   
         17  Genre_Adventure       4144 non-null   int64   
         18  Genre_Animation       4144 non-null   int64   
         19  Genre_Biography       4144 non-null   int64   
         20  Genre_Comedy          4144 non-null   int64   
         21  Genre_Crime           4144 non-null   int64   
         22  Genre_Documentary     4144 non-null   int64   
         23  Genre_Drama           4144 non-null   int64   
         24  Genre_Family          4144 non-null   int64   
         25  Genre_Fantasy         4144 non-null   int64   
         26  Genre_Film-Noir       4144 non-null   int64   
         27  Genre_History         4144 non-null   int64   
         28  Genre_Horror          4144 non-null   int64   
         29  Genre_Music           4144 non-null   int64   
         30  Genre_Musical         4144 non-null   int64   
         31  Genre_Mystery         4144 non-null   int64   
         32  Genre_News            4144 non-null   int64   
         33  Genre_Romance         4144 non-null   int64   
         34  Genre_Sci-Fi          4144 non-null   int64   
         35  Genre_Sport           4144 non-null   int64   
         36  Genre_Thriller        4144 non-null   int64   
         37  Genre_War             4144 non-null   int64   
         38  Genre_Western         4144 non-null   int64   
         39  duration_category     4142 non-null   category
         40  rating_category       4144 non-null   category
         41  budget_category       3430 non-null   category
         42  revenue_category      3126 non-null   category
        43  budget_revenue_ratio  3552 non-null   float64 
         44  log_budget            4144 non-null   float64 
         45  log_revenue           4144 non-null   float64 
         46  log_title_year        4144 non-null   float64 
         47  sqr_root_duration     4142 non-null   float64 
        dtypes: category(4), float64(9), int64(24), object(11)
        memory usage: 1.4+ MB

    **Dataset Description:** You have been provided with a movie dataset containing 48 features of 4144 movies. Each entry in the dataset represents a single movie along with its associated features.

    **Objective:**
        1. TF-IDF Embeddings: Implement TF-IDF vectorization on the movie dataset to generate embeddings.
        2. Word2Vec Embeddings: Implement Word2Vec model training on the movie dataset to generate embeddings.
    
    **Deliverables:**
        1. Python code implementing the generation of TF-IDF and Word2Vec embeddings for the provided movie dataset.
        2. Embeddings files or data structures containing the embeddings generated for each movie.
   
    **Requirements:**
        * Use Python for implementation.
        * Generate executable jupyter notebook code chunks.
        * Include the appropriate data preprocessing steps. 
        * Utilise appropriate libraries (e.g., scikit-learn, Gensim) for TF-IDF and Word2Vec implementations.
        * Provide comments or documentation within the code to explain the process and any parameters used.
    """
    return prompt

In [54]:
# Build the task prompt and send it to the model.
prompt = define_prompt()
msg_1 = get_resp_oai(prompt,"gpt-4-0125-preview")
# Render the reply as Markdown inside a coloured <div>.
# NOTE: the concatenation below assumes msg_1 is a string; get_resp_oai returns
# a dict when the HTTP status is not 200, which would raise a TypeError here.
display(Markdown("<div style='color: #34568B;'>\n\n" + msg_1))

<div style='color: #34568B;'>

---

To achieve the objective of creating embeddings for the given movie dataset using Term Frequency-Inverse Document Frequency (TF-IDF) and Word2Vec (W2V) approaches, I'll walk you through both methods step by step, including the appropriate data preprocessing techniques.

### 1. Data Preprocessing

Data preprocessing is crucial for both TF-IDF and Word2Vec models to ensure the models have clean and relevant data to work with. Given our dataset, we'll focus on text-based features such as `movie_title`, `original_title`, `tagline`, and `genres_x` as they are likely to provide the most meaningful context for generating embeddings.

First, let's prepare our environment and load the dataset:

```python
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import normalize
import nltk

nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset (assuming the file path is correct)
dataset_path = 'path_to_your_dataset.csv' # Please replace this with your dataset path
movies_dataset = pd.read_csv(dataset_path)

# Preview the dataset
print(movies_dataset.head())
```

#### Cleaning and Preparing Text Data

For both TF-IDF and Word2Vec models, our text data must be cleaned and preprocessed. This involves converting to lowercase, removing punctuation, stop words, and tokenization.

```python
# Function to clean text data
def clean_text(text):
    if not pd.isnull(text):
        # Convert text to lowercase
        text = text.lower()
        # Remove numbers and punctuation
        text = re.sub(r'[^\w\s]', '', text)
        # Tokenize the text
        tokens = word_tokenize(text)
        # Remove stop words
        stop_words = set(stopwords.words('english'))
        tokens = [w for w in tokens if not w in stop_words]
        # Join the tokens back into a string
        text = ' '.join(tokens)
    else:
        text = ''
    return text

# Columns to preprocess
text_columns = ['movie_title', 'original_title', 'tagline', 'genres_x']

# Apply cleaning function to text columns
for column in text_columns:
    movies_dataset[column] = movies_dataset[column].apply(clean_text)

# Combine relevant text features into a single text corpus per movie
movies_dataset['text_corpus'] = movies_dataset['movie_title'] + ' ' + movies_dataset['original_title'] + ' ' + movies_dataset['tagline'] + ' ' + movies_dataset['genres_x']
```

### 2. TF-IDF Embeddings

Now, let's create TF-IDF embeddings for our dataset.

```python
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Generate TF-IDF embeddings
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_dataset['text_corpus'])

# Normalize TF-IDF embeddings
tfidf_matrix = normalize(tfidf_matrix)
```

### 3. Word2Vec Embeddings

For Word2Vec, we'll need to prepare our corpus slightly differently.

```python
# Prepare corpus for Word2Vec
corpus = [word_tokenize(doc) for doc in movies_dataset['text_corpus']]

# Initialize and train the Word2Vec model
word2vec_model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)

# Generate Word2Vec embeddings (averaged across all words in the document's corpus)
word2vec_embeddings = np.array([np.mean([word2vec_model.wv[word] for word in document if word in word2vec_model.wv] or [np.zeros(100)], axis=0) for document in corpus])
```

### Deliverables

1. **TF-IDF Embeddings:** The `tfidf_matrix` from the TF-IDF section contains the embeddings for each movie based on the `text_corpus`. You can save or use this matrix as needed.

2. **Word2Vec Embeddings:** The `word2vec_embeddings` array from the Word2Vec section contains the embeddings for each movie. Each embedding is the averaged vector of all word vectors in the movie's `text_corpus`.

### Additional Notes:

- The implemented preprocessing steps, such as tokenization and stop words removal, are essential for both models to focus on the most meaningful words in the texts.
- Parameters for both TF-IDF and Word2Vec (such as `vector_size`, `window`, etc.) can be tuned according to specific requirements or based on experimentation to improve the performance of downstream tasks.
- For more accurate or contextual embeddings, consider additional preprocessing or feature engineering steps based on the dataset's characteristics and the target application.

---

In [55]:
def helper_extraction_prompt(resp_str):
    """Build the follow-up prompt that asks the LLM to turn a reply into exec-able code.

    Args:
        resp_str: The earlier LLM reply to embed in the prompt.

    Returns:
        The prompt string with ``resp_str`` interpolated into it.
    """
    template = (
        "Can you transform this string: {resp_str} into an executable string by extracting the python code and keeping the rest of the information as comments? "
        "Please place comment symbols where necessary and keep in mind that the given response string will be executed in a python code chunk using the function:'exec(given_response_string)."
    )
    return template.format(resp_str=resp_str)

In [56]:
# Ask the model to rewrite its first reply (msg_1) as a single executable
# string, then show the result with Python syntax highlighting.
prompt2 = helper_extraction_prompt(msg_1)
msg_2 = get_resp_oai(prompt2,"gpt-4-0125-preview")
display(Code(msg_2, language='python'))

In [57]:
# Preview only: show msg_2 with the placeholder dataset path replaced by our CSV.
# The result is displayed, not stored; the exec cell rebuilds the same expression.
# The [15:len(msg_2)-8] slice drops the first 15 and last 8 characters of msg_2
# (presumably the '---' rules and the ```python fence around the code -- this
# depends on the exact formatting of the reply).
msg_2[15:len(msg_2)-8].replace('path_to_your_dataset.csv', 'final_merged_df.csv')

"# To achieve the objective of creating embeddings for the given movie dataset using Term Frequency-Inverse Document Frequency (TF-IDF) and Word2Vec (W2V) approaches, I'll walk you through both methods step by step, including the appropriate data preprocessing techniques.\n\n# 1. Data Preprocessing\n\n# Data preprocessing is crucial for both TF-IDF and Word2Vec models to ensure the models have clean and relevant data to work with. Given our dataset, we'll focus on text-based features such as `movie_title`, `original_title`, `tagline`, and `genres_x` as they are likely to provide the most meaningful context for generating embeddings.\n\n# First, let's prepare our environment and load the dataset:\n\n# Import necessary libraries\nimport pandas as pd\nimport numpy as np\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom gensim.models import Word2Vec\nimport string\nimport re\nfrom nltk.corpus import stopwords\nfrom nltk.tokenize import word_tokenize\nfrom sklearn.preproces

In [9]:
!pip install gensim

Collecting gensim
  Obtaining dependency information for gensim from https://files.pythonhosted.org/packages/22/40/7d2cce3ad4ad5d02aa68e253e6ea5f0acc381f02f594e235fe00a274faff/gensim-4.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached gensim-4.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.3 kB)
Using cached gensim-4.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
Installing collected packages: gensim
Successfully installed gensim-4.3.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [58]:
def _extract_generated_code(reply):
    """Return the Python source contained in an LLM reply.

    Replaces the former fixed-offset slice, which only worked when the reply
    was wrapped in exactly '---', a blank line and a ```python fence.

    Args:
        reply: The string returned by get_resp_oai.

    Returns:
        The contents of every fenced code block joined by newlines; if the
        reply contains no fence, the reply without the leading and trailing
        '---' rules.

    Raises:
        RuntimeError: If ``reply`` is not a string (get_resp_oai returns a dict
            when the HTTP request fails).
    """
    if not isinstance(reply, str):
        raise RuntimeError(f"LLM request failed, nothing to execute: {reply!r}")
    fenced_blocks = re.findall(r"```[\w+-]*[ \t]*\n(.*?)```", reply, flags=re.DOTALL)
    if fenced_blocks:
        return "\n".join(fenced_blocks)
    return re.sub(r"\A---\n\n|\n\n---\Z", "", reply)


# SECURITY NOTE: exec() runs model-generated code with full access to this
# kernel; review the displayed code before running this cell.
# The placeholder dataset path in the generated code is replaced by our CSV.
exec(_extract_generated_code(msg_2).replace('path_to_your_dataset.csv', 'final_merged_df.csv'))

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   Unnamed: 0      director_name  duration      actor_2_name  \
0           0      James Cameron     178.0  Joel David Moore   
1           1     Gore Verbinski     169.0     Orlando Bloom   
2           2         Sam Mendes     148.0      Rory Kinnear   
3           3  Christopher Nolan     164.0    Christian Bale   
4           4     Andrew Stanton     132.0   Samantha Morton   

                          genres_x     actor_1_name  \
0  Action|Adventure|Fantasy|Sci-Fi      CCH Pounder   
1         Action|Adventure|Fantasy      Johnny Depp   
2        Action|Adventure|Thriller  Christoph Waltz   
3                  Action|Thriller        Tom Hardy   
4          Action|Adventure|Sci-Fi     Daryl Sabara   

                              movie_title          actor_3_name  \
0                                  Avatar             Wes Studi   
1  Pirates of the Caribbean At Worlds End        Jack Davenport   
2                                 Spectre      Stephanie Sigman   
3               

In [59]:
# Inspect the TF-IDF matrix created by the exec'd generated code.
tfidf_matrix

<4144x7295 sparse matrix of type '<class 'numpy.float64'>'
	with 26752 stored elements in Compressed Sparse Row format>

In [60]:
# Inspect the per-movie Word2Vec vectors created by the exec'd generated code.
word2vec_embeddings

array([[-0.0062125 ,  0.00827003,  0.00314423, ..., -0.00956431,
        -0.00091494,  0.00272677],
       [-0.01262492,  0.01499987,  0.00747827, ..., -0.0206518 ,
         0.00198918, -0.00159149],
       [-0.01069544,  0.01570955,  0.0025968 , ..., -0.01611487,
         0.00268714, -0.00023023],
       ...,
       [-0.00255603,  0.0029258 , -0.00304272, ..., -0.00521181,
         0.00146588,  0.00587271],
       [-0.00592796,  0.00554268,  0.00104809, ..., -0.00599817,
         0.00168504,  0.00219389],
       [-0.00652725,  0.00920803, -0.00099061, ..., -0.01675735,
         0.0017123 , -0.00069297]], dtype=float32)

### Part II: Get movie recommendations based on similarity scores

In [125]:
def helper_extraction_prompt_rec(resp_str):
    """Build the prompt asking the LLM to add cosine-similarity computation.

    Args:
        resp_str: The earlier LLM reply (generated code) to embed in the prompt.

    Returns:
        The prompt string with ``resp_str`` interpolated into it.
    """
    template = (
        "Can you transform this string: {resp_str} so that it calculates the cosine similarity for TF-IDF embeddings and Word2Vec? "
        "Please place comment symbols where necessary and keep in mind that the given response string will be executed in a python code chunk using the function:'exec(given_response_string)."
    )
    return template.format(resp_str=resp_str)

In [126]:
# Ask the model to extend the generated code (msg_2) with cosine-similarity
# computation, then show the reply with Python syntax highlighting.
prompt3 = helper_extraction_prompt_rec(msg_2)
msg_3 = get_resp_oai(prompt3,"gpt-4-0125-preview")
display(Code(msg_3, language ='python'))

In [127]:
def helper_extraction_prompt_rec(resp_str):
    """Build the prompt asking the LLM to add a top-20 similar-movies lookup.

    NOTE: this definition reuses the name of an earlier prompt builder in this
    notebook and replaces it from this cell onwards.

    Args:
        resp_str: The earlier LLM reply (generated code) to embed in the prompt.

    Returns:
        The prompt string with ``resp_str`` interpolated into it.
    """
    # The column is called 'movie_title' in the dataset; the prompt previously
    # misspelled it as 'movie_tile'.
    return f"Can you transform this string: {resp_str} so that it also finds the indices of the top 20 similar movies based on a 'movie_title'? Please extract all code and place comment symbols where necessary keeping in mind that the given response string will be executed in a python code chunk using the function: 'exec(given_response_string). Only provide the code chunk."

In [128]:
# Ask the model to extend the similarity code (msg_3) with a top-20 lookup by
# movie title, then show the reply with Python syntax highlighting.
prompt31 = helper_extraction_prompt_rec(msg_3)
msg_31 = get_resp_oai(prompt31,"gpt-4-0125-preview")
display(Code(msg_31, language ='python'))

In [2]:
# Python source kept as a string literal; it is patched with str.replace and
# run with exec() in a later cell (presumably a hand-pasted copy of the code
# the model returned in msg_31 -- confirm).
# The text must stay byte-for-byte stable: the later cell matches parts of it
# verbatim ('path_to_your_dataset.csv', 'print(movies_dataset.head())',
# 'example_movie_title' and the whole "clean and combine text columns" section).
msg_311 = """
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import normalize
import nltk
from sklearn.metrics.pairwise import cosine_similarity

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
dataset_path = 'path_to_your_dataset.csv'
movies_dataset = pd.read_csv(dataset_path)

print(movies_dataset.head())

# Function to clean text data
def clean_text(text):
    if not pd.isnull(text):
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        tokens = word_tokenize(text)
        stop_words = set(stopwords.words('english'))
        tokens = [w for w in tokens if not w in stop_words]
        text = ' '.join(tokens)
    else:
        text = ''
    return text

# Columns containing textual information
text_columns = ['movie_title', 'original_title', 'tagline', 'genres_x']

# Clean text columns
for column in text_columns:
    movies_dataset[column] = movies_dataset[column].apply(clean_text)

# Combine text columns into a single corpus
movies_dataset['text_corpus'] = movies_dataset['movie_title'] + ' ' + movies_dataset['original_title'] + ' ' + movies_dataset['tagline'] + ' ' + movies_dataset['genres_x']

# Initialize TF-IDF Vectorizer and Word2Vec model
tfidf_vectorizer = TfidfVectorizer()

# Generate TF-IDF features
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_dataset['text_corpus'])
tfidf_matrix = normalize(tfidf_matrix)

# Calculate Cosine Similarity for TF-IDF
tfidf_cosine_similarity = cosine_similarity(tfidf_matrix)

# Prepare corpus for Word2Vec
corpus = [word_tokenize(doc) for doc in movies_dataset['text_corpus']]
word2vec_model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)

# Generate Word2Vec features
word2vec_embeddings = np.array([np.mean([word2vec_model.wv[word] for word in document if word in word2vec_model.wv] or [np.zeros(100)], axis=0) for document in corpus])

# Define function to calculate cosine similarity for Word2Vec
def calculate_cosine_similarity_word2vec(vectors):
    similarity = cosine_similarity(vectors)
    return similarity

# Calculate Cosine Similarity for Word2Vec
word2vec_cosine_similarity = calculate_cosine_similarity_word2vec(word2vec_embeddings)

# Function to find top N similar movies based on movie title using TF-IDF or Word2Vec similarity
def top_n_similar_movies(title, n=20, use_tfidf=True):
    # Find index of the movie title
    idx = movies_dataset[movies_dataset['movie_title'] == title].index[0]

    # Choose similarity matrix
    similarity_matrix = tfidf_cosine_similarity if use_tfidf else word2vec_cosine_similarity

    # Get similarity values with other movies
    # Skip the movie itself
    similarity_scores = list(enumerate(similarity_matrix[idx]))
    similarity_scores = [x for x in similarity_scores if x[0] != idx]

    # Sort movies based on similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top N most similar movies
    top_n_similar_indices = [x[0] for x in similarity_scores[:n]]

    # Return the top N similar movies
    return movies_dataset['movie_title'].iloc[top_n_similar_indices]

# Example to find top 20 similar movies for a given title
top_similar_movies = top_n_similar_movies('example_movie_title', 20, use_tfidf=True)
print(top_similar_movies)
"""

In [32]:
# Exact text of the "clean and combine text columns" section of msg_311 that is
# to be swapped out. It is matched verbatim by str.replace in the exec cell, so
# any whitespace difference from msg_311 makes the replacement a silent no-op.
replace1 = """
# Columns containing textual information
text_columns = ['movie_title', 'original_title', 'tagline', 'genres_x']

# Clean text columns
for column in text_columns:
    movies_dataset[column] = movies_dataset[column].apply(clean_text)

# Combine text columns into a single corpus
movies_dataset['text_corpus'] = movies_dataset['movie_title'] + ' ' + movies_dataset['original_title'] + ' ' + movies_dataset['tagline'] + ' ' + movies_dataset['genres_x']
"""
# Substitute section: builds text_corpus from a wider set of columns and no
# longer overwrites the source columns with their cleaned text, so
# 'movie_title' keeps its original casing for the title lookup.
# NOTE(review): 'movie_imdb_link' is included as a text column; after cleaning,
# each URL presumably collapses into a token unique to one movie -- confirm it
# is wanted in the corpus.
replacement1 = """
# Columns containing textual information
text_columns =['director_name', 'actor_2_name', 'genres_x', 'actor_1_name',
       'movie_title', 'actor_3_name', 'movie_imdb_link', 'language', 'country',
       'original_title', 'tagline', 'duration_category', 'budget_category',
       'revenue_category']

movies_dataset['text_corpus'] = ""
# Clean text columns
for column in text_columns:
    movies_dataset['text_corpus'] += movies_dataset[column].apply(clean_text) + ' '
"""

Human intervention was required to get the generated code to run successfully.

In [33]:
# Run the code stored in msg_311 after four textual substitutions: the real CSV
# path, removal of the head() print, 'Avatar' as the query title, and the wider
# text-column section (replace1 -> replacement1).
# SECURITY NOTE: exec() runs the string with full access to this kernel.
exec(msg_311.replace('path_to_your_dataset.csv', "final_merged_df.csv").replace("print(movies_dataset.head())","").replace("example_movie_title","Avatar").replace(replace1,replacement1))

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


25                                     Titanic
96            The Hobbit An Unexpected Journey
16                                The Avengers
1574                             The New World
24                                   King Kong
14                                Man of Steel
10                            Superman Returns
3788                                   Hatchet
3                        The Dark Knight Rises
1932                                 RoboCop 3
5                                  SpiderMan 3
2002                                    Orphan
4                                  John Carter
429                                    FaceOff
1       Pirates of the Caribbean At Worlds End
288                                End of Days
9            Batman v Superman Dawn of Justice
3592                  Enter the Dangerous Mind
3312                                 Apollo 18
26                   Captain America Civil War
Name: movie_title, dtype: object
