### Data Cleaning and Preprocessing
In this notebook the datasets from MovieLens 25M and movies_metadata will be cleaned, merged into one dataframe, and then a corpus will be created and preprocessed

In [None]:
#Import Necessary Libraries
import numpy as np
import pandas as pd
import re
import json
import ast

### movies.csv
contains 62423 rows and no nulls
movieId
title (title and year)
genres

To Clean:
1. make the title and year separate columns
2. make the genres into a list

In [None]:
#Upload data and create dataframe
# Reads movies.csv from the current working directory.
movies_df = pd.read_csv("movies.csv")
# NOTE(review): only the last expression of a cell is rendered, so this sample
# is computed but never displayed.
movies_df.sample(20)
# Prints column names, non-null counts and dtypes.
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [None]:
#Pull the year from the title and make a column of years
pattern = re.compile(r"\((\d+)\)")


def _strip_single_year(title):
    """Return `title` without its "(digits)" group when exactly one such group exists.

    Titles with zero or several "(digits)" groups are returned unchanged.
    """
    years = pattern.findall(title)
    if len(years) == 1:
        # Remove only the parenthesised group. Replacing the bare digits would
        # also erase the same number elsewhere in the title (e.g. "1984 (1984)").
        return title.replace('(' + years[0] + ')', ' ')
    return title


# Every "(digits)" group found in each title, as a list of strings per movie.
year_list = [pattern.findall(title) for title in movies_df['title']]
movies_df['year'] = year_list

# Assign the whole column at once. The previous element-wise chained assignment
# (movies_df['title'][i] = ...) raised SettingWithCopyWarning and is not
# guaranteed to write back into movies_df.
movies_df['title'] = movies_df['title'].map(_strip_single_year)

#Remove Special Characters from title column
movies_df['title'] = movies_df['title'].map(lambda x: re.sub(r'\W+', ' ', x))

#Remove special characters from genres
movies_df['genres'] = movies_df['genres'].map(lambda x: re.sub(r'\W+', ' ', x))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


### tags.csv
1093360 rows
contains tags assigned to the movies by user
* Create a column in the dataframe with a list of the tags users assigned to each movie.  This will be used in the corpus

In [None]:
# Reads tags.csv from the current working directory.
tags_df = pd.read_csv("tags.csv")
# NOTE(review): not the last expression of the cell, so this preview is not rendered.
tags_df.head()
# Prints column names, non-null counts and dtypes.
tags_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1093360 entries, 0 to 1093359
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   userId     1093360 non-null  int64 
 1   movieId    1093360 non-null  int64 
 2   tag        1093344 non-null  object
 3   timestamp  1093360 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 33.4+ MB


In [None]:
#make a list of all the tags for each movie
# A single groupby pass replaces re-filtering the whole tags frame once per
# movie. sort=False keeps the groups in order of first appearance, which is the
# order tags_df['movieId'].unique() returns, and rows keep their original order
# inside each group, so both lists match what the per-movie loop produced
# (missing tags included).
tags_by_movie = tags_df.groupby('movieId', sort=False)['tag'].agg(list)
unique_movieid = tags_by_movie.index.tolist()
movie_tag_list = tags_by_movie.tolist()

In [None]:
#Make a dataframe with the movieId and user_tag_list
# Both columns are supplied to the constructor in one go.
movie_tag = pd.DataFrame({
    'movieId': unique_movieid,
    'user_tag_list': movie_tag_list,
})
movie_tag.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45251 entries, 0 to 45250
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   movieId        45251 non-null  int64 
 1   user_tag_list  45251 non-null  object
dtypes: int64(1), object(1)
memory usage: 707.2+ KB


In [None]:
#Merge the movies_df and movie_tag to create one dataframe
# Outer join on movieId: movies without any user tag are kept.
df = movies_df.merge(movie_tag, on="movieId", how="outer")

### movies_metadata.csv
45466 rows
To clean we need to 
1. determine which columns to keep (imdb_id, original_title, title, overview, production_companies, tagline) and leave out the rest (e.g. spoken_languages)
2. clean the strings of dictionaries
3. append to dataframe


In [None]:
#Upload the dataframe using specified columns
# Only these columns are read from movies_metadata.csv; every other column is skipped.
cols = ['imdb_id', 'original_title','title','overview', 'production_companies', 'tagline']
metadata_df = pd.read_csv("movies_metadata.csv", usecols = cols)
# Prints column names, non-null counts and dtypes.
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   imdb_id               45449 non-null  object
 1   original_title        45466 non-null  object
 2   overview              44512 non-null  object
 3   production_companies  45463 non-null  object
 4   tagline               20412 non-null  object
 5   title                 45460 non-null  object
dtypes: object(6)
memory usage: 2.1+ MB


In [None]:
#Drop exact duplicate rows, then every row that has no imdb_id or no overview
metadata_df1 = (
    metadata_df
    .drop_duplicates()
    .dropna(subset=['imdb_id', 'overview'])
)

In [None]:
#Locate empty rows in production companies
# Rows whose production_companies field holds the literal string 'False'.
is_false_marker = metadata_df1['production_companies'] == 'False'
metadata_df1.loc[is_false_marker]
#Drop the rows
metadata_df1 = metadata_df1.loc[~is_false_marker]

In [None]:
#Cleaning Production companies

def _company_names(raw):
    """Return the list of company names encoded in one production_companies cell.

    `raw` is expected to be the text of a Python list of dicts that each carry
    a 'name' key. Anything that is not a string, cannot be parsed, or does not
    have that shape yields NaN.
    """
    if not isinstance(raw, str):
        return np.nan
    try:
        # The cell holds Python literals, not JSON. Swapping quote characters
        # and calling json.loads breaks on names that contain an apostrophe or
        # a double quote, which silently turned those rows into NaN.
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return np.nan
    if not isinstance(parsed, list):
        return np.nan
    try:
        return [company['name'] for company in parsed]
    except (KeyError, TypeError):
        return np.nan


#Split data into two datasets, one with no production companies listed
#and a second with production companies
# NOTE: the names are inverted with respect to their content and are kept only
# so later cells that may reference them keep working:
#   emptylist -> rows that DO list production companies
#   fulllist  -> rows whose production_companies is the empty list '[]'
has_companies = metadata_df1['production_companies'] != '[]'
emptylist = metadata_df1.loc[has_companies].reset_index()
fulllist = metadata_df1.loc[~has_companies].reset_index()

# Rows without any company get NaN.
fulllist['production_companies'] = np.nan

# Rows with companies get the list of names, or NaN when the cell is unparsable.
emptylist['production_companies'] = emptylist['production_companies'].map(_company_names)

frames = [emptylist, fulllist]
test = pd.concat(frames)
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44464 entries, 0 to 11295
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   index                 44464 non-null  int64 
 1   imdb_id               44464 non-null  object
 2   original_title        44464 non-null  object
 3   overview              44464 non-null  object
 4   production_companies  32688 non-null  object
 5   tagline               20389 non-null  object
 6   title                 44461 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.7+ MB


In [None]:
#Remove the "tt" prefix from imdb_id
# Only the two-letter prefix is stripped. Slicing three characters also removed
# the first digit of the identifier, so ids that differ only in that digit
# (e.g. "tt0112281" and "tt2112281") collapsed onto the same number.
imdb_digits = test['imdb_id'].str.replace(r'^tt', '', regex=True)
#Make imdb_id a data type integer
test['imdb_id'] = imdb_digits.astype(int)
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44464 entries, 0 to 11295
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   index                 44464 non-null  int64 
 1   imdb_id               44464 non-null  int64 
 2   original_title        44464 non-null  object
 3   overview              44464 non-null  object
 4   production_companies  32688 non-null  object
 5   tagline               20389 non-null  object
 6   title                 44461 non-null  object
dtypes: int64(2), object(5)
memory usage: 2.7+ MB


While examining the dataset, it became clear that some of the movies had incorrect imdb labels. The following cells locate those rows and remove them from the dataframe.

In [None]:
#Rows whose imdb_id occurs more than once (every occurrence is kept)
shares_imdb_id = test.duplicated('imdb_id', keep=False)
duplicate = test[shares_imdb_id]
duplicate.info()
#First row seen for each imdb_id
is_repeat = test.duplicated(subset=["imdb_id"])
unique = test[~is_repeat]
unique.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1225 entries, 18 to 11291
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   index                 1225 non-null   int64 
 1   imdb_id               1225 non-null   int64 
 2   original_title        1225 non-null   object
 3   overview              1225 non-null   object
 4   production_companies  877 non-null    object
 5   tagline               540 non-null    object
 6   title                 1225 non-null   object
dtypes: int64(2), object(5)
memory usage: 76.6+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 43851 entries, 0 to 11295
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   index                 43851 non-null  int64 
 1   imdb_id               43851 non-null  int64 
 2   original_title        43851 non-null  object
 3   overview              43851 

### links.csv

Use this dataframe to connect the previous two; this will also help to eliminate rows with incorrect imdb_ids

In [None]:
# Reads links.csv from the current working directory.
links_df = pd.read_csv("links.csv")
# NOTE(review): not the last expression of the cell, so the shape is not rendered.
links_df.shape
# Prints column names, non-null counts and dtypes.
links_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  62423 non-null  int64  
 1   imdbId   62423 non-null  int64  
 2   tmdbId   62316 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.4 MB


In [None]:
#Rename columns so the dataframes have matching columns
# links.csv calls the key "imdbId"; the metadata frame calls it "imdb_id".
links_df = links_df.rename({"imdbId": "imdb_id"}, axis="columns")


Merge the duplicate dataset with the links
Then merge the links with df then compare titles

In [None]:
# merge duplicates with links
# Attaches movieId / tmdbId to every row that shares its imdb_id with another row.
duplicates_links = pd.merge(duplicate, links_df, how='inner', on='imdb_id')
duplicates_links.head()
# merge with df
# Brings in the MovieLens title, genres, year and user tags for the same movieId.
duplicates_links_df = pd.merge(duplicates_links, df, how='inner', on='movieId')
duplicates_links_df.head()

Unnamed: 0,index,imdb_id,original_title,overview,production_companies,tagline,title_x,movieId,tmdbId,title_y,genres,year,user_tag_list
0,18,112281,Ace Ventura: When Nature Calls,"Summoned from an ashram in Tibet, Ace finds hi...","[O Entertainment, Warner Bros., Morgan Creek P...",New animals. New adventures. Same hair.,Ace Ventura: When Nature Calls,19,9273.0,Ace Ventura When Nature Calls,Comedy,[1995],"[detective, childhood classic, Jim Carrey, com..."
1,23526,112281,Tangled Ever After,The kingdom is in a festive mood as everyone g...,[Walt Disney Animation Studios],,Tangled Ever After,19,9273.0,Ace Ventura When Nature Calls,Comedy,[1995],"[detective, childhood classic, Jim Carrey, com..."
2,166,113114,Free Willy 2 - The Adventure Home,Jesse becomes reunited with Willy three years ...,"[Regency Enterprises, Alcor Films, Canal+, Don...",The adventure is back. The fun is back. Willy'...,Free Willy 2 - The Adventure Home,169,9073.0,Free Willy 2 The Adventure Home,Adventure Children Drama,[1995],"[family, family, human animal relationship, ki..."
3,37585,113114,Lethal Seduction,High School senior Mark Richards has never min...,[Indy Entertainment],,Lethal Seduction,169,9073.0,Free Willy 2 The Adventure Home,Adventure Children Drama,[1995],"[family, family, human animal relationship, ki..."
4,178,113820,Mighty Morphin Power Rangers: The Movie,Power up with six incredible teens who out-man...,"[Twentieth Century Fox Film Corporation, Saban...",The Power Is On!,Mighty Morphin Power Rangers: The Movie,181,9070.0,Mighty Morphin Power Rangers The Movie,Action Children,[1995],"[Action, based on tv series, ethnic diversity,..."


In [None]:
#keep only the rows whose metadata title (title_x) matches the MovieLens title (title_y)

# Punctuation is stripped and each title is reduced to the SET of characters it
# contains, so two titles "match" when they use exactly the same characters,
# regardless of order or repetition.
punctuation = re.compile(r'[^\w\s]')
for side in ('x', 'y'):
    duplicates_links_df['title_clean_' + side] = [
        set(punctuation.sub('', title))
        for title in duplicates_links_df['title_' + side]
    ]

#dataframe where title_clean_x = title_clean_y
same_characters = duplicates_links_df['title_clean_x'] == duplicates_links_df['title_clean_y']
duplicates = duplicates_links_df[same_characters]
duplicates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 324 entries, 0 to 969
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 324 non-null    int64  
 1   imdb_id               324 non-null    int64  
 2   original_title        324 non-null    object 
 3   overview              324 non-null    object 
 4   production_companies  251 non-null    object 
 5   tagline               183 non-null    object 
 6   title_x               324 non-null    object 
 7   movieId               324 non-null    int64  
 8   tmdbId                324 non-null    float64
 9   title_y               324 non-null    object 
 10  genres                324 non-null    object 
 11  year                  324 non-null    object 
 12  user_tag_list         276 non-null    object 
 13  title_clean_x         324 non-null    object 
 14  title_clean_y         324 non-null    object 
dtypes: float64(1), int64(3)

In [None]:
#Make the new cleaned duplicates dataframe have the original columns
duplicate = duplicates[['imdb_id', 'original_title', 'overview', 'production_companies',
                        'tagline', 'title_x']]
duplicate = duplicate.rename(columns={"title_x": "title"})

#Merge the duplicate dataframe back with the unique dataframe
# The title-verified rows are placed first on purpose (see drop_duplicates below).
frames = [duplicate, unique]

newdf = pd.concat(frames)

newdf.info()

#drop duplicate values based on imdb
# keep="first" retains the title-verified row for every imdb_id that has one.
# keep="last" always kept the row coming from `unique` (simply the first row
# seen for that imdb_id), which threw the title verification away.
newdf = newdf.drop_duplicates(subset='imdb_id', keep="first")
newdf.info()
# Number of distinct imdb_ids; equals the row count after de-duplication.
len(pd.unique(newdf['imdb_id']))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44175 entries, 0 to 11295
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   imdb_id               44175 non-null  int64  
 1   original_title        44175 non-null  object 
 2   overview              44175 non-null  object 
 3   production_companies  32622 non-null  object 
 4   tagline               20369 non-null  object 
 5   title                 44172 non-null  object 
 6   index                 43851 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 2.7+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 43851 entries, 0 to 11295
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   imdb_id               43851 non-null  int64  
 1   original_title        43851 non-null  object 
 2   overview              43851 non-null  object 
 3   pro

43851

In [None]:
#Merge dataframe with links
# Outer join: rows present in only one of the two frames are kept, with NaN on the other side.
newdf = pd.merge(newdf, links_df, how='outer', on='imdb_id')
newdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78161 entries, 0 to 78160
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   imdb_id               78161 non-null  int64  
 1   original_title        43851 non-null  object 
 2   overview              43851 non-null  object 
 3   production_companies  32371 non-null  object 
 4   tagline               20186 non-null  object 
 5   title                 43848 non-null  object 
 6   index                 43851 non-null  float64
 7   movieId               62423 non-null  float64
 8   tmdbId                62316 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 6.0+ MB


### Make a final dataframe

In [None]:
# Re-inspect the merged frame before choosing the final columns.
newdf.info()
#lets drop rows original_title
# NOTE(review): presumably this means dropping the original_title *column* — confirm.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78161 entries, 0 to 78160
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   imdb_id               78161 non-null  int64  
 1   original_title        43851 non-null  object 
 2   overview              43851 non-null  object 
 3   production_companies  32371 non-null  object 
 4   tagline               20186 non-null  object 
 5   title                 43848 non-null  object 
 6   index                 43851 non-null  float64
 7   movieId               62423 non-null  float64
 8   tmdbId                62316 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 6.0+ MB


In [None]:
# Columns carried forward; original_title and the helper index column are left out.
final_columns = ['movieId', 'imdb_id', 'tmdbId', 'title', 'overview',
                 'production_companies', 'tagline']
dataframe1 = newdf[final_columns]
dataframe1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78161 entries, 0 to 78160
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movieId               62423 non-null  float64
 1   imdb_id               78161 non-null  int64  
 2   tmdbId                62316 non-null  float64
 3   title                 43848 non-null  object 
 4   overview              43851 non-null  object 
 5   production_companies  32371 non-null  object 
 6   tagline               20186 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 4.8+ MB


In [None]:
# Recap of the MovieLens frame (movies merged with user tags) before it is joined to the metadata.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62423 entries, 0 to 62422
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   movieId        62423 non-null  int64 
 1   title          62423 non-null  object
 2   genres         62423 non-null  object
 3   year           62423 non-null  object
 4   user_tag_list  45251 non-null  object
dtypes: int64(1), object(4)
memory usage: 2.9+ MB


In [None]:
#merge with df on movieId
# Both frames have a 'title' column, so the result carries title_x (metadata)
# and title_y (MovieLens).
content_df = pd.merge(dataframe1, df, how='outer', on='movieId')
content_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78161 entries, 0 to 78160
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movieId               62423 non-null  float64
 1   imdb_id               78161 non-null  int64  
 2   tmdbId                62316 non-null  float64
 3   title_x               43848 non-null  object 
 4   overview              43851 non-null  object 
 5   production_companies  32371 non-null  object 
 6   tagline               20186 non-null  object 
 7   title_y               62423 non-null  object 
 8   genres                62423 non-null  object 
 9   year                  62423 non-null  object 
 10  user_tag_list         45251 non-null  object 
dtypes: float64(2), int64(1), object(8)
memory usage: 7.2+ MB


In [None]:
#drop title_x
# Rebinds the name to a new frame instead of mutating in place.
content_df = content_df.drop(columns=['title_x'])
content_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78161 entries, 0 to 78160
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movieId               62423 non-null  float64
 1   imdb_id               78161 non-null  int64  
 2   tmdbId                62316 non-null  float64
 3   overview              43851 non-null  object 
 4   production_companies  32371 non-null  object 
 5   tagline               20186 non-null  object 
 6   title_y               62423 non-null  object 
 7   genres                62423 non-null  object 
 8   year                  62423 non-null  object 
 9   user_tag_list         45251 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 6.6+ MB


In [None]:
#rename title_y to title
content_df = content_df.rename(columns={'title_y': 'title'})
content_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78161 entries, 0 to 78160
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movieId               62423 non-null  float64
 1   imdb_id               78161 non-null  int64  
 2   tmdbId                62316 non-null  float64
 3   overview              43851 non-null  object 
 4   production_companies  32371 non-null  object 
 5   tagline               20186 non-null  object 
 6   title                 62423 non-null  object 
 7   genres                62423 non-null  object 
 8   year                  62423 non-null  object 
 9   user_tag_list         45251 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 6.6+ MB


In [None]:
#remove rows from content_df where title is nan
content_df = content_df.dropna(subset=['title'])
content_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62423 entries, 0 to 78160
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movieId               62423 non-null  float64
 1   imdb_id               62423 non-null  int64  
 2   tmdbId                62316 non-null  float64
 3   overview              28113 non-null  object 
 4   production_companies  21572 non-null  object 
 5   tagline               13910 non-null  object 
 6   title                 62423 non-null  object 
 7   genres                62423 non-null  object 
 8   year                  62423 non-null  object 
 9   user_tag_list         45251 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 5.2+ MB


### Preprocessing
Now that we have the desired dataframe, it is time to create a clean corpus for NLP analysis.

In [None]:
#import necessary libraries for preprocessing
import re
import string

import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.models import Word2Vec

# NLTK resources, fetched in the same order as before.
for resource in ('punkt',
                 'maxent_treebank_pos_tagger',
                 'averaged_perceptron_tagger',
                 'wordnet',
                 'stopwords'):
    nltk.download(resource)

# Must run after the 'stopwords' download above.
stopword_list = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#Functions for preprocessing

#POS tagger
wnl = WordNetLemmatizer()
def pos_tag_text(text):
    """Tokenize and POS-tag `text`.

    Returns a list of (lowercased word, WordNet POS) pairs. The WordNet POS is
    None when the tag does not start with J, V, N or R.
    """
    # First letter of the tagger's tag -> WordNet part-of-speech constant.
    wordnet_pos_by_initial = {
        'J': wn.ADJ,
        'V': wn.VERB,
        'N': wn.NOUN,
        'R': wn.ADV,
    }
    tagged_lower_text = []
    for word, tag in nltk.pos_tag(nltk.word_tokenize(text)):
        tagged_lower_text.append((word.lower(), wordnet_pos_by_initial.get(tag[:1])))
    return tagged_lower_text

#Lemmatize
def lemmatize(text):
    """Return `text` as a space-joined string of lowercased, lemmatized tokens.

    Tokens whose part of speech could not be mapped to WordNet are kept as-is
    (lowercased).
    """
    lemmas = []
    for word, wn_pos in pos_tag_text(text):
        lemmas.append(wnl.lemmatize(word, wn_pos) if wn_pos else word)
    return ' '.join(lemmas)

#Remove Stopwords
def remove_stopwords(text):
    """Tokenize `text` and return it re-joined with spaces, minus tokens found in stopword_list.

    The membership test is case-sensitive.
    """
    kept_tokens = filter(lambda token: token not in stopword_list,
                         nltk.word_tokenize(text))
    return ' '.join(kept_tokens)

#Remove special characters
def remove_spec_char(text):
    """Replace every character that is neither a word character nor whitespace with one space."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub(' ', text)

In [None]:
#remove special characters, lemmatize, remove stopwords for tagline and overview
def _normalise_text(value):
    """Strip special characters, lemmatize and drop stopwords; non-strings pass through."""
    if type(value) is not str:
        return value
    return remove_stopwords(lemmatize(remove_spec_char(value)))


def _tokenise_text(value):
    """Split a string into tokens; non-strings pass through."""
    if type(value) is not str:
        return value
    return nltk.word_tokenize(value)


# tagline first, then overview, so the new columns are created in the same order.
for source_col in ('tagline', 'overview'):
    normalised = content_df[source_col].apply(_normalise_text)
    content_df[source_col + '_clean'] = normalised.apply(_tokenise_text)

In [None]:
#to clean genres: lowercase and tokenize
lowered_genres = content_df['genres'].str.lower()
# Non-string entries (e.g. NaN) are passed through untouched.
content_df['genres'] = lowered_genres.apply(
    lambda genre_text: nltk.word_tokenize(genre_text) if type(genre_text) is str else genre_text
)


In [None]:
#function to clean and remove duplicates from a list
def clean_list(given_list):
    """Normalise each entry of `given_list` and return the distinct results in first-seen order.

    Each entry is converted to a lowercase string, stripped of special
    characters, lemmatized and stripped of stopwords. Missing entries
    (None / NaN) are skipped so they do not become the literal token "nan",
    and entries that end up empty after cleaning are dropped.
    """
    cleaned = []
    for item in given_list:
        # NaN is the only float that differs from itself.
        if item is None or (isinstance(item, float) and item != item):
            continue
        text = remove_stopwords(lemmatize(remove_spec_char(str(item).lower())))
        if text:
            cleaned.append(text)
    # dict.fromkeys removes repeats while preserving order.
    return list(dict.fromkeys(cleaned))

In [None]:
#apply clean_list function to user_tag_list_clean
def _clean_if_list(value):
    """Run clean_list on lists; leave anything else (e.g. NaN) untouched."""
    return clean_list(value) if type(value) is list else value


content_df['user_tag_list_clean'] = content_df['user_tag_list'].map(_clean_if_list)


In [None]:
#make production company lowercase
# Each cell is a list of company names (or NaN), never a plain string, so the
# lowercasing has to be applied to the list items. The previous guard on
# type(x) == str never matched and left every name unchanged.
content_df['production_companies'] = content_df['production_companies'].apply(
    lambda companies: [name.lower() if isinstance(name, str) else name for name in companies]
    if isinstance(companies, list) else companies
)

In [None]:
#make np.nan into empty lists
# Every column below must hold a list in each row before the columns are combined.
list_columns = (
    'overview_clean',
    'tagline_clean',
    'genres',
    'user_tag_list_clean',
    'production_companies',
    'year',
)
for column in list_columns:
    # Row labels are collected once, before any cell of this column is rewritten.
    missing_rows = content_df.index[content_df[column].isnull()]
    for row in missing_rows:
        # A fresh list per cell, so no two rows share the same object.
        content_df.at[row, column] = []

In [None]:
#Make a corpus & tokenized corpus for the dataframe
# Row-wise list concatenation of the token columns, left to right.
token_columns = ['overview_clean', 'tagline_clean', 'genres',
                 'user_tag_list_clean', 'production_companies']
sum_column = content_df[token_columns[0]]
for column in token_columns[1:]:
    sum_column = sum_column + content_df[column]
content_df['corpus_tokens'] = sum_column


def _tokens_to_text(tokens):
    """Join tokens with single spaces, then delete every digit character."""
    joined = ' '.join(tokens)
    return ''.join(char for char in joined if not char.isdigit())


content_df['corpus'] = content_df['corpus_tokens'].apply(_tokens_to_text)

In [None]:
#Make a pkl file of content_df
# Writes the full frame, including the corpus columns, to the current working directory.
content_df.to_pickle("content_df.pkl")

In [None]:
#Make a smaller dataframe to work with
# Keep only the movies that list at least one production company.
lists_a_company = content_df['production_companies'].map(len) > 0
small_df = content_df[lists_a_company]
small_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21572 entries, 0 to 37621
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movieId               21572 non-null  float64
 1   imdb_id               21572 non-null  int64  
 2   tmdbId                21572 non-null  float64
 3   overview              21572 non-null  object 
 4   production_companies  21572 non-null  object 
 5   tagline               12333 non-null  object 
 6   title                 21572 non-null  object 
 7   genres                21572 non-null  object 
 8   year                  21572 non-null  object 
 9   user_tag_list         19739 non-null  object 
 10  tagline_clean         21572 non-null  object 
 11  overview_clean        21572 non-null  object 
 12  user_tag_list_clean   21572 non-null  object 
 13  corpus_tokens         21572 non-null  object 
 14  corpus                21572 non-null  object 
dtypes: float64(2), int6

In [None]:
#Make a pkl file of the small_df
# Writes the reduced frame to the current working directory.
small_df.to_pickle("small_df.pkl")