# Content-Based Movie Recommendation

Dataset Source:

https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots/

In [67]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

In [68]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [69]:
df = pd.read_csv("../data/wiki_movie_plots_deduped.csv")
df.shape

(34886, 8)

In [70]:
df.sample(5)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
15335,2007,The Simpsons Movie,American,David Silverman,"Dan Castellaneta, Julie Kavner, Nancy Cartwrig...",animation,https://en.wikipedia.org/wiki/The_Simpsons_Movie,"While performing on Lake Springfield, the band..."
8642,1971,Born to Win,American,Ivan Passer,"George Segal, Paula Prentiss, Karen Black",unknown,https://en.wikipedia.org/wiki/Born_to_Win,"The film follows Jay Jay (Segal), a former hai..."
3422,1942,Ship Ahoy,American,Edward Buzzell,"Eleanor Powell, Red Skelton",musical comedy,https://en.wikipedia.org/wiki/Ship_Ahoy,Tallulah Winters is a dancing star who is hire...
6957,1958,"Another Time, Another Place",American,Lewis Allen,"Lana Turner, Barry Sullivan, Sean Connery, Gly...",melodrama,"https://en.wikipedia.org/wiki/Another_Time,_An...","An American reporter, Sara Scott (Turner) is w..."
28058,2009,I. G.,Malayalam,B. Unnikrishnan,"Suresh Gopi, Sai Kumar, Jagathy Sreekumar, Vij...","action, thriller",https://en.wikipedia.org/wiki/I._G.,IG is an action movie which Suresh Gopi dons t...


## Converting to String

In [71]:
df.columns

Index(['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast',
       'Genre', 'Wiki Page', 'Plot'],
      dtype='object')

In [72]:
df = df.astype(str)
df.dtypes

Release Year        object
Title               object
Origin/Ethnicity    object
Director            object
Cast                object
Genre               object
Wiki Page           object
Plot                object
dtype: object

## Data Cleaning

Action Items:

* Lower-Case the whole data frame
* Director: Removing 'Director:' and 'Cast:'
* Director, Cast: Removing '\r\n', '/n' and '/r'

* Genre: Replacing '/' with Space
* Director, Cast, Genre: Removing 'Uknonwn' and 'Nan'

* Director: Separating Directors and Actors names
* Director, Cast: Checking if the names are separated with ' and '

* Director, Cast: Merging the first names and last names together
* Director, Cast: Adding the words of 'Director' and 'Actor' as prefix

* Plot: Removing English Stopwords
* Doc: Removing special characters

In [73]:
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)

  df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [74]:
df["Director"] = df["Director"].str.replace("director:", "", regex=False)
df["Director"] = df["Director"].str.replace("cast:", "", regex=False)
df["Director"] = df["Director"].str.replace("\r\n", " ", regex=False)
df["Cast"] = df["Cast"].str.replace("\r\n", " ", regex=False)
df["Genre"] = df["Genre"].str.replace("/", " ", regex=False)

df["Director"] = df["Director"].str.replace("unknown", "", regex=False)
df["Cast"] = df["Cast"].str.replace("unknown", "", regex=False)
df["Genre"] = df["Genre"].str.replace("unknown", "", regex=False)

df["Director"] = df["Director"].str.replace("nan", "", regex=False)
df["Cast"] = df["Cast"].str.replace("nan", "", regex=False)
df["Genre"] = df["Genre"].str.replace("nan", "", regex=False)


df["Director"] = df["Director"].str.replace(" and ", ",", regex=False)
df["Cast"] = df["Cast"].str.replace(" and ", ",", regex=False)


df["Director"] = df["Director"].str.replace(" ", "", regex=False)
df["Cast"] = df["Cast"].str.replace(" ", "", regex=False)

df["Director"] = df["Director"].str.replace(",", " ", regex=False)
df["Cast"] = df["Cast"].str.replace(",", " ", regex=False)


In [75]:
df["Director"] = np.where(df["Director"].str.len() > 0,
                          'director' + df["Director"],
                          df["Director"])

df["Cast"] = np.where(df["Cast"].str.len() > 0,
                      'actor' + df["Cast"],
                      df["Cast"])

df["Director"] = df["Director"].str.replace(" ", " director", regex=False)
df["Cast"] = df["Cast"].str.replace(" ", " actor", regex=False)


In [80]:
df.sample(5)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
1359,1932,mr. robinson crusoe,american,directora.edwardsutherland,actordouglasfairbanks actorwilliamfarnum,adventure,https://en.wikipedia.org/wiki/mr._robinson_crusoe,"the film opens with a title card that reads ""f..."
15166,2007,the brothers solomon,american,directorbobodenkirk,actorwillarnett actorwillforte actorchimcbride...,comedy,https://en.wikipedia.org/wiki/the_brothers_sol...,john and dean are two sheltered happy-go-lucky...
1273,1932,but the flesh is weak,american,directorjackconway,actorrobertmontgomery actornoragregor actorhea...,comedy,https://en.wikipedia.org/wiki/but_the_flesh_is...,"max clement and his father florian, short of m..."
15334,2007,shrek the third,american,directorchrismiller directorramanhui,actormikemyers actoreddiemurphy actorcamerondiaz,animation,https://en.wikipedia.org/wiki/shrek_the_third,prince charming vows that he will become king ...
1918,1935,the last outpost,american,directorcharlesbarton directorlouisj.gasnier,actorcarygrant actorclauderains actorkathleenb...,"drama, war",https://en.wikipedia.org/wiki/the_last_outpost...,"in kurdistan during world war i, michael andre..."


## Merging the document

In [None]:
df["doc"] = df[['Release Year', 'Title', 'Origin/Ethnicity',
                'Director', 'Cast', 'Genre', 'Plot']].agg(' '.join, axis=1)
df.sample(5)

## TF-IDF

In [33]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["doc"])
column_names = vectorizer.get_feature_names_out()

df_tf_idf = pd.DataFrame(X.toarray(), columns=column_names)
df_tf_idf.shape

(34886, 148454)

## Cosine Similarity