# Configurations

In [1]:
import ast
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

In [2]:
DATA_DIRECTORY = "../data/raw/"

In [3]:
def show(data):
    print(data.shape)
    display(data.head(3))

In [4]:
def clean_data(x):
    return [i.lower().replace(" ", "") for i in x] if x is not pd.NA else ""

# Data Acquisition

In [5]:
top_rated_movies = pd.read_csv(DATA_DIRECTORY + "tmdb/top_rated_movies_english.csv")
show(top_rated_movies)

(9031, 14)


Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/rSPw7tgCH9c6NqICZef4kZjFOQ5.jpg,"[18, 80]",238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",160.502,/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,1972-07-07,The Godfather,False,8.7,19184
1,False,/kXfqcdQKsToO0OUXHcrrNCHDBzO.jpg,"[18, 80]",278,en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,150.884,/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg,1995-03-17,The Shawshank Redemption,False,8.7,25170
2,False,/kGzFbGhp99zva6oZODW5atUtnqi.jpg,"[18, 80]",240,en,The Godfather Part II,In the continuing saga of the Corleone crime f...,87.692,/hek3koDUyRQk7FIhPXsa6mT2Zc3.jpg,1975-02-14,The Godfather Part II,False,8.589,11563


In [6]:
omdb_data = pd.read_csv(DATA_DIRECTORY + "omdb/data.csv")
show(omdb_data)

(7815, 26)


Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response,Time
0,The Immigrant,2013,R,23 May 2014,120 min,"Drama, Romance",James Gray,"James Gray, Ric Menello","Marion Cotillard, Joaquin Phoenix, Jeremy Renner",1921. In search of a new start and the America...,...,6.6,33296,tt1951181,movie,20 Feb 2015,"$2,025,328",,,True,2023-12-25 10:00:28.706861
1,Happy Death Day,2017,PG-13,13 Oct 2017,96 min,"Comedy, Horror, Mystery",Christopher Landon,Scott Lobdell,"Jessica Rothe, Israel Broussard, Ruby Modine","A teenage girl, trying to enjoy her birthday, ...",...,6.6,158561,tt5308322,movie,02 Jan 2018,"$55,683,845",,,True,2023-12-25 10:00:29.119556
2,Persona,1966,Not Rated,16 Mar 1967,83 min,"Drama, Thriller",Ingmar Bergman,Ingmar Bergman,"Bibi Andersson, Liv Ullmann, Margaretha Krook","A young nurse, Alma, is put in charge of Elisa...",...,8.1,128047,tt0060827,movie,11 Mar 2017,,,,True,2023-12-25 10:00:29.519696


In [7]:
keywords = pd.read_csv(DATA_DIRECTORY + "tmdb/keywords.csv")
show(keywords)

(9025, 2)


Unnamed: 0,id,keywords
0,238,"['based on novel or book', 'loss of loved one'..."
1,278,"['prison', 'friendship', 'police brutality', '..."
2,240,"['italian american', 'cuba', 'italy', 'gangste..."


In [8]:
additional_info = pd.read_csv(DATA_DIRECTORY + "tmdb/additional_info.csv")
show(additional_info)

(9025, 7)


Unnamed: 0,id,budget,revenue,imdb_id,runtime,tagline,production_countries
0,238,6000000,245066411,tt0068646,175,An offer you can't refuse.,US
1,278,25000000,28341469,tt0111161,142,Fear can hold you prisoner. Hope can set you f...,US
2,240,13000000,102600000,tt0071562,202,,US


In [9]:
genres = pd.read_csv(DATA_DIRECTORY + "tmdb/genres.csv")
show(genres)

(19, 2)


Unnamed: 0,id,name
0,28,Action
1,12,Adventure
2,16,Animation


In [10]:
movies_providers = pd.read_csv(DATA_DIRECTORY + "tmdb/movie_providers.csv")
show(movies_providers)

(34403, 4)


Unnamed: 0,id,link,transaction_type,provider_id
0,238,https://www.themoviedb.org/movie/238-the-godfa...,rent,167
1,238,https://www.themoviedb.org/movie/238-the-godfa...,rent,2
2,238,https://www.themoviedb.org/movie/238-the-godfa...,rent,10


In [11]:
provider_id = 8
movies_providers[movies_providers["provider_id"] == provider_id]["id"]

47          129
94          680
140      568332
155         550
172         598
          ...  
34201    407559
34302    351460
34381     22345
34385    505423
34397    485774
Name: id, Length: 1115, dtype: int64

# Preprocessing

In [12]:
raw_data = (
    top_rated_movies[["id", "title", "overview", "genre_ids", "release_date"]]
    .merge(keywords[["id", "keywords"]], how="left", on="id")
    .merge(additional_info[["id", "tagline", "imdb_id"]], how="left", on="id")
    .merge(
        omdb_data[["imdbID", "Director", "Writer", "Actors"]],
        how="left",
        left_on="imdb_id",
        right_on="imdbID",
    )
)


show(raw_data)

(9031, 12)


Unnamed: 0,id,title,overview,genre_ids,release_date,keywords,tagline,imdb_id,imdbID,Director,Writer,Actors
0,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[18, 80]",1972-07-07,"['based on novel or book', 'loss of loved one'...",An offer you can't refuse.,tt0068646,tt0068646,Francis Ford Coppola,"Mario Puzo, Francis Ford Coppola","Marlon Brando, Al Pacino, James Caan"
1,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"[18, 80]",1995-03-17,"['prison', 'friendship', 'police brutality', '...",Fear can hold you prisoner. Hope can set you f...,tt0111161,tt0111161,Frank Darabont,"Stephen King, Frank Darabont","Tim Robbins, Morgan Freeman, Bob Gunton"
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[18, 80]",1975-02-14,"['italian american', 'cuba', 'italy', 'gangste...",,tt0071562,tt0071562,Francis Ford Coppola,"Francis Ford Coppola, Mario Puzo","Al Pacino, Robert De Niro, Robert Duvall"


In [13]:
genre_data = raw_data.copy()
genres_map = genres.set_index("id").to_dict()["name"]
mapped_genres = (
    genre_data[["id", "genre_ids"]]
    .set_index("id")["genre_ids"]
    .apply(ast.literal_eval)
    .explode()
    .replace(genres_map)
    .astype("str")
)
genres_column = mapped_genres.groupby(mapped_genres.index).agg(", ".join).reset_index()
genre_data = genre_data.merge(genres_column, how="left", on="id", suffixes=("_drop", ""))

genre_data = genre_data.drop(["genre_ids_drop", "release_date", "imdb_id", "imdbID"], axis=1)

genre_data = genre_data.rename(
    {"Director": "directors", "Writer": "writers", "Actors": "actors", "genre_ids": "genres"}, axis=1
)

show(genre_data)

(9031, 9)


Unnamed: 0,id,title,overview,keywords,tagline,directors,writers,actors,genres
0,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","['based on novel or book', 'loss of loved one'...",An offer you can't refuse.,Francis Ford Coppola,"Mario Puzo, Francis Ford Coppola","Marlon Brando, Al Pacino, James Caan","Drama, Crime"
1,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"['prison', 'friendship', 'police brutality', '...",Fear can hold you prisoner. Hope can set you f...,Frank Darabont,"Stephen King, Frank Darabont","Tim Robbins, Morgan Freeman, Bob Gunton","Drama, Crime"
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,"['italian american', 'cuba', 'italy', 'gangste...",,Francis Ford Coppola,"Francis Ford Coppola, Mario Puzo","Al Pacino, Robert De Niro, Robert Duvall","Drama, Crime"


In [14]:
nl_data = genre_data.copy()
ps = PorterStemmer()

nl_data["keywords"] = nl_data["keywords"].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [""])

tokenizer = RegexpTokenizer(r"\w+")
nl_data["overview"] = nl_data["overview"].astype(str).apply(lambda x: tokenizer.tokenize(x))
nl_data["tagline"] = nl_data["tagline"].astype(str).apply(lambda x: tokenizer.tokenize(x))

to_list = ["genres", "actors", "writers", "directors"]
for feature in to_list:
    nl_data[feature] = nl_data[feature].astype(str).apply(lambda x: x.split(",") if x != "" else pd.NA)

stopwords = nltk.corpus.stopwords.words("english")
stem_stopwords = ["keywords", "overview", "tagline"]
for feature in stem_stopwords:
    nl_data[feature] = nl_data[feature].apply(
        lambda x: [ps.stem(word) for words in x for word in words.split() if word.lower() not in stopwords]
    )

to_clean = ["keywords", "directors", "writers", "actors", "genres"]
for feature in to_clean:
    nl_data[feature] = nl_data[feature].apply(clean_data)


def create_soup_plot(x):
    return f"{' '.join(x['overview'])} {' '.join(x['tagline'])}"


def create_soup_general(x):
    return f"{' '.join(x['keywords'])} {' '.join(x['directors'])} {' '.join(x['writers'])} {' '.join(x['actors'])} {' '.join(x['genres'])}"


nl_data["soup_plot"] = nl_data.apply(create_soup_plot, axis=1)
nl_data["soup_general"] = nl_data.apply(create_soup_general, axis=1)
show(nl_data)

(9031, 11)


Unnamed: 0,id,title,overview,keywords,tagline,directors,writers,actors,genres,soup_plot,soup_general
0,238,The Godfather,"[span, year, 1945, 1955, chronicl, fiction, it...","[base, novel, book, loss, love, one, love, fir...","[offer, refus]",[francisfordcoppola],"[mariopuzo, francisfordcoppola]","[marlonbrando, alpacino, jamescaan]","[drama, crime]",span year 1945 1955 chronicl fiction italian a...,base novel book loss love one love first sight...
1,278,The Shawshank Redemption,"[frame, 1940, doubl, murder, wife, lover, upst...","[prison, friendship, polic, brutal, corrupt, b...","[fear, hold, prison, hope, set, free]",[frankdarabont],"[stephenking, frankdarabont]","[timrobbins, morganfreeman, bobgunton]","[drama, crime]",frame 1940 doubl murder wife lover upstand ban...,prison friendship polic brutal corrupt base no...
2,240,The Godfather Part II,"[continu, saga, corleon, crime, famili, young,...","[italian, american, cuba, itali, gangster, pra...",[nan],[francisfordcoppola],"[francisfordcoppola, mariopuzo]","[alpacino, robertdeniro, robertduvall]","[drama, crime]",continu saga corleon crime famili young vito c...,italian american cuba itali gangster prais sym...


In [15]:
count_vectorizer = CountVectorizer(stop_words="english")

In [16]:
plot_matrix = count_vectorizer.fit_transform(nl_data["soup_plot"])
general_matrix = count_vectorizer.fit_transform(nl_data["soup_general"])
plot_similarity = cosine_similarity(plot_matrix, plot_matrix)
general_similarity = cosine_similarity(general_matrix, general_matrix)

weight_plot = 0.5
result_similarity = weight_plot * plot_similarity + (1 - weight_plot) * general_similarity

In [17]:
indices = pd.Series(nl_data.index, index=nl_data["title"])

In [18]:
[movie for movie in list(nl_data["title"]) if movie.startswith("Fear Street")]

['Fear Street: 1978', 'Fear Street: 1666', 'Fear Street: 1994']

In [19]:
title = "Fear Street: 1994"

idx = indices[title]

scores = list(enumerate(result_similarity[idx]))
scores = sorted(scores, key=lambda x: x[1], reverse=True)

movies = [score[0] for score in scores]
nl_data["title"].iloc[movies]

4265                 Fear Street: 1994
2487                 Fear Street: 1666
2232                 Fear Street: 1978
7065    Percy Jackson: Sea of Monsters
7309                      Solomon Kane
                     ...              
9011                     Baby Geniuses
9013                        Rollerball
9018                        Epic Movie
9023        Lucky Luke and the Daltons
9026                    Disaster Movie
Name: title, Length: 9031, dtype: object