# Project Overview

The project is intended to replicate the Netflix movie recommender system.

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import ast
import string
import re

In [None]:
# Folder that holds the TMDB CSV files. It is machine-specific, so it is written
# once here; the two reads below build their paths from it.
DATA_DIR = r"C:\Users\kazir\OneDrive\Desktop\Github\Movie_Model\Movie_MODEL\Movie_MOD\Project\CSVs"

# Raw f-strings keep every backslash literal, so the paths are the same as before.
credits_df = pd.read_csv(rf"{DATA_DIR}\tmdb_5000_credits.csv")
movies_df = pd.read_csv(rf"{DATA_DIR}\tmdb_5000_movies.csv")

# Data Exploration

Explore the data we are dealing with:
- Understanding the shape, columns, and rows in the data.
- Type of data.
- Look for any missing values.
- Summarize the differences and similarities between the datasets.

In [None]:
# Compare the dimensions of the two datasets; .shape is (rows, columns).
# NOTE(review): the earlier note here said both datasets have the same number of
# columns - check the printed shapes; the row count may be what was meant.
movies_shape = movies_df.shape
credits_shape = credits_df.shape
print(f'The shape of the movies file is: {movies_shape}')
print(f'The shape of the credits file is: {credits_shape}')

In [None]:
# The movies dataset has interesting columns that can be used for the machine learning model. For example, the vote_count and the keywords columns
print (f'The columns of the movies dataset are: {movies_df.columns}\n')
print (f'The columns of the credits dataset are: {credits_df.columns}')

## Data Types

**Movies:**
- Contains some integer and float values in the dataset.
- Most of the columns have the object data type.

**Credits:**
- Only one integer value column.
- All the other columns have the object data type.

In [None]:
print (f'The movies dataset types are:\n{movies_df.dtypes}\n')
print (f'The credits dataset types are:\n{credits_df.dtypes}')

## Missing Values

Summing both datasets for any missing value columns. This way we don't run into an error moving forward and are aware of the data.

In [None]:
movies_df.isna().sum()

In [None]:
# The credits file is not missing any of the values
credits_df.isna().sum()

## ID Column

Note: The **ID** column can be used later in the project to match the datasets.

In [None]:
movies_df['id'][:5]

In [None]:
credits_df['movie_id'][:5]

## Side-by-Side Comparison

Using the first row only to see the similarities and differences between the datasets. This will help us understand how the data is organized.

In [None]:
movies_df.loc[0]

In [None]:
credits_df.loc[0]

## Movie Status

Most of the movies in the dataset have been released.

In [None]:
movies_df.status.value_counts()

In [None]:
movies_df.sort_values(by='popularity', ascending=False).head(3)

# Filtering Dataset

In [None]:
# Movies that are repeated in the dataset
movies_df['title'].value_counts()

In [None]:
movies_df['id'].value_counts()

### Summary
The movies are not popular so we might be able to drop these columns.

In [None]:
# Histogram of the popularity score; the x-axis is limited to the 0-150 range.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(data=movies_df['popularity'], ax=ax)
ax.set_xlabel('Popularity')
ax.set_ylabel('Count')
ax.set_xlim(0, 150)
plt.show()

In [None]:
movies_df['vote_count'].describe()

In [None]:
movies_df['popularity'].describe()

## Data Transformation

### Unpacking
The functions below use the *ast* library to unpack the columns whose values are packed into a single string. Each function converts a packed value into a list of the names it contains, and the result replaces the original packed column.

#### Movie Data

In [None]:
def genres_clean(data):
    """Return the ``'name'`` of every entry in a packed ``genres`` cell.

    Args:
        data: One cell of the ``genres`` column: the raw string holding a
            Python-literal list of dicts (each with a ``'name'`` key), a list
            that has already been parsed, or a missing value (``None``/NaN).

    Returns:
        A list with the ``'name'`` value of each entry, in the original order;
        an empty list when the cell is missing.

    Raises:
        ValueError, SyntaxError: If a string cell is not a valid Python literal.
        KeyError: If an entry has no ``'name'`` key.
    """
    # A missing cell is NaN: a float that does not compare equal to itself.
    if data is None or (isinstance(data, float) and data != data):
        return []
    # Only strings need parsing. An already-parsed list (e.g. when this cell is
    # run a second time) is used as-is instead of making literal_eval fail.
    entries = ast.literal_eval(data) if isinstance(data, str) else data
    return [entry['name'] for entry in entries]
# Replace each cell of the 'genres' column with the list returned by genres_clean.
movies_df['genres'] = movies_df['genres'].apply(genres_clean)

In [None]:
def keywords_clean(data):
    """Return the ``'name'`` of every entry in a packed ``keywords`` cell.

    Args:
        data: One cell of the ``keywords`` column: the raw string holding a
            Python-literal list of dicts (each with a ``'name'`` key), a list
            that has already been parsed, or a missing value (``None``/NaN).

    Returns:
        A list with the ``'name'`` value of each entry, in the original order;
        an empty list when the cell is missing.

    Raises:
        ValueError, SyntaxError: If a string cell is not a valid Python literal.
        KeyError: If an entry has no ``'name'`` key.
    """
    # A missing cell is NaN: a float that does not compare equal to itself.
    if data is None or (isinstance(data, float) and data != data):
        return []
    # Only strings need parsing. An already-parsed list (e.g. when this cell is
    # run a second time) is used as-is instead of making literal_eval fail.
    entries = ast.literal_eval(data) if isinstance(data, str) else data
    return [entry['name'] for entry in entries]
# Replace each cell of the 'keywords' column with the list returned by keywords_clean.
movies_df['keywords'] = movies_df['keywords'].apply(keywords_clean)

In [None]:
def languages_clean(data):
    """Return the ``'name'`` of every entry in a packed ``spoken_languages`` cell.

    Args:
        data: One cell of the ``spoken_languages`` column: the raw string
            holding a Python-literal list of dicts (each with a ``'name'``
            key), a list that has already been parsed, or a missing value
            (``None``/NaN).

    Returns:
        A list with the ``'name'`` value of each entry, in the original order;
        an empty list when the cell is missing.

    Raises:
        ValueError, SyntaxError: If a string cell is not a valid Python literal.
        KeyError: If an entry has no ``'name'`` key.
    """
    # A missing cell is NaN: a float that does not compare equal to itself.
    if data is None or (isinstance(data, float) and data != data):
        return []
    # Only strings need parsing. An already-parsed list (e.g. when this cell is
    # run a second time) is used as-is instead of making literal_eval fail.
    entries = ast.literal_eval(data) if isinstance(data, str) else data
    return [entry['name'] for entry in entries]
# Replace each cell of the 'spoken_languages' column with the list returned by languages_clean.
movies_df['spoken_languages'] = movies_df['spoken_languages'].apply(languages_clean)

In [None]:
def production_clean(data):
    """Return the ``'name'`` of every entry in a packed ``production_companies`` cell.

    Args:
        data: One cell of the ``production_companies`` column: the raw string
            holding a Python-literal list of dicts (each with a ``'name'``
            key), a list that has already been parsed, or a missing value
            (``None``/NaN).

    Returns:
        A list with the ``'name'`` value of each entry, in the original order;
        an empty list when the cell is missing.

    Raises:
        ValueError, SyntaxError: If a string cell is not a valid Python literal.
        KeyError: If an entry has no ``'name'`` key.
    """
    # A missing cell is NaN: a float that does not compare equal to itself.
    if data is None or (isinstance(data, float) and data != data):
        return []
    # Only strings need parsing. An already-parsed list (e.g. when this cell is
    # run a second time) is used as-is instead of making literal_eval fail.
    entries = ast.literal_eval(data) if isinstance(data, str) else data
    return [entry['name'] for entry in entries]
# Replace each cell of the 'production_companies' column with the list returned by production_clean.
movies_df['production_companies'] = movies_df['production_companies'].apply(production_clean)

#### Credits Data

In [None]:
def cast_clean(data):
    """Return the ``'name'`` of every entry in a packed ``cast`` cell.

    Args:
        data: One cell of the ``cast`` column: the raw string holding a
            Python-literal list of dicts (each with a ``'name'`` key), a list
            that has already been parsed, or a missing value (``None``/NaN).

    Returns:
        A list with the ``'name'`` value of each entry, in the original order;
        an empty list when the cell is missing.

    Raises:
        ValueError, SyntaxError: If a string cell is not a valid Python literal.
        KeyError: If an entry has no ``'name'`` key.
    """
    # A missing cell is NaN: a float that does not compare equal to itself.
    if data is None or (isinstance(data, float) and data != data):
        return []
    # Only strings need parsing. An already-parsed list (e.g. when this cell is
    # run a second time) is used as-is instead of making literal_eval fail.
    entries = ast.literal_eval(data) if isinstance(data, str) else data
    return [entry['name'] for entry in entries]

# Replace each cell of the 'cast' column with the list returned by cast_clean.
credits_df['cast'] = credits_df['cast'].apply(cast_clean)

In [None]:
def crew_clean(data):
    """Return the ``'name'`` of every entry in a packed ``crew`` cell.

    Args:
        data: One cell of the ``crew`` column: the raw string holding a
            Python-literal list of dicts (each with a ``'name'`` key), a list
            that has already been parsed, or a missing value (``None``/NaN).

    Returns:
        A list with the ``'name'`` value of each entry, in the original order;
        an empty list when the cell is missing.

    Raises:
        ValueError, SyntaxError: If a string cell is not a valid Python literal.
        KeyError: If an entry has no ``'name'`` key.
    """
    # A missing cell is NaN: a float that does not compare equal to itself.
    if data is None or (isinstance(data, float) and data != data):
        return []
    # Only strings need parsing. An already-parsed list (e.g. when this cell is
    # run a second time) is used as-is instead of making literal_eval fail.
    entries = ast.literal_eval(data) if isinstance(data, str) else data
    return [entry['name'] for entry in entries]

# Replace each cell of the 'crew' column with the list returned by crew_clean.
credits_df['crew'] = credits_df['crew'].apply(crew_clean)

# Fixing Null Values

## Adding to Missing Overviews

Adding missing overviews back into the dataset.

## Overview

In [None]:
# Overview text to store for each movie, keyed by its exact `original_title`.
overview_by_title = {
    'The Dark Knight': "When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.",
    'Inside Out': "After young Riley is uprooted from her Midwest life and moved to San Francisco, her emotions - Joy, Fear, Anger, Disgust and Sadness - conflict on how best to navigate a new city, house, and school.",
    'Guardians of the Galaxy': "A group of intergalactic criminals must pull together to stop a fanatical warrior with plans to purge the universe.",
    'Interstellar': "A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival.",
    'Inception': "A thief who steals corporate secrets through the use of dream-sharing technology is given the inverse task of planting an idea into the mind of a C.E.O., but his tragic past may doom the project and his team to disaster.",
    'The Lord of the Rings: The Fellowship of the Ring': "A meek Hobbit from the Shire and eight companions set out on a journey to destroy the powerful One Ring and save Middle-earth from the Dark Lord Sauron.",
    'Django Unchained': "With the help of a German bounty-hunter, a freed slave sets out to rescue his wife from a brutal plantation owner in Mississippi.",
    'The Wolf of Wall Street': "Based on the true story of Jordan Belfort, from his rise to a wealthy stock-broker living the high life to his fall involving crime, corruption and the federal government.",
    'The Lord of the Rings: The Return of the King': "Gandalf and Aragorn lead the World of Men against Sauron's army to draw his gaze from Frodo and Sam as they approach Mount Doom with the One Ring.",
    'The Lord of the Rings: The Two Towers': "While Frodo and Sam edge closer to Mordor with the help of the shifty Gollum, the divided fellowship makes a stand against Sauron's new ally, Saruman, and his hordes of Isengard.",
    'The Lion King': "After the murder of his father, a young lion prince flees his kingdom only to learn the true meaning of responsibility and bravery.",
    'The Matrix': "When a beautiful stranger leads computer hacker Neo to a forbidding underworld, he discovers the shocking truth--the life he knows is the elaborate deception of an evil cyber-intelligence.",
    'Fight Club': "An insomniac office worker and a devil-may-care soap maker form an underground fight club that evolves into much more.",
    'The Green Mile': "The lives of guards on Death Row are affected by one of their charges: a black man accused of child murder and rape, yet who has a mysterious gift.",
    'Forrest Gump': "The presidencies of Kennedy and Johnson, the Vietnam War, the Watergate scandal and other historical events unfold from the perspective of an Alabama man with an IQ of 75, whose only desire is to be reunited with his childhood sweetheart",
    'Se7en': "Two detectives, a rookie and a veteran, hunt a serial killer who uses the seven deadly sins as his motives.",
    "Schindler's List": "In German-occupied Poland during World War II, industrialist Oskar Schindler gradually becomes concerned for his Jewish workforce after witnessing their persecution by the Nazis.",
    'The Shawshank Redemption': "Over the course of several years, two convicts form a friendship, seeking consolation and, eventually, redemption through basic compassion.",
    'The Empire Strikes Back': "After the Rebels are overpowered by the Empire, Luke Skywalker begins Jedi training with Yoda, while his friends are pursued across the galaxy by Darth Vader and bounty hunter Boba Fett.",
    'The Silence of the Lambs': "A young F.B.I. cadet must receive the help of an incarcerated and manipulative cannibal killer to help catch another serial killer, a madman who skins his victims.",
    'Back to the Future': "Marty McFly, a 17-year-old high school student, is accidentally sent 30 years into the past in a time-traveling DeLorean invented by his close friend, the maverick scientist Doc Brown.",
    '千と千尋の神隠し': "During her family's move to the suburbs, a sullen 10-year-old girl wanders into a world ruled by gods, witches, and spirits, and where humans are changed into beasts.",
    'The Imitation Game': "During World War II, the English mathematical genius Alan Turing tries to crack the German Enigma code with help from fellow mathematicians while attempting to come to terms with his troubled private life.",
    'Chiamatemi Francesco - Il Papa della gente': "The story of Pope Francis' life.",
    'The Godfather: Part II': "The early life and career of Vito Corleone in 1920s New York City is portrayed, while his son, Michael, expands and tightens his grip on the family crime syndicate.",
    'Star Wars': "Luke Skywalker joins forces with a Jedi Knight, a cocky pilot, a Wookiee and two droids to save the galaxy from the Empire's world-destroying battle station, while also attempting to rescue Princess Leia from the mysterious Darth Vader.",
    'Pulp Fiction': "The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption.",
    'The Godfather': "The aging patriarch of an organized crime dynasty in postwar New York City transfers control of his clandestine empire to his reluctant youngest son.",
    'Whiplash': "A promising young drummer enrolls at a cut-throat music conservatory where his dreams of greatness are mentored by an instructor who will stop at nothing to realize a student's potential.",
    'To Be Frank, Sinatra at 100': "The life of Frank Sinatra, as an actor and singer and the steps along the way that led him to become such an icon.",
    'Food Chains': "There is so much interest in food these days yet there is almost no interest in the hands that pick that food. In the US, farm labor has always been one of the most difficult and poorly paid jobs and has relied on some of the nation's most vulnerable people. While the legal restrictions which kept people bound to farms, like slavery, have been abolished, exploitation still exists, ranging from wage theft to modern-day slavery. These days, this exploitation is perpetuated by the corporations at the top of the food chain: supermarkets. Their buying power has kept wages pitifully low and has created a scenario where desperately poor people are willing to put up with anything to keep their jobs.",
}

# Every row whose original_title matches a key gets that overview, whether or not
# the row already had one.
for movie_title, overview_text in overview_by_title.items():
    movies_df.loc[movies_df.original_title == movie_title, 'overview'] = overview_text

# Merge Data

### Credits Drop

In [None]:
# Drop 'title' from the credits data because we do not want this column repeated
# (movies_df already has a 'title'). errors='ignore' makes the cell safe to re-run:
# a second run finds no 'title' column and does nothing instead of raising KeyError.
credits_df = credits_df.drop(columns=['title'], errors='ignore')

In [None]:
# Join each movie with its credits row by matching movies_df['id'] to credits_df['movie_id'].
# No `how` argument is passed, so pandas performs its default inner join.
merged_df = movies_df.merge(credits_df, left_on='id', right_on='movie_id')
merged_df.head(3)

In [None]:
merged_df['production_companies'][0]

# 50% DataFrame

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string
pd.options.mode.chained_assignment = None

In [None]:
# Subset of merged_df columns carried forward into the working frame.
cols_to_use = [
    'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'production_companies',
    'popularity', 'vote_average', 'vote_count',
]
# .copy() makes an independent frame, so later column assignments modify this
# frame directly rather than a selection of merged_df (assigning into such a
# selection is what triggers pandas' SettingWithCopyWarning).
filtered_df_by_50 = merged_df[cols_to_use].copy()
filtered_df_by_50[:5]

## Creating a Text Cleaner

In [None]:
# Initializing Porter Stemmer and Stopwords
ps = PorterStemmer()
# Import the corpus reader under its own alias: the result below is stored in a
# variable named `stopwords`, which would otherwise replace the imported reader
# and make this cell fail when it is run a second time.
from nltk.corpus import stopwords as stopwords_corpus
# NLTK's stopword file ids are lowercase; 'English' is not guaranteed to resolve
# on case-sensitive filesystems. A set gives constant-time `in` checks.
stopwords = set(stopwords_corpus.words('english'))

In [None]:
def overview_formatter(text):
    """Normalise one overview: strip punctuation, lowercase, drop stopwords, stem.

    Relies on the notebook-level ``ps`` (stemmer) and ``stopwords`` (collection
    of words to discard) defined in an earlier cell.

    Args:
        text: The overview string. A non-string value (e.g. NaN for a missing
            overview) is treated as an empty overview.

    Returns:
        The stemmed, stopword-free tokens joined by single spaces; an empty
        string when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Iterating a float NaN would raise TypeError; return an empty document.
        return ''
    # Punctuation is removed character by character, so "dream-sharing" becomes
    # "dreamsharing" rather than two tokens.
    new_text = "".join(char.lower() for char in text if char not in string.punctuation)
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    tokens = re.split(r'\W+', new_text)
    # `if word` skips the empty tokens re.split yields at the edges of the text.
    stem_text = [ps.stem(word) for word in tokens if word and word not in stopwords]
    return ' '.join(stem_text)


filtered_df_by_50['overview'] = filtered_df_by_50['overview'].apply(overview_formatter)

In [None]:
filtered_df_by_50.head(3)

# TFIDF-Vectorizer

In [None]:
# Initialize the vectorizer (no arguments are passed, so its defaults apply)
vectorizer = TfidfVectorizer()

In [None]:
counts = vectorizer.fit_transform(filtered_df_by_50['overview'])
counts

In [None]:
# Display the TF-IDF values as a DataFrame with one column per vocabulary term.
# Despite its name, this DataFrame is dense: .toarray() expands the vectorizer
# output into a full array, zeros included.
sparse_matrix = pd.DataFrame(counts.toarray(), columns=vectorizer.get_feature_names_out())
sparse_matrix.head(10)

In [None]:
sparse_matrix['movie'] = filtered_df_by_50['title']

In [None]:
sparse_matrix

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Split independent and dependent variables
x = sparse_matrix.drop(columns='movie')
y = sparse_matrix['movie']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.30, random_state=42)

In [None]:
# Configure the model. Nothing is trained here: this only constructs the
# estimator, and training happens when .fit() is called on it.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest

In [None]:
random_forest.fit(x_train, y_train)

In [None]:
# Predict a label for every held-out row.
y_pred = random_forest.predict(x_test)

# Fixed: the function is `accuracy_score`; the previous name
# `accuracy_scoreuracy_score` was undefined and raised NameError.
# NOTE(review): the target `y` is the movie title column. If titles are (almost)
# unique, held-out titles never appear in training and this score will be close
# to 0 - confirm this is the intended evaluation.
print(f"The accuracy score is {accuracy_score(y_test, y_pred)}")