In [None]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [None]:
# Load the raw movie metadata; the path is relative to the notebook's working directory.
df = pd.read_csv('movies.csv')

## Basic Overview of dataset

In [None]:
# Peek at the first five rows of the raw data.
df.head(n=5)

In [None]:
# Random peek at five rows; a fixed seed keeps the displayed sample
# reproducible across kernel restarts.
df.sample(5, random_state=42)

In [None]:
# Dimensions of the raw dataset as (rows, columns).
df.shape

In [None]:
# Column names available in the raw dataset.
df.columns

## Extracting id and respective poster_path
This will be helpful for showing posters alongside the recommended movie names.

In [None]:
# Two-column lookup table: movie id and its poster path.
poster_paths = df.loc[:, ['id', 'poster_path']]

In [None]:
# Persist the id/poster_path lookup. The context manager guarantees the file
# is flushed and closed, which a bare open() passed to pickle.dump does not.
with open('poster_paths.pkl', 'wb') as poster_file:
    pickle.dump(poster_paths, poster_file)

## Filtering columns 
Keep only the columns that will help in building the recommender system.

In [None]:
# Number of movies per original language, most frequent first.
df['original_language'].value_counts()

In [None]:
# Fraction of rows whose original_language is 'en'.
# Author's observation: more than 75% of the movies are in English, so the
# column is not considered important for the recommender.
len(df[df['original_language'] == 'en']) / len(df)

In [None]:
# Distribution of the `adult` flag across the dataset.
df['adult'].value_counts()

### columns to include
- id
- title
- overview
- genre_names 
- keywords
- cast 
- director

In [None]:
# Independent snapshot of the unfiltered frame, taken before columns are dropped.
df2 = df.copy(deep=True)

In [None]:
# Keep only the columns used to build the recommender. The explicit .copy()
# makes this an independent frame, so the later in-place cleaning and column
# assignments do not raise SettingWithCopyWarning.
df = df[['id', 'title', 'overview', 'genre_names', 'keywords', 'cast', 'director']].copy()

## Handling duplicate values

In [None]:
# Number of rows that exactly repeat an earlier row across all kept columns.
df.duplicated().sum()

In [None]:
# Inspect the rows flagged as exact duplicates.
df.loc[df.duplicated()]

In [None]:
# Drop exact duplicate rows. Rebinding instead of inplace=True avoids mutating
# a frame that may be a slice of the raw data and keeps the cell chainable.
df = df.drop_duplicates()

In [None]:
# Number of rows that share both title and director with an earlier row.
df.duplicated(subset=['title', 'director']).sum()

In [None]:
# Remove rows that repeat an earlier (title, director) pair, keeping the first.
# The previous form called drop_duplicates(inplace=True) on a temporary
# two-column selection, which left `df` itself unchanged.
df = df.drop_duplicates(subset=['title', 'director'])

## Handling missing values

In [None]:
# Missing-value count per column before cleaning.
df.isnull().sum()

In [None]:
# Rows with no title.
df.loc[df['title'].isnull()]

In [None]:
# Rows with no genre information.
df.loc[df['genre_names'].isnull()]

In [None]:
# Rows with no keywords.
df.loc[df['keywords'].isnull()]

In [None]:
# Drop every row with a missing value, then renumber the index 0..n-1.
# Later cells use index labels as row positions (e.g. `df['cast'][0]` and the
# lookup into the similarity matrix), which is only valid when labels and
# positions coincide.
df = df.dropna().reset_index(drop=True)

In [None]:
# Re-check missing values per column after cleaning.
df.isnull().sum()

## Formatting textual columns into lists

In [None]:
# Raw genre_names value of the first remaining row. .iloc is positional, so it
# still works if the row labelled 0 was removed during cleaning.
df['genre_names'].iloc[0]

In [None]:
# Demo: ast.literal_eval parses a string holding a Python list literal into a real list.
ast.literal_eval("['Action', 'Comedy', 'Science Fiction']")

In [None]:
# Raw keywords value of the first remaining row. .iloc is positional, so it
# still works if the row labelled 0 was removed during cleaning.
df['keywords'].iloc[0]

In [None]:
# Raw cast value of the first remaining row. .iloc is positional, so it
# still works if the row labelled 0 was removed during cleaning.
df['cast'].iloc[0]

In [None]:
# Raw director value of the first remaining row. .iloc is positional, so it
# still works if the row labelled 0 was removed during cleaning.
df['director'].iloc[0]

In [None]:
def create_list(name):
    """Wrap a single value in a new one-element list.

    Used to give a scalar column the same list shape as the parsed list
    columns so they can be concatenated.
    """
    wrapped = list()
    wrapped.append(name)
    return wrapped

### Creating lists from all the textual columns

In [None]:
# These columns are parsed with ast.literal_eval (presumably they hold
# stringified Python lists, as in the demo above).
# NOTE: re-running this cell on already-parsed data will fail.
for column in ('genre_names', 'keywords', 'cast'):
    df[column] = df[column].apply(ast.literal_eval)

# The director is a single value: wrap it so every feature column is a list.
df['director'] = df['director'].apply(create_list)

# Split the overview into whitespace-separated tokens.
df['overview'] = df['overview'].apply(lambda text: text.split())

### Removing spaces inside multi-word entries so that each keeps its meaning as a single token

In [None]:
def collapse(L):
    """Return a new list with every space removed from each string in `L`.

    For example, 'Science Fiction' becomes 'ScienceFiction', so a multi-word
    name is later treated as one token. The input list is not modified.
    """
    return [entry.replace(' ', '') for entry in L]

In [None]:
# Preview only: genre names with inner spaces removed (result is not stored).
df['genre_names'].map(collapse)

In [None]:
# Preview only: keywords with inner spaces removed (result is not stored).
df['keywords'].map(collapse)

In [None]:
# Preview only: cast names with inner spaces removed (result is not stored).
df['cast'].map(collapse)

In [None]:
# Strip inner spaces from every entry of each list-valued feature column.
for column in ('overview', 'genre_names', 'keywords', 'cast', 'director'):
    df[column] = df[column].apply(collapse)

In [None]:
# Concatenate the per-row lists into one combined token list per movie.
df['tags'] = (
    df['overview']
    + df['genre_names']
    + df['keywords']
    + df['cast']
    + df['director']
)

### Creating new dataframe by dropping columns which are already included into tags

In [None]:
# The individual feature columns are now merged into `tags`, so drop them.
new_df = df.drop(
    ['overview', 'genre_names', 'keywords', 'cast', 'director'],
    axis=1,
)

In [None]:
# Show a few rows rather than rendering the whole frame.
new_df.head()

In [None]:
# Flatten each token list into a single space-separated string.
new_df['tags'] = new_df['tags'].apply(' '.join)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [None]:
# Fetch the NLTK stop-word corpus used by stopwords.words('english') below.
nltk.download('stopwords')

In [None]:
# English stop words held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Shared Porter stemmer instance used by preprocess_text.
stemmer = PorterStemmer()

def preprocess_text(text):
    """Remove stop words from `text` and stem the remaining tokens.

    The text is split on whitespace. A token is discarded when its lowercase
    form is in the module-level `stop_words`; every other token is passed to
    the module-level `stemmer`. The stems are returned joined by single spaces.
    """
    kept_stems = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))
    return ' '.join(kept_stems)

In [None]:
# Stop-word-filtered, stemmed version of each movie's tag string.
new_df['processed_tags'] = new_df['tags'].map(preprocess_text)

## Vectorizing strings

In [None]:
# Bag-of-words vectoriser capped at 5000 features, using scikit-learn's
# built-in English stop-word list.
cv = CountVectorizer(
    stop_words='english',
    max_features=5000,
)

In [None]:
# Learn the vocabulary from the processed tags and build a dense
# document-term count matrix (one row per movie).
vector = (
    cv.fit_transform(new_df['processed_tags'])
    .toarray()
)

In [None]:
# Dense count matrix: one row per movie, one column per vocabulary term.
vector

In [None]:
# (number of movies, vocabulary size; at most 5000 because of max_features).
vector.shape

In [None]:
# Pairwise cosine similarity between every pair of movie tag vectors.
similarity = cosine_similarity(X=vector)

In [None]:
# Square matrix: entry [i, j] is the cosine similarity between rows i and j of `vector`.
similarity

In [None]:
# Index label of the first row titled 'The Lego Movie'. This is a label, not a
# row position; the two differ if rows were dropped without resetting the index.
new_df.loc[new_df['title'] == 'The Lego Movie'].index[0]

In [None]:
def recommend(movie):
    """Print the titles of the five movies most similar to `movie`.

    Reads the module-level `new_df` (must have a 'title' column) and
    `similarity` (square matrix whose row/column i corresponds to the i-th
    row of `new_df`).

    Parameters
    ----------
    movie : str
        Exact title to look up; the first matching row is used.

    Raises
    ------
    IndexError
        If no row of `new_df` has this title.
    """
    # Use the row *position* of the match, not its index label: `similarity`
    # is a plain array indexed by position, and labels stop matching positions
    # once rows have been dropped from the frame.
    matches = np.flatnonzero((new_df['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie {movie!r} not found in new_df['title']")
    position = int(matches[0])

    scores = similarity[position]
    # Exclude the query movie explicitly instead of assuming it sorts first;
    # a row with an identical tag vector would otherwise displace it.
    ranked = sorted(
        (i for i in range(len(scores)) if i != position),
        key=lambda i: scores[i],
        reverse=True,
    )

    titles = new_df['title']
    for i in ranked[:5]:
        print(titles.iloc[i])

In [None]:
# Example: print recommendations for one title.
recommend(movie='Deadpool & Wolverine')

In [None]:
import pickle

In [None]:
# Persist the processed movie table and the similarity matrix. Context
# managers guarantee each file is flushed and closed after writing.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(new_df, movies_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

### Processing for fetching poster path

In [None]:
# Look up the poster path for one movie id. Comparing on the string form of
# the id makes the match work whether the column was read as integers or as
# strings (NOTE(review): the actual dtype of `id` is not visible here).
poster_paths[poster_paths['id'].astype(str) == '704239']['poster_path'].values[0]

In [None]:
# Show the processed row for the same movie id. Comparing on the string form
# of the id makes the match work whether the column holds integers or strings
# (NOTE(review): the actual dtype of `id` is not visible here).
new_df[new_df['id'].astype(str) == '704239']