In [9]:
import pandas as pd
import numpy as np
import re
import nltk
# show every column when a DataFrame is displayed (None lifts the column limit)
pd.set_option('display.max_columns', None)

In [None]:

# Read the movie metadata CSV; the path is relative to the notebook's
# working directory.
DATA_FILE = "IMDB_Top250Engmovies2_OMDB_Detailed.csv"
df = pd.read_csv(DATA_FILE)
df.head()

In [11]:
# number of rows (movies) loaded from the CSV
len(df)

250

In [12]:
# peek at the raw plot text of the row labelled 0
df['Plot'][0]

'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.'

In [13]:
# Normalise the plot text: lower-case it, turn every non-letter (digits,
# punctuation, symbols) into a space, then collapse runs of whitespace.
# - Missing plots become '' instead of raising inside a regex call.
# - Patterns are raw strings: '\s' in a normal string literal is an invalid
#   escape sequence and triggers a warning on modern Python.
# - After lower-casing only a-z can remain, so the character class needs no A-Z.
df['clean_plot'] = (
    df['Plot']
    .fillna('')
    .str.lower()
    .str.replace(r'[^a-z]', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)
df['clean_plot']

0      two imprisoned men bond over a number of years...
1      the aging patriarch of an organized crime dyna...
2      the early life and career of vito corleone in ...
3      when the menace known as the joker emerges fro...
4      a jury holdout attempts to prevent a miscarria...
                             ...                        
245    the desperate life of a chronic alcoholic is f...
246    a something supervising staff member of a resi...
247    a newspaper editor uses every trick in the boo...
248    an old man makes a long journey by lawn mover ...
249    a mumbai teen reflects on his upbringing in th...
Name: clean_plot, Length: 250, dtype: object

In [None]:
import nltk
# Tokenizer models used by nltk.word_tokenize. Recent NLTK releases look up
# the 'punkt_tab' resource instead of the older pickled 'punkt' models, so
# request both; a resource unknown to the installed version is reported by
# nltk.download without raising.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# split each cleaned plot string into a list of word tokens
df['clean_plot'] = [nltk.word_tokenize(text) for text in df['clean_plot']]
df['clean_plot']

In [None]:
import nltk
# fetch the stopword corpus that nltk.corpus.stopwords reads from
nltk.download('stopwords')

In [None]:
# Remove English stopwords and very short tokens from every tokenised plot.
MIN_WORD_LENGTH = 3  # tokens shorter than this are dropped

stop_words = nltk.corpus.stopwords.words('english')
# The corpus comes back as a list; a frozenset makes each membership test
# constant-time instead of a scan over the whole stopword list.
stop_word_lookup = frozenset(stop_words)

plot = [
    [
        word
        for word in sentence
        if word not in stop_word_lookup and len(word) >= MIN_WORD_LENGTH
    ]
    for sentence in df['clean_plot']
]
plot[0]

In [None]:
# replace the tokenised plots with their stopword-filtered versions
df['clean_plot'] = plot

df.head()

In [None]:
# Turn the comma-separated text columns into lists. Genre and Director keep
# every entry (a slice bound of None keeps the whole list); Actors is capped
# at its first four entries.
for column, limit in (('Genre', None), ('Actors', 4), ('Director', None)):
    df[column] = df[column].apply(lambda cell, limit=limit: cell.split(',')[:limit])

In [None]:
# inspect the actor list of the row labelled 0 after splitting (at most four entries)
df['Actors'][0]

In [None]:
def clean(sentence):
    """Normalise a sequence of strings into single lower-case tokens.

    Each entry is lower-cased and has all of its space characters removed, so
    a multi-word name such as 'Tim Robbins' collapses into 'timrobbins'.

    Args:
        sentence: Iterable of strings.

    Returns:
        A new list with one normalised string per input entry, in order.
    """
    return [entry.lower().replace(' ', '') for entry in sentence]

In [None]:
# normalise every entry of the list-valued columns with clean()
for column in ('Genre', 'Actors', 'Director'):
    df[column] = [clean(entries) for entries in df[column]]

In [None]:
# same row after clean(): entries are lower-cased with their spaces removed
df['Actors'][0]     

In [None]:
# Build one text document per movie by concatenating the tokens of the plot,
# genre, actor and director columns. Each column's tokens are joined with
# spaces and followed by a single space, so every document ends with a
# trailing space.
columns = ['clean_plot', 'Genre', 'Actors', 'Director']

# Walk the columns in parallel by position. Label lookups such as df[col][i]
# over range(len(df)) only work while the row labels are exactly 0..n-1 and
# cost one Series lookup per cell.
l = [
    ''.join(' '.join(tokens) + ' ' for tokens in row)
    for row in zip(*(df[col] for col in columns))
]

# preview a few documents instead of printing the whole list
l[:5]

In [None]:
# attach the combined text, then narrow the frame to the two columns used
# from here on
df['clean_input'] = l
keep_columns = ['Title', 'clean_input']
df = df[keep_columns]
df.head()

In [None]:
# NOTE(review): CountVectorizer is imported here but not used in this cell
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# learn a TF-IDF vocabulary from the combined text and encode every movie's
# document with it
tfidf = TfidfVectorizer()
features = tfidf.fit_transform(df['clean_input'])

In [None]:
# Pairwise cosine similarity between the TF-IDF rows: entry [i, j] scores
# movie i against movie j.
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(X=features, Y=features)
print(cosine_sim)

In [None]:
# Series of titles built from df['Title']; recommend_movies searches it to
# find the row belonging to a given title
index = pd.Series(df['Title'])
index.head()

In [None]:
def recommend_movies(title):
    """Return the ten movies most similar to ``title``.

    The title is looked up in the module-level ``index`` Series, every movie
    is ranked by the matching row of the module-level ``cosine_sim`` matrix,
    and the best matches are read from ``df['Title']``.

    Args:
        title: Movie title to look up. An exact match is preferred; if there
            is none, the comparison is retried ignoring case and surrounding
            whitespace.

    Returns:
        A tuple of exactly ten strings, best match first. When fewer than ten
        other movies are available the remaining slots are empty strings, so
        callers that unpack ten values keep working.

    Raises:
        IndexError: If no movie matches ``title``.
    """
    n_recommendations = 10

    # Work with positions rather than row labels: cosine_sim is a plain array
    # and must be addressed by position even if the labels are not 0..n-1.
    titles = list(index)
    matches = [pos for pos, name in enumerate(titles) if name == title]
    if not matches:
        wanted = str(title).strip().casefold()
        matches = [
            pos for pos, name in enumerate(titles)
            if str(name).strip().casefold() == wanted
        ]
    if not matches:
        # IndexError is what the unguarded lookup raised for an unknown title;
        # keep the type and add a message that says what went wrong.
        raise IndexError('Movie title {!r} was not found'.format(title))
    pos = matches[0]

    score = pd.Series(cosine_sim[pos]).sort_values(ascending=False)

    # Drop the query movie explicitly. Skipping the first ranked entry is only
    # correct when the query is ranked first, which a tie at the top score
    # (e.g. two movies with identical text) does not guarantee.
    ranked = [i for i in score.index if i != pos][:n_recommendations]

    movies = [df['Title'].iloc[i] for i in ranked]
    # pad so the result always has ten entries, even for tiny datasets
    movies += [''] * (n_recommendations - len(movies))
    return tuple(movies)

In [None]:
# sample query: recommendations for one title
recommend_movies('Strangers on a Train')

In [None]:
!pip install -q gradio
import gradio as gr

In [None]:
# Gradio UI: one text input (the movie title) and ten text outputs, one per
# value returned by recommend_movies.
reccomend = gr.Interface(
    fn=recommend_movies,
    inputs="text",
    outputs=["text"] * 10,
    title="Predict your next binge",
)

In [None]:
# start the Gradio interface
reccomend.launch(debug=False)