# Initial Import

In [1]:
import pandas as pd
import numpy as np

from string import punctuation
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel, linear_kernel, cosine_similarity

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Data Import

In [8]:
df = pd.read_excel('./tmdb_movies_may_20_clean.xlsx')

In [9]:
df

Unnamed: 0,id,original_title,popularity,adult,overview,releaseDate,poster_path,genres
0,419704.0,Ad Astra,517.169,0.0,The near future a time when both hope and har...,2019-09-17,https://image.tmdb.org/t/p/w185_and_h278_bestv...,Drama Science Fiction
1,338762.0,Bloodshot,183.316,0.0,After he and his wife are murdered marine Ray...,2020-03-05,https://image.tmdb.org/t/p/w185_and_h278_bestv...,Action Science Fiction
2,385103.0,Scoob!,144.906,0.0,In Scooby-Doo’s greatest adventure yet see th...,2020-05-15,https://image.tmdb.org/t/p/w185_and_h278_bestv...,Adventure Animation Comedy Mystery Family
3,576156.0,The Lovebirds,118.500,0.0,A couple experiences a defining moment in thei...,2020-05-22,https://image.tmdb.org/t/p/w185_and_h278_bestv...,Action Comedy Romance
4,686245.0,Survive the Night,97.197,0.0,A disgraced doctor and his family are held hos...,2020-05-22,https://image.tmdb.org/t/p/w185_and_h278_bestv...,Action Thriller
...,...,...,...,...,...,...,...,...
9995,165904.0,?????? 26,10.198,0.0,In the early 1980s in India a group of con ar...,2013-02-07,https://image.tmdb.org/t/p/w185_and_h278_bestv...,Crime Drama Thriller
9996,14940.0,Gigantic,6.556,0.0,Young mattress salesman Brian decides to adopt...,2008-09-09,https://image.tmdb.org/t/p/w185_and_h278_bestv...,Comedy Romance
9997,11112.0,The Associate,8.194,0.0,Laurel Ayres is a businesswoman trying to make...,1996-10-25,https://image.tmdb.org/t/p/w185_and_h278_bestv...,Comedy
9998,11689.0,I due superpiedi quasi piatti,8.943,0.0,An attempted robbery turns to be an unexpected...,1977-04-01,https://image.tmdb.org/t/p/w185_and_h278_bestv...,Action Comedy


In [10]:
df.columns

Index(['id', 'original_title', 'popularity', 'adult', 'overview',
       'releaseDate', 'poster_path', 'genres'],
      dtype='object')

# Selecting features

In [11]:
df_sel = df[['original_title', 'overview', 'genres']]

In [12]:
df_sel

Unnamed: 0,original_title,overview,genres
0,Ad Astra,The near future a time when both hope and har...,Drama Science Fiction
1,Bloodshot,After he and his wife are murdered marine Ray...,Action Science Fiction
2,Scoob!,In Scooby-Doo’s greatest adventure yet see th...,Adventure Animation Comedy Mystery Family
3,The Lovebirds,A couple experiences a defining moment in thei...,Action Comedy Romance
4,Survive the Night,A disgraced doctor and his family are held hos...,Action Thriller
...,...,...,...
9995,?????? 26,In the early 1980s in India a group of con ar...,Crime Drama Thriller
9996,Gigantic,Young mattress salesman Brian decides to adopt...,Comedy Romance
9997,The Associate,Laurel Ayres is a businesswoman trying to make...,Comedy
9998,I due superpiedi quasi piatti,An attempted robbery turns to be an unexpected...,Action Comedy


In [13]:
df_sel['text'] = df_sel['overview'] + ' ' + df_sel['genres']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [14]:
df_sel

Unnamed: 0,original_title,overview,genres,text
0,Ad Astra,The near future a time when both hope and har...,Drama Science Fiction,The near future a time when both hope and har...
1,Bloodshot,After he and his wife are murdered marine Ray...,Action Science Fiction,After he and his wife are murdered marine Ray...
2,Scoob!,In Scooby-Doo’s greatest adventure yet see th...,Adventure Animation Comedy Mystery Family,In Scooby-Doo’s greatest adventure yet see th...
3,The Lovebirds,A couple experiences a defining moment in thei...,Action Comedy Romance,A couple experiences a defining moment in thei...
4,Survive the Night,A disgraced doctor and his family are held hos...,Action Thriller,A disgraced doctor and his family are held hos...
...,...,...,...,...
9995,?????? 26,In the early 1980s in India a group of con ar...,Crime Drama Thriller,In the early 1980s in India a group of con ar...
9996,Gigantic,Young mattress salesman Brian decides to adopt...,Comedy Romance,Young mattress salesman Brian decides to adopt...
9997,The Associate,Laurel Ayres is a businesswoman trying to make...,Comedy,Laurel Ayres is a businesswoman trying to make...
9998,I due superpiedi quasi piatti,An attempted robbery turns to be an unexpected...,Action Comedy,An attempted robbery turns to be an unexpected...


# Data Cleaning

In [15]:
df_sel.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [16]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
stopwords = set(stopwords.words('english'))

In [18]:
text = df_sel['text'].values[0]

In [19]:
def clean_text(text):
    ctext = ''.join([c for c in text.lower() if c not in punctuation])
#     ctext = ' '.join([word for word in ctext.split(' ') if word not in stopwords])
#     return " ".join(ctext.split())
    return ctext

In [20]:
clean_text(text)

'the near future  a time when both hope and hardships drive humanity to look to the stars and beyond while a mysterious phenomenon menaces to destroy life on planet earth  astronaut roy mcbride undertakes a mission across the immensity of space and its many perils to uncover the truth about a lost expedition that decades before boldly faced emptiness and silence in search of the unknown drama science fiction'

In [21]:
df_sel['clean_text'] = df_sel['text'].apply(lambda x: clean_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
df_sel = df_sel[['original_title', 'clean_text']]

In [28]:
df_sel['original_title'] = df_sel['original_title'].astype('str')
df_sel['clean_text'] = df_sel['clean_text'].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [29]:
df_sel['original_title'] = df_sel['original_title'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [30]:
df_sel.head(2)

Unnamed: 0,original_title,clean_text
0,ad astra,the near future a time when both hope and har...
1,bloodshot,after he and his wife are murdered marine ray...


In [31]:
df_sel['clean_text'].values[1]

'after he and his wife are murdered  marine ray garrison is resurrected by a team of scientists enhanced with nanotechnology  he becomes a superhuman  biotech killing machine—bloodshot as ray first trains with fellow supersoldiers  he cannot recall anything from his former life but when his memories flood back and he remembers the man that killed both him and his wife  he breaks out of the facility to get revenge  only to discover that theres more to the conspiracy than he thought action science fiction'

# Tf-Idf Transformer

In [140]:
tfidf = TfidfVectorizer(analyzer='word', min_df=0, stop_words='english')

In [141]:
vec_arr = tfidf.fit_transform(df_sel['clean_text'])

In [142]:
vec_arr.shape

(9963, 31949)

# Cosine Similarity

In [156]:
sim_mat = cosine_similarity(vec_arr, vec_arr)

In [157]:
sim_mat.shape

(9963, 9963)

In [190]:
# smd = smd.reset_index()
titles = df_sel['original_title']
indices = pd.Series(df_sel.index, index=df_sel['original_title'])

In [159]:
df_sel[df_sel['original_title']=='ad astra']

Unnamed: 0,original_title,clean_text
0,ad astra,the near future a time when both hope and har...


In [160]:
indices[indices == 100].index[0]

'deadpool'

In [161]:
indices['the dark knight']

30

In [162]:
indices

original_title
ad astra                            0
bloodshot                           1
scoob!                              2
the lovebirds                       3
survive the night                   4
                                 ... 
?????? 26                        9995
gigantic                         9996
the associate                    9997
i due superpiedi quasi piatti    9998
the prophecy 3: the ascent       9999
Length: 9963, dtype: int64

In [163]:
sim_mat[100]

array([0.00504064, 0.01469436, 0.02583184, ..., 0.01551424, 0.00990835,
       0.        ])

In [164]:
indices['dunkirk']

321

# Getting a recommendation

In [221]:
def get_recommendations(title):
    idx = indices[title]
    print(idx)
    sim_scores = list(enumerate(sim_mat[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]
    print(sim_scores)
    print(movie_indices)
    return titles.iloc[movie_indices].values

In [222]:
get_recommendations('1917'.lower())

9
[(8438, 0.3275894920348251), (5952, 0.3107936905092054), (856, 0.2680565804690932), (8213, 0.23596399613181512), (8882, 0.22771704963955441)]
[8438, 5952, 856, 8213, 8882]


array(['mediterraneo', 'saints and soldiers', 'we were soldiers',
       "kelly's heroes", '??????'], dtype=object)

In [167]:
indices[indices == 'the green mile']

  res_values = method(rvalues)


Series([], dtype: int64)

# Saving similarity matrix

In [169]:
# indices.to_csv('./indices.csv')
np.save('./movies_sim_may20.npy', sim_mat)

In [170]:
indices.to_frame('index').reset_index().to_csv('./indices_may20.csv')

In [171]:
ind_df = indices.to_frame('index').reset_index()

In [172]:
list(ind_df[ind_df['index'].isin([2342, 2121, 2005, 2217])]['original_title'].values)

['two weeks notice', 'the fifth estate', 'runaway jury', 'pathfinder']

In [194]:
indices.to_csv('temp.csv')

In [211]:
sim_mat.shape

(9963, 9963)

In [212]:
indices.shape

(9963,)

In [213]:
ind_df.shape

(9963, 2)

In [214]:
ind_df.describe()

Unnamed: 0,index
count,9963.0
mean,5001.107598
std,2885.835173
min,0.0
25%,2502.5
50%,5001.0
75%,7499.5
max,9999.0


In [215]:
max(indices)

9999

In [216]:
ind_df

Unnamed: 0,original_title,index
0,ad astra,0
1,bloodshot,1
2,scoob!,2
3,the lovebirds,3
4,survive the night,4
...,...,...
9958,?????? 26,9995
9959,gigantic,9996
9960,the associate,9997
9961,i due superpiedi quasi piatti,9998


In [217]:
indices

original_title
ad astra                            0
bloodshot                           1
scoob!                              2
the lovebirds                       3
survive the night                   4
                                 ... 
?????? 26                        9995
gigantic                         9996
the associate                    9997
i due superpiedi quasi piatti    9998
the prophecy 3: the ascent       9999
Length: 9963, dtype: int64

In [223]:
ind_df.iloc[856]

original_title    we were soldiers
index                          862
Name: 856, dtype: object

In [224]:
movie_indices = [8438, 5952, 856, 8213, 8882]

In [229]:
list(ind_df.loc[movie_indices]['original_title'].values)

['mediterraneo',
 'saints and soldiers',
 'we were soldiers',
 "kelly's heroes",
 '??????']