In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Load dataset
# NOTE: this is a machine-specific absolute path; point DATASET_PATH at your
# own copy of the CSV before running the notebook elsewhere.
DATASET_PATH = r"C:\Users\nivi1\Downloads\dataset.csv"
movies = pd.read_csv(DATASET_PATH)

# Quick inspection: first rows, numeric summary, schema, missing values, columns.
print(movies.head(10))
print(movies.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only added a stray "None" line to the output.
movies.info()
print(movies.isnull().sum())
print(movies.columns)



         id                        title                     genre  \
0     278.0     The Shawshank Redemption               Drama,Crime   
1   19404.0  Dilwale Dulhania Le Jayenge      Comedy,Drama,Romance   
2     238.0                The Godfather               Drama,Crime   
3     424.0             Schindler's List         Drama,History,War   
4     240.0       The Godfather: Part II               Drama,Crime   
5  667257.0            Impossible Things              Family,Drama   
6     129.0                Spirited Away  Animation,Family,Fantasy   
7  730154.0               Your Eyes Tell             Romance,Drama   
8  372754.0     Dou kyu sei – Classmates         Romance,Animation   
9  372058.0                   Your Name.   Romance,Animation,Drama   

  original_language                                           overview  \
0                en  Framed in the 1940s for the double murder of h...   
1                hi  Raj is a rich, carefree, happy-go-lucky second...   
2      

In [2]:
# Selecting required columns
# .copy() makes the selection an independent frame, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning.
movies = movies[['id', 'title', 'overview', 'genre']].copy()
print(movies)


            id                                          title  \
0        278.0                       The Shawshank Redemption   
1      19404.0                    Dilwale Dulhania Le Jayenge   
2        238.0                                  The Godfather   
3        424.0                               Schindler's List   
4        240.0                         The Godfather: Part II   
...        ...                                            ...   
9995   10196.0                             The Last Airbender   
9996  331446.0                       Sharknado 3: Oh Hell No!   
9997   13995.0                                Captain America   
9998    2312.0  In the Name of the King: A Dungeon Siege Tale   
9999  455957.0                                         Domino   

                                               overview  \
0     Framed in the 1940s for the double murder of h...   
1     Raj is a rich, carefree, happy-go-lucky second...   
2     Spanning the years 1945 to 1955, a c

In [3]:
# Create new column 'tags'
# The tag text is the overview followed by the genre list.
#  * fillna('') keeps a row's tag usable when only one of the two fields is
#    missing (NaN + str would otherwise make the whole tag NaN).
#  * The ' ' separator stops the last word of the overview from fusing with the
#    first genre into a single token.
#  * assign() returns a new frame instead of writing into a possibly derived
#    one, so the cell can be re-run safely.
movies = movies.assign(
    tags=movies['overview'].fillna('') + ' ' + movies['genre'].fillna('')
)
print(movies)

            id                                          title  \
0        278.0                       The Shawshank Redemption   
1      19404.0                    Dilwale Dulhania Le Jayenge   
2        238.0                                  The Godfather   
3        424.0                               Schindler's List   
4        240.0                         The Godfather: Part II   
...        ...                                            ...   
9995   10196.0                             The Last Airbender   
9996  331446.0                       Sharknado 3: Oh Hell No!   
9997   13995.0                                Captain America   
9998    2312.0  In the Name of the King: A Dungeon Siege Tale   
9999  455957.0                                         Domino   

                                               overview  \
0     Framed in the 1940s for the double murder of h...   
1     Raj is a rich, carefree, happy-go-lucky second...   
2     Spanning the years 1945 to 1955, a c

In [4]:
# The raw text columns are now folded into 'tags'; remove them from the
# working frame used by the recommender.
new_data = movies.drop(['overview', 'genre'], axis='columns')
print(new_data)

            id                                          title  \
0        278.0                       The Shawshank Redemption   
1      19404.0                    Dilwale Dulhania Le Jayenge   
2        238.0                                  The Godfather   
3        424.0                               Schindler's List   
4        240.0                         The Godfather: Part II   
...        ...                                            ...   
9995   10196.0                             The Last Airbender   
9996  331446.0                       Sharknado 3: Oh Hell No!   
9997   13995.0                                Captain America   
9998    2312.0  In the Name of the King: A Dungeon Siege Tale   
9999  455957.0                                         Domino   

                                                   tags  
0     Framed in the 1940s for the double murder of h...  
1     Raj is a rich, carefree, happy-go-lucky second...  
2     Spanning the years 1945 to 1955, a chro

In [5]:
# Text vectorization
# Bag-of-words counts restricted to the 10,000 most frequent terms, with
# English stop words removed.
cv = CountVectorizer(max_features=10000, stop_words='english')
# astype('U') coerces every entry (including any NaN) to a unicode string
# before it reaches the vectorizer.
# The result is kept as the sparse matrix returned by fit_transform:
# cosine_similarity accepts sparse input directly, so calling .toarray() would
# only allocate a dense (rows x vocabulary) array that is almost all zeros.
vector = cv.fit_transform(new_data['tags'].values.astype('U'))
print(vector.shape)

(10000, 10000)


In [6]:

# Pairwise cosine similarity between the tag vectors of all movies;
# entry [i][j] is the similarity between row i and row j.
similarity = cosine_similarity(X=vector)
print(similarity)


[[1.         0.05634362 0.12888482 ... 0.07559289 0.11065667 0.06388766]
 [0.05634362 1.         0.07624929 ... 0.         0.03636965 0.        ]
 [0.12888482 0.07624929 1.         ... 0.02273314 0.06655583 0.08645856]
 ...
 [0.07559289 0.         0.02273314 ... 1.         0.03253    0.02817181]
 [0.11065667 0.03636965 0.06655583 ... 0.03253    1.         0.0412393 ]
 [0.06388766 0.         0.08645856 ... 0.02817181 0.0412393  1.        ]]


In [7]:
# Find the index label of the first row whose title is exactly "The Godfather"
is_godfather = new_data['title'] == "The Godfather"
print(new_data.loc[is_godfather].index[0])

2


In [8]:
# Show top 5 recommendations for "The Godfather"
# Resolve the row position from the title rather than hard-coding it, so the
# cell keeps working if the dataset order changes. A position (not an index
# label) is used because the similarity matrix and .iloc are both positional.
godfather_pos = (new_data['title'] == "The Godfather").to_numpy().nonzero()[0][0]

# Rank every movie by its similarity to the chosen one, highest first. The
# movie itself is normally ranked first because its self-similarity is 1.0.
ranked = sorted(
    enumerate(similarity[godfather_pos]),
    key=lambda pair: pair[1],
    reverse=True,
)
for position, _score in ranked[:5]:
    print(new_data['title'].iloc[position])


The Godfather
The Godfather: Part II
Blood Ties
Joker
Bomb City


In [10]:
# Recommender function
def recommand(movies, top_n=5, data=None, scores=None):
    """Print the titles of the movies most similar to a given title.

    Parameters
    ----------
    movies : str
        Exact title to look up in the ``title`` column. (The parameter name is
        kept for backward compatibility; it is a single title.)
    top_n : int, default 5
        Number of titles to print. The queried movie is normally printed
        first, because its similarity with itself is 1.0.
    data : pandas.DataFrame, optional
        Frame with a ``title`` column. Defaults to the notebook-level
        ``new_data``.
    scores : 2-D array-like, optional
        Pairwise similarity matrix whose row/column order matches the row
        order of ``data``. Defaults to the notebook-level ``similarity``.

    Raises
    ------
    IndexError
        If no row has the requested title.
    """
    frame = new_data if data is None else data
    matrix = similarity if scores is None else scores

    # Work with row positions, not index labels: the similarity matrix and
    # .iloc are positional, so labels only agree with them for a default index.
    hits = (frame['title'] == movies).to_numpy().nonzero()[0]
    if len(hits) == 0:
        raise IndexError(f"No movie titled {movies!r} found in the dataset")

    # Rank all movies by similarity to the first match, highest first.
    ranked = sorted(
        enumerate(matrix[hits[0]]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for position, _score in ranked[:top_n]:
        print(frame['title'].iloc[position])

In [11]:
# Test the recommender
recommand("Iron Man")

# Save files with pickle
# Context managers make sure each file is flushed and closed as soon as the
# dump finishes, instead of leaving the handle open until garbage collection.
with open('movies_list.pkl', 'wb') as movies_file:
    pickle.dump(new_data, movies_file)
with open('similarity_list.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

Iron Man
Iron Man 3
Guardians of the Galaxy Vol. 2
Avengers: Age of Ultron
Star Wars: Episode III - Revenge of the Sith
