In [1]:
import numpy as np
import pandas as pd
#stemming class from nltk
from nltk.stem.porter import PorterStemmer
#count vectorizor
from sklearn.feature_extraction.text import CountVectorizer
#cosine similarties to calculate the similarty measure between movies
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("movies_dataset.csv")

In [3]:
df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [4]:
#tag column overview: brief description, genre, actors and director
df['tags'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

In [5]:
# Get stemming class from nltk library. Will need nltk library installed. pip install nltk
ps = PorterStemmer()

In [6]:
#defining the helper stemming function
def stem(text):
    y=[]
    
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [7]:
ps.stem("programmers")

'programm'

In [8]:
# choose some words to be stemmed
#stemming example. A stemmeing algorithm reduces the words to the root/base word.
words = ["program", "programs", "programmer", "programming", "programmers"]
 
for w in words:
    print(w, " : ", ps.stem(w))

program  :  program
programs  :  program
programmer  :  programm
programming  :  program
programmers  :  programm


In [9]:
#Iterrate over rows.
for index,row in df.iterrows():
    #print(index)
    df.loc[index, 'tags'] = stem(row['tags'])

In [10]:
cv = CountVectorizer(max_features=5000,stop_words='english')
vectors = cv.fit_transform(df['tags']).toarray()

In [11]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [12]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [13]:
cv.get_feature_names()



['000',
 '007',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '17th',
 '18',
 '18th',
 '18thcenturi',
 '19',
 '1910',
 '1920',
 '1930',
 '1940',
 '1944',
 '1950',
 '1950s',
 '1960',
 '1960s',
 '1970',
 '1970s',
 '1971',
 '1974',
 '1976',
 '1980',
 '1985',
 '1990',
 '1999',
 '19th',
 '19thcenturi',
 '20',
 '200',
 '2003',
 '2009',
 '20th',
 '21st',
 '23',
 '24',
 '25',
 '30',
 '300',
 '3d',
 '40',
 '50',
 '500',
 '60',
 '70',
 '80',
 'aaron',
 'aaroneckhart',
 'abandon',
 'abduct',
 'abigailbreslin',
 'abil',
 'abl',
 'aboard',
 'abov',
 'abus',
 'academ',
 'academi',
 'accept',
 'access',
 'accid',
 'accident',
 'acclaim',
 'accompani',
 'accomplish',
 'account',
 'accus',
 'ace',
 'achiev',
 'acquaint',
 'act',
 'action',
 'actionhero',
 'activ',
 'activist',
 'activities',
 'actor',
 'actress',
 'actual',
 'ad',
 'adam',
 'adamsandl',
 'adamshankman',
 'adapt',
 'add',
 'addict',
 'adjust',
 'admir',
 'admit',
 'adolesc',
 'adopt',
 'ador',
 'adrienbrodi',
 'adult'

In [14]:
similarity = cosine_similarity(vectors)

In [15]:
similarity[0]

array([1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
       0.        ])

In [16]:
similarity[1]

array([0.08346223, 1.        , 0.06063391, ..., 0.02378257, 0.        ,
       0.02615329])

In [17]:
similarity.shape

(4806, 4806)

In [18]:
df[df['title'].str.contains("The Lord of the Rings: The Fellowship")]

Unnamed: 0,movie_id,title,tags
262,120,The Lord of the Rings: The Fellowship of the Ring,"young hobbit frodo baggins, after inherit a my..."


In [19]:
df[df['title'].str.contains("The Lord of the Rings: The Fellowship")].index[0]

262

In [20]:
sorted(similarity[262])

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [21]:
for count, value in enumerate(similarity[262]):
    print(count, value)

0 0.10650358071057624
1 0.09007546982220897
2 0.06963575181639443
3 0.05947010334500525
4 0.15322409370161832
5 0.10527936095153945
6 0.04885967564883423
7 0.1385663269313568
8 0.09589266029707681
9 0.07829602926862712
10 0.14532958974404492
11 0.15322409370161832
12 0.1462979547282512
13 0.03641785203646149
14 0.12456821978060995
15 0.13561270072416204
16 0.042601432284230495
17 0.14546434136292807
18 0.12294604428906897
19 0.35305338554802046
20 0.2583768965864695
21 0.1553638665664663
22 0.3484365738222833
23 0.11791665765914455
24 0.12951986753290365
25 0.041293962759267534
26 0.04152273992686998
27 0.11768446184934013
28 0.04716666306365782
29 0.07018624063435963
30 0.16232693100931947
31 0.051502620262460476
32 0.12068965517241377
33 0.07878385971583353
34 0.0
35 0.13401406685472436
36 0.10012021643290914
37 0.10650358071057624
38 0.09847982464479191
39 0.13316902848554255
40 0.12456821978060995
41 0.13407108198891604
42 0.018952451089472584
43 0.05515973485146915
44 0.1368964195

1311 0.07530940414109723
1312 0.0
1313 0.0719194952228076
1314 0.02865341275306264
1315 0.133407469193014
1316 0.04438967616184752
1317 0.05311606738473832
1318 0.0
1319 0.03914801463431356
1320 0.027379283909669674
1321 0.02865341275306264
1322 0.1714152727328595
1323 0.15061880828219446
1324 0.06755660236665673
1325 0.0863465783552691
1326 0.02626128657194451
1327 0.08398387664337813
1328 0.0
1329 0.04962916669854651
1330 0.06189844605901729
1331 0.12174373330944385
1332 0.15130859380629447
1333 0.06963575181639443
1334 0.09284766908852592
1335 0.02680281337094487
1336 0.0
1337 0.07340252743933792
1338 0.024814583349273254
1339 0.0
1340 0.07661204685080916
1341 0.042051713353118
1342 0.05808023194967747
1343 0.05475856781933935
1344 0.08104408984731078
1345 0.12320677759351353
1346 0.06465081838352359
1347 0.12407291674636627
1348 0.03012376165643889
1349 0.11744404390294068
1350 0.05730682550612528
1351 0.06755660236665673
1352 0.015807436935466976
1353 0.051502620262460476
1354 0.0

2665 0.05872202195147034
2666 0.024814583349273254
2667 0.0
2668 0.04876598490941707
2669 0.084103426706236
2670 0.021300716142115247
2671 0.0
2672 0.05598925109558542
2673 0.07661204685080916
2674 0.13130643285972254
2675 0.03641785203646149
2676 0.0
2677 0.0
2678 0.03607265133540433
2679 0.10821795400621297
2680 0.03541071158982555
2681 0.047946330148538406
2682 0.09925833339709302
2683 0.10793322294408637
2684 0.024382992454708534
2685 0.0
2686 0.08304547985373996
2687 0.02680281337094487
2688 0.13085286869662258
2689 0.11259433727776122
2690 0.024814583349273254
2691 0.030949223029508643
2692 0.13407108198891604
2693 0.05515973485146915
2694 0.03012376165643889
2695 0.09104463009115371
2696 0.02285751604412472
2697 0.10253309789938275
2698 0.02865341275306264
2699 0.0
2700 0.07829602926862712
2701 0.07744030926623663
2702 0.0
2703 0.0
2704 0.0
2705 0.2076136996343499
2706 0.0
2707 0.022518867455552243
2708 0.031846487764924096
2709 0.05938556868486719
2710 0.019360077316559157
2711

4017 0.08143279274805704
4018 0.08040844011283461
4019 0.053055810907729095
4020 0.0
4021 0.03448275862068965
4022 0.16162704595880895
4023 0.0
4024 0.10777404608287845
4025 0.02865341275306264
4026 0.0
4027 0.027379283909669674
4028 0.1576320721888657
4029 0.02626128657194451
4030 0.0
4031 0.0
4032 0.0
4033 0.032826608214930636
4034 0.03468266034531822
4035 0.0
4036 0.03790490217894517
4037 0.1142875802206236
4038 0.0
4039 0.09284766908852592
4040 0.031846487764924096
4041 0.0
4042 0.0
4043 0.016162704595880897
4044 0.0
4045 0.020261022461827694
4046 0.024382992454708534
4047 0.12156613477096616
4048 0.07684127768066812
4049 0.0
4050 0.02587601454159973
4051 0.06724826391390865
4052 0.0
4053 0.0
4054 0.0
4055 0.08753762190648169
4056 0.0
4057 0.0
4058 0.022518867455552243
4059 0.05311606738473832
4060 0.0
4061 0.16081688022566923
4062 0.0
4063 0.0
4064 0.0
4065 0.12951986753290365
4066 0.06024752331287778
4067 0.1114172029062311
4068 0.08934271123648291
4069 0.12456821978060993
4070 0

In [22]:
list(enumerate(similarity[262]))

[(0, 0.10650358071057624),
 (1, 0.09007546982220897),
 (2, 0.06963575181639443),
 (3, 0.05947010334500525),
 (4, 0.15322409370161832),
 (5, 0.10527936095153945),
 (6, 0.04885967564883423),
 (7, 0.1385663269313568),
 (8, 0.09589266029707681),
 (9, 0.07829602926862712),
 (10, 0.14532958974404492),
 (11, 0.15322409370161832),
 (12, 0.1462979547282512),
 (13, 0.03641785203646149),
 (14, 0.12456821978060995),
 (15, 0.13561270072416204),
 (16, 0.042601432284230495),
 (17, 0.14546434136292807),
 (18, 0.12294604428906897),
 (19, 0.35305338554802046),
 (20, 0.2583768965864695),
 (21, 0.1553638665664663),
 (22, 0.3484365738222833),
 (23, 0.11791665765914455),
 (24, 0.12951986753290365),
 (25, 0.041293962759267534),
 (26, 0.04152273992686998),
 (27, 0.11768446184934013),
 (28, 0.04716666306365782),
 (29, 0.07018624063435963),
 (30, 0.16232693100931947),
 (31, 0.051502620262460476),
 (32, 0.12068965517241377),
 (33, 0.07878385971583353),
 (34, 0.0),
 (35, 0.13401406685472436),
 (36, 0.100120216432

In [23]:
sorted(list(enumerate(similarity[262])),reverse=True)

[(4805, 0.04004808657316366),
 (4804, 0.07503224734841288),
 (4803, 0.018208926018230744),
 (4802, 0.03790490217894517),
 (4801, 0.14024885950870325),
 (4800, 0.16985121921200538),
 (4799, 0.021300716142115247),
 (4798, 0.13675713199027156),
 (4797, 0.0),
 (4796, 0.031846487764924096),
 (4795, 0.05685735326841775),
 (4794, 0.15161960871578065),
 (4793, 0.0),
 (4792, 0.0),
 (4791, 0.02321191727213148),
 (4790, 0.04716666306365782),
 (4789, 0.032826608214930636),
 (4788, 0.0),
 (4787, 0.11146270717723435),
 (4786, 0.07530940414109723),
 (4785, 0.1592324388246205),
 (4784, 0.0),
 (4783, 0.11461365101225056),
 (4782, 0.0),
 (4781, 0.07074999459548673),
 (4780, 0.14412753434733405),
 (4779, 0.05053986957192689),
 (4778, 0.0),
 (4777, 0.08753762190648169),
 (4776, 0.03012376165643889),
 (4775, 0.04571503208824944),
 (4774, 0.05730682550612528),
 (4773, 0.0719194952228076),
 (4772, 0.07744030926623663),
 (4771, 0.031846487764924096),
 (4770, 0.02285751604412472),
 (4769, 0.0),
 (4768, 0.08009

In [24]:
sorted(list(enumerate(similarity[262])),reverse=True, key=lambda x:x[1])

[(262, 1.0),
 (98, 0.46600351662203704),
 (330, 0.42548147169838163),
 (19, 0.35305338554802046),
 (22, 0.3484365738222833),
 (329, 0.29821002598961355),
 (1622, 0.2913428162916919),
 (20, 0.2583768965864695),
 (1993, 0.25231028011870793),
 (790, 0.24802326206836478),
 (3676, 0.240288519438982),
 (292, 0.23973165074269204),
 (1448, 0.239731650742692),
 (3297, 0.23583331531828908),
 (1924, 0.23541180771537917),
 (1813, 0.23211917272131483),
 (2433, 0.23211917272131483),
 (2350, 0.22978625750451448),
 (1444, 0.22978625750451445),
 (3848, 0.2281052820616688),
 (3392, 0.2250967420452386),
 (3000, 0.22395700438234167),
 (4308, 0.22395700438234167),
 (1487, 0.2230128875437697),
 (1913, 0.22287124708010464),
 (592, 0.22223033948419132),
 (2661, 0.22194838080923762),
 (2659, 0.2219483808092376),
 (3313, 0.22145461294330657),
 (591, 0.22037063867676332),
 (3982, 0.2190342712773574),
 (1202, 0.2185568577202031),
 (4148, 0.21774708517784636),
 (193, 0.21681133918446008),
 (4448, 0.216663137374758

In [27]:
def recommend(movie):
    #movie_index = new_df[new_df['title'] == movie].index[0]
    movie_list = df[df['title'].str.contains(movie)]
    if len(movie_list):  
        movie_idx= movie_list.index[0]
        distances = similarity[movie_idx]
        movies_list = sorted(list(enumerate(distances)),reverse=True, key=lambda x:x[1])[1:6]
    
        #
        print('Recommendations for {0} :\n'.format(movie_list.iloc[0]['title']))
        for i in movies_list:
            print(df.iloc[i[0]].title)
    else:
        return "No movies found. Please check your input"

In [28]:
recommend('The Lord of the Rings: The Fellowship of the Ring') 

Recommendations for The Lord of the Rings: The Fellowship of the Ring :

The Hobbit: An Unexpected Journey
The Lord of the Rings: The Two Towers
The Hobbit: The Battle of the Five Armies
The Hobbit: The Desolation of Smaug
The Lord of the Rings: The Return of the King


In [29]:
recommend('The Matrix')

Recommendations for The Matrix Revolutions :

The Matrix Reloaded
The Matrix
Terminator Genisys
Transcendence
Mad Max
