In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Ratings file: tab-separated, column names supplied here; only the first
# three columns are read.
r_cols = ["user_id", "movie_id", "rating"]
ratings = pd.read_csv(
    "./data/movie_recommender/u.data",
    sep="\t",
    names=r_cols,
    usecols=[0, 1, 2],
    encoding="ISO-8859-1",
)

# Movie file: pipe-separated; only the first two columns (id and title) are read.
m_cols = ["movie_id", "title"]
movies = pd.read_csv(
    "./data/movie_recommender/u.item",
    sep="|",
    names=m_cols,
    usecols=[0, 1],
    encoding="ISO-8859-1",
)

# Attach a title to every rating. "movie_id" is the only column the two
# frames have in common, so it is the join key.
ratings = movies.merge(ratings, on="movie_id")

In [3]:
# Inspect the merged movies/ratings frame.
ratings

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3
...,...,...,...,...
99998,1678,Mat' i syn (1997),863,1
99999,1679,B. Monkey (1998),863,3
100000,1680,Sliding Doors (1998),863,2
100001,1681,You So Crazy (1994),896,3


In [4]:
# Reshape into a user x movie matrix: one row per user_id, one column per
# title, each cell holding the rating (NaN where the user has no rating for
# that title).
movieRatings = pd.pivot_table(
    ratings,
    values="rating",
    index=["user_id"],
    columns=["title"],
)
movieRatings

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,,,,,,,,,,
940,,,,,,,,,,,...,,,,,,,,,,
941,,,,,,,,,,,...,,,,,,,,,,
942,,,,,,,,3.0,,3.0,...,,,,,,,,,,


In [5]:
# Correlate every movie's rating column with the "Star Wars (1977)" column,
# then discard the movies for which no correlation could be computed (NaN).
similarMovies = movieRatings.corrwith(movieRatings["Star Wars (1977)"]).dropna()
df = similarMovies.to_frame()
df

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
'Til There Was You (1997),0.872872
1-900 (1994),-0.645497
101 Dalmatians (1996),0.211132
12 Angry Men (1957),0.184289
187 (1997),0.027398
...,...
Young Guns (1988),0.186377
Young Guns II (1990),0.228615
"Young Poisoner's Handbook, The (1995)",-0.007374
Zeus and Roxanne (1997),0.818182


In [6]:
# Rank all correlations from highest to lowest. No minimum-number-of-ratings
# filter has been applied at this point (not filtered by size).
similarMovies.sort_values(ascending=False)

title
No Escape (1994)                          1.0
Man of the Year (1995)                    1.0
Hollow Reed (1996)                        1.0
Commandments (1997)                       1.0
Cosi (1996)                               1.0
                                         ... 
Theodore Rex (1995)                      -1.0
I Like It Like That (1994)               -1.0
Two Deaths (1995)                        -1.0
Roseanna's Grave (For Roseanna) (1997)   -1.0
Frankie Starlight (1995)                 -1.0
Length: 1410, dtype: float64

In [7]:
# Per-title summary: how many ratings each movie received and their mean.
# The aggregations are named by string instead of being passed as NumPy
# callables (np.size / np.mean): recent pandas releases emit a FutureWarning
# for the callable form, while the string form produces the same
# ("rating", "size") and ("rating", "mean") columns.
movieStats = ratings.groupby("title").agg({"rating": ["size", "mean"]})
movieStats

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
'Til There Was You (1997),9,2.333333
1-900 (1994),5,2.600000
101 Dalmatians (1996),109,2.908257
12 Angry Men (1957),125,4.344000
187 (1997),41,3.024390
...,...,...
Young Guns II (1990),44,2.772727
"Young Poisoner's Handbook, The (1995)",41,3.341463
Zeus and Roxanne (1997),6,2.166667
unknown,9,3.444444


In [8]:
# Boolean mask indexed by title: True for movies with at least 100 ratings.
popularMovies = movieStats["rating"]["size"].ge(100)

In [9]:
# Popular movies only, ordered by mean rating from highest to lowest.
movieStats.loc[popularMovies].sort_values(by=[("rating", "mean")], ascending=False)

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"Close Shave, A (1995)",112,4.491071
Schindler's List (1993),298,4.466443
"Wrong Trousers, The (1993)",118,4.466102
Casablanca (1942),243,4.456790
"Shawshank Redemption, The (1994)",283,4.445230
...,...,...
Spawn (1997),143,2.615385
Event Horizon (1997),127,2.574803
Crash (1996),128,2.546875
Jungle2Jungle (1997),132,2.439394


In [10]:
# Combine the rating statistics of the popular movies with the values in
# similarMovies, aligned on the title index.
#
# movieStats has two-level columns ("rating" -> "size"/"mean") while the
# similarity data has a single level. Joining frames with different column
# depths flattens the labels into tuples on older pandas and is rejected by
# newer releases, so the "rating" level is selected first and the similarity
# values are joined as a named Series. The result has plain columns:
# "size", "mean" and "similarity".
df = movieStats.loc[popularMovies, "rating"].join(similarMovies.rename("similarity"))



In [11]:
# First 15 rows after ordering by similarity, highest first.
df.sort_values(by="similarity", ascending=False).head(15)

Unnamed: 0_level_0,"(rating, size)","(rating, mean)",similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Star Wars (1977),584,4.359589,1.0
"Empire Strikes Back, The (1980)",368,4.206522,0.748353
Return of the Jedi (1983),507,4.00789,0.672556
Raiders of the Lost Ark (1981),420,4.252381,0.536117
Austin Powers: International Man of Mystery (1997),130,3.246154,0.377433
"Sting, The (1973)",241,4.058091,0.367538
Indiana Jones and the Last Crusade (1989),331,3.930514,0.350107
Pinocchio (1940),101,3.673267,0.347868
"Frighteners, The (1996)",115,3.234783,0.332729
L.A. Confidential (1997),297,4.161616,0.319065


In [12]:
# Pairwise Pearson correlation between all movie columns. A pair only gets a
# value when at least 100 observations are available for it (min_periods);
# otherwise the entry is NaN.
corrMatrix = movieRatings.corr(min_periods=100, method="pearson")
# Display only: hide rows and columns that are entirely NaN. corrMatrix
# itself is left unchanged.
corrMatrix.dropna(axis="index", how="all").dropna(axis="columns", how="all")

title,101 Dalmatians (1996),12 Angry Men (1957),2001: A Space Odyssey (1968),Absolute Power (1997),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),"Adventures of Priscilla, Queen of the Desert, The (1994)","African Queen, The (1951)",Air Force One (1997),Aladdin (1992),...,Welcome to the Dollhouse (1995),What's Eating Gilbert Grape (1993),When Harry Met Sally... (1989),While You Were Sleeping (1995),William Shakespeare's Romeo and Juliet (1996),Willy Wonka and the Chocolate Factory (1971),"Wizard of Oz, The (1939)","Wrong Trousers, The (1993)",Young Frankenstein (1974),Young Guns (1988)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101 Dalmatians (1996),1.0,,,,,,,,,,...,,,,,,,,,,
12 Angry Men (1957),,1.0,,,,,,,,,...,,,,,,,,,,
2001: A Space Odyssey (1968),,,1.000000,,,,,0.152433,,-0.067992,...,,,0.023712,,,0.147293,0.236830,,-0.001307,
Absolute Power (1997),,,,1.0,,,,,,,...,,,,,,,,,,
"Abyss, The (1989)",,,,,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Willy Wonka and the Chocolate Factory (1971),,,0.147293,,,,,,-0.132312,0.159210,...,,,0.096966,,,1.000000,0.294893,,0.291210,
"Wizard of Oz, The (1939)",,,0.236830,,,,,0.508389,,0.300988,...,,,0.239380,,,0.294893,1.000000,,0.244063,
"Wrong Trousers, The (1993)",,,,,,,,,,,...,,,,,,,,1.0,,
Young Frankenstein (1974),,,-0.001307,,,,,,,0.037317,...,,,0.153234,,,0.291210,0.244063,,1.000000,


In [13]:
# Ratings from the first row of the user x movie matrix, keeping only the
# movies that actually have a rating in that row.
# dropna() is called without arguments: a Series has a single axis, and
# pandas 2.x made dropna's parameters keyword-only, so the positional
# dropna(0) raises TypeError there.
firstRatings = movieRatings.iloc[0].dropna()
firstRatings

title
Empire Strikes Back, The (1980)    5.0
Gone with the Wind (1939)          1.0
Star Wars (1977)                   5.0
Name: 0, dtype: float64

In [22]:
# Build the pool of recommendation candidates: for every movie in
# firstRatings, take the movies correlated with it and weight each
# correlation by the rating given to that movie.
scaledSims = []
for title, rating in firstRatings.items():
    print("Adding sims for " + title)
    sims = corrMatrix[title].dropna()
    # Vectorized scaling; iterating with items() also avoids looking up the
    # rating by integer position on a title-labelled Series.
    scaledSims.append(sims * rating)

# Concatenate once rather than growing a Series inside the loop with
# Series.append, which was removed in pandas 2.0. The same title can appear
# several times in the result (once per rated movie it correlates with).
if scaledSims:
    simCandidates = pd.concat(scaledSims).sort_values(ascending=False)
else:
    # No rated movies: keep an empty, numeric Series so later cells still run.
    simCandidates = pd.Series(dtype="float64")
simCandidates

Adding sims for Empire Strikes Back, The (1980)
Adding sims for Gone with the Wind (1939)
Adding sims for Star Wars (1977)


Empire Strikes Back, The (1980)    5.000000
Star Wars (1977)                   5.000000
Empire Strikes Back, The (1980)    3.741763
Star Wars (1977)                   3.741763
Return of the Jedi (1983)          3.606146
                                     ...   
Real Genius (1985)                -0.480527
Annie Hall (1977)                 -0.500600
Remains of the Day, The (1993)    -0.560337
Piano, The (1993)                 -0.751857
First Wives Club, The (1996)      -0.972480
Length: 515, dtype: float64

In [24]:
# Collapse repeated index labels (movie titles) into a single entry by
# summing their scores.
simCandidates = simCandidates.groupby(level=0).sum()

In [26]:
# Order the candidates from highest to lowest score.
simCandidates = simCandidates.sort_values(ascending=False)

In [None]:
# Display the current candidate scores.
simCandidates

In [29]:
# Remove the titles present in firstRatings (the already-rated movies) so
# only other titles remain.
# errors="ignore" prevents a KeyError when one of those titles has no entry
# in simCandidates (for example, all of its correlations were NaN and were
# dropped earlier).
filteredSims = simCandidates.drop(firstRatings.index, errors="ignore")
filteredSims

Return of the Jedi (1983)                    7.178172
Raiders of the Lost Ark (1981)               5.519700
Indiana Jones and the Last Crusade (1989)    3.488028
Bridge on the River Kwai, The (1957)         3.366616
Back to the Future (1985)                    3.357941
                                               ...   
Annie Hall (1977)                           -0.511775
Real Genius (1985)                          -0.552871
Remains of the Day, The (1993)              -0.560337
This Is Spinal Tap (1984)                   -0.636474
First Wives Club, The (1996)                -0.972480
Length: 265, dtype: float64