**STAT 598 Project 4: Movie Recommendation Systems**


By Ian Vetter

Loading in movies and ratings data

In [209]:
import pandas as pd
import numpy as np
import requests

myurl = "https://liangfgithub.github.io/MovieData/movies.dat?raw=true"
response = requests.get(myurl)
movie_lines = response.text.split('\n')
movie_data = [line.split("::") for line in movie_lines if line]
movies = pd.DataFrame(movie_data, columns=['MovieID', 'Title', 'Genres'])
movies['MovieID'] = movies['MovieID'].astype(int)

genres = list(
    sorted(set([genre for genres in movies.Genres.unique() for genre in genres.split("|")]))
)

def get_displayed_movies():
    return movies.head(100)

def get_recommended_movies(new_user_ratings):
    return movies.head(10)

def get_popular_movies(genre: str):
    if genre == genres[1]:
        return movies.head(10)
    else:
        return movies[10:20]

In [210]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [211]:
ratings = pd.read_csv('ratings.dat', sep='::', engine = 'python', header=None)
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
movies = pd.read_csv('movies.dat', sep='::', engine = 'python',
                     encoding="ISO-8859-1", header = None)

movies.columns = ['MovieID', 'Title', 'Genres']

genres = list(
    sorted(set([genre for genres in movies.Genres.unique() for genre in genres.split("|")]))
)

ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [16]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [17]:
print(genres)

['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


**SYSTEM I: Recommendation based on favorite genre**

We will return top movies in the designated genre with highest average rating, but include a penalty term for movies with few reviews, e.g. 4.5 avg rating with 100 reviews holds more weight than a 5.0 avg with 1 reviews

In [20]:
################################

# Helpful movie-2-ID mapping

movies2id = {}

for id in movies['MovieID'].unique():
  movies2id[int(id)] = movies[movies['MovieID'] == id].iloc[0]['Title']



In [184]:
def genreRecommendation(genre, n=10): # n top picks of genre

  movies_in_genre = movies[movies['Genres'].str.contains(str(genre))]

  avg_movie_ratings = {}
  num_movie_ratings = {}

  for id in movies_in_genre['MovieID']:

    avgrating = ratings[ratings['MovieID'] == id]['Rating'].mean()
    num_ratings = len(ratings[ratings['MovieID'] == id]['Rating'])
    avgrating -= 1/(num_ratings*2 + 1)

    if (avgrating > 0):
      avg_movie_ratings[id] = avgrating
      num_movie_ratings[id] = num_ratings

  top_picks = sorted(avg_movie_ratings, key=avg_movie_ratings.get, reverse=True)[:n]

  cols = ['Genre', 'MovieID', 'Title', 'AvgRating', 'NumRatings']
  genre_recs = pd.DataFrame(columns=cols)

  for pick in top_picks:
    row = [genre, pick, movies2id[pick], avg_movie_ratings[pick], num_movie_ratings[pick]]
    genre_recs = pd.concat([genre_recs, pd.DataFrame([row], columns=cols)], ignore_index=True)

  return genre_recs


Example genre recommendations:

In [185]:
genreRecommendation('Action')

Unnamed: 0,Genre,MovieID,Title,AvgRating,NumRatings
0,Action,2905,Sanjuro (1962),4.601501,69
1,Action,2019,Seven Samurai (The Magnificent Seven) (Shichin...,4.559714,628
2,Action,858,"Godfather, The (1972)",4.524741,2223
3,Action,1198,Raiders of the Lost Ark (1981),4.477526,2514
4,Action,260,Star Wars: Episode IV - A New Hope (1977),4.453527,2991
5,Action,1221,"Godfather: Part II, The (1974)",4.35727,1692
6,Action,2028,Saving Private Ryan (1998),4.337166,2653
7,Action,2571,"Matrix, The (1999)",4.315637,2590
8,Action,1197,"Princess Bride, The (1987)",4.303494,2318
9,Action,1233,"Boat, The (Das Boot) (1981)",4.302198,1001


Assembling a table for quick genre recommendations:

In [186]:
cols = ['Genre', 'MovieID', 'Title', 'AvgRating', 'NumRatings']
genre_rec_lookup = pd.DataFrame(columns=cols)

for genre in genres:
  genrerecs = genreRecommendation(genre, n=100)
  genre_rec_lookup = pd.concat([genre_rec_lookup, genrerecs], ignore_index=True)




In [187]:
genre_rec_lookup

Unnamed: 0,Genre,MovieID,Title,AvgRating,NumRatings
0,Action,2905,Sanjuro (1962),4.601501,69
1,Action,2019,Seven Samurai (The Magnificent Seven) (Shichin...,4.559714,628
2,Action,858,"Godfather, The (1972)",4.524741,2223
3,Action,1198,Raiders of the Lost Ark (1981),4.477526,2514
4,Action,260,Star Wars: Episode IV - A New Hope (1977),4.453527,2991
...,...,...,...,...,...
1674,Western,3373,Buck and the Preacher (1972),2.523810,3
1675,Western,416,Bad Girls (1994),2.421380,89
1676,Western,2701,Wild Wild West (1999),2.157983,902
1677,Western,1118,Tashunga (1995),1.666667,1


Example lookup for top Western movies:

In [188]:
genre_rec_lookup[genre_rec_lookup['Genre'] == 'Western']

Unnamed: 0,Genre,MovieID,Title,AvgRating,NumRatings
1612,Western,3607,One Little Indian (1973),4.666667,1
1613,Western,3030,Yojimbo (1961),4.402331,215
1614,Western,1304,Butch Cassidy and the Sundance Kid (1969),4.215293,1419
1615,Western,1283,High Noon (1952),4.177421,403
1616,Western,1201,"Good, The Bad and The Ugly, The (1966)",4.133212,822
...,...,...,...,...,...
1674,Western,3373,Buck and the Preacher (1972),2.523810,3
1675,Western,416,Bad Girls (1994),2.421380,89
1676,Western,2701,Wild Wild West (1999),2.157983,902
1677,Western,1118,Tashunga (1995),1.666667,1


In [189]:
genre_rec_lookup.to_csv('genre_rec_lookup.csv')

**SYSTEM II: IBCF Recommendation**

In [191]:
R = pd.read_csv('/content/I-w9Wo-HSzmUGNNHw0pCzg_bc290b0e6b3a45c19f62b1b82b1699f1_Rmat.csv')

In [192]:
R_c = R.sub(R.mean(axis=1, skipna=True), axis=0)

In [193]:
Rbool = np.array(R_c.notna())
R_cn = np.array(R_c)

n=3706
S_np=np.zeros((n,n))

Generating the similarity matrix S_np based on the conditional cosine similarity:

In [195]:
for i in range(n):
  for j in range(n):
    cands=Rbool[:,i] & Rbool[:,j]
    if (np.sum(cands) > 2):
      m1=R_cn[cands,i]
      m2=R_cn[cands,j]
      numerator = np.sum(m1 * m2)
      denominator_i = np.sqrt(np.sum(np.square(m1)))
      denominator_j = np.sqrt(np.sum(np.square(m2)))
      S_np[i,j]=(1 + numerator / ((denominator_i * denominator_j))) / 2
    else:
      S_np[i,j]=np.nan

In [196]:
names = R.columns
S_fin = pd.DataFrame(S_np, index=names, columns=names)

In [197]:
S_fin

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
m1,1.000000,0.512106,0.392000,0.729637,0.405249,0.344362,0.193479,0.292097,0.275762,0.434214,...,0.525635,0.167886,0.438244,0.204408,0.551756,0.683422,0.290653,0.514043,0.383772,0.414505
m10,0.512106,1.000000,0.547458,0.490472,,0.610983,0.423742,0.460659,0.657699,0.549540,...,0.261701,0.465863,0.448079,0.385735,,0.454464,0.547504,0.668733,0.448290,0.600812
m100,0.392000,0.547458,1.000000,0.482965,,0.836584,0.629538,0.568282,0.811807,0.488525,...,0.410753,0.642616,0.493640,0.193671,0.802844,0.306743,0.629374,0.269576,0.478923,0.612815
m1000,0.729637,0.490472,0.482965,1.000000,,0.180765,,,,0.705223,...,,,0.207393,0.901521,,0.226027,0.668436,,0.725336,0.680574
m1002,0.405249,,,,1.000000,,,,,,...,,,,,,0.722766,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m994,0.683422,0.454464,0.306743,0.226027,0.722766,0.251738,0.227186,0.140286,0.249062,0.274397,...,0.401180,0.148686,0.470518,0.192858,0.539714,1.000000,0.215561,0.449014,0.307824,0.398517
m996,0.290653,0.547504,0.629374,0.668436,,0.790889,0.711965,0.691134,0.806075,0.621695,...,0.618137,0.779649,0.478071,0.797518,,0.215561,1.000000,0.077113,0.556378,0.622558
m997,0.514043,0.668733,0.269576,,,0.366023,0.932724,0.949228,0.214426,0.210009,...,0.215711,0.866121,0.416222,,0.412018,0.449014,0.077113,1.000000,0.642635,0.460646
m998,0.383772,0.448290,0.478923,0.725336,,0.445008,0.843772,0.604815,0.354571,0.504146,...,,0.698391,0.662904,0.852328,,0.307824,0.556378,0.642635,1.000000,0.642727


Example pairwise similarities:

In [198]:
pairwise_labels = ['m1', 'm10', 'm100', 'm1510', 'm260', 'm3212']
pairwise_sims = S_fin.loc[pairwise_labels, pairwise_labels]
pairwise_sims

Unnamed: 0,m1,m10,m100,m1510,m260,m3212
m1,1.0,0.512106,0.392,,0.741148,
m10,0.512106,1.0,0.547458,,0.534334,
m100,0.392,0.547458,1.0,,0.329694,
m1510,,,,,,
m260,0.741148,0.534334,0.329694,,1.0,
m3212,,,,,,


Top 30 similarity measures, rest NaN:

In [199]:
def keep_top_30(row):
    top_30_values = row.nlargest(31)
    result = pd.Series(np.nan, index=row.index)
    result[top_30_values.index] = top_30_values.values
    return result



S_30 = np.array(S_fin.apply(keep_top_30, axis=1))
S_30 = pd.DataFrame(S_30, index=names, columns=names)
print(S_30)


        m1  m10  m100  m1000  m1002  m1003  m1004  m1005  m1006  m1007  ...  \
m1     1.0  NaN   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
m10    NaN  1.0   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
m100   NaN  NaN   1.0    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
m1000  NaN  NaN   NaN    1.0    NaN    NaN    NaN    NaN    NaN    NaN  ...   
m1002  NaN  NaN   NaN    NaN    1.0    NaN    NaN    NaN    NaN    NaN  ...   
...    ...  ...   ...    ...    ...    ...    ...    ...    ...    ...  ...   
m994   NaN  NaN   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
m996   NaN  NaN   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
m997   NaN  NaN   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
m998   NaN  NaN   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
m999   NaN  NaN   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   

       m99  m990  m991  m992  m993  m994  m996  m99

In [None]:
S_30.to_csv('S30.csv')

Smaller similarity matrix for easier Dash/render deployment

In [227]:
S_10 = S_30.iloc[:1000, :1000]
S_10.to_csv('S10.csv')

In [228]:
S_10

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m1998,m1999,m2,m20,m200,m2000,m2001,m2002,m2003,m2004
m1,1.0,,,,,,,,,,...,,,,,,,,,,
m10,,1.0,,,,,,,,,...,,,,,,,,,,
m100,,,1.0,,,,,,,,...,,,,,,,,,,
m1000,,,,1.0,,,,,,,...,,,,,,,,,,
m1002,,,,,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m2000,,,,,,,,,,,...,,,,,,1.0,0.7714,,,
m2001,,,,,,,,,,,...,,,,,,,1.0000,,,
m2002,,,,,,,,,,,...,,,,,,,,1.0,,
m2003,,,,,,,,,,,...,,,,,,,,,1.0,


myIBCF recommendation function:

In [218]:
def myIBCF(newuser):

  url3="https://drive.google.com/uc?export=download&id=1NvQOFbcRHcz3GikOFHefUE56xQvhMzKn"
  S30 = pd.read_csv(url3,index_col=0)
  names=S30.columns

  n=len(names)

  # Code Block for handling passed dictionary of ratings in Dash, as opposed to full 3706, vector of ratings

  #newuser_ = np.zeros((n,))
  #names_=[]
  #for na in names:
  #   names_.append(int(na[1:]))
  #names_vec = pd.DataFrame(data=newuser_,index=names_)
  #for k,v in newuser.items():
  #   names_vec.loc[k] = v

  #newuser = np.array(names_vec)

  newuser = np.nan_to_num(newuser, nan=0.0)


  movie_preds = np.zeros((n,))

  for l in range(n):

    rating = newuser[l]

    if (rating > 0):
      movie_preds[l] = 0
      continue

    else:
      non_nan_indxs = np.where(S30.iloc[l] > 0)[0]  #~S30.iloc[l].is_na()

      if (non_nan_indxs.shape[0] < 1):
         movie_preds[l] = 0
         continue

      denom=0
      num=0

      for i in range(non_nan_indxs.shape[0]):

        idx = non_nan_indxs[i]
        num += newuser[idx] * S30.iloc[l][idx]

        if (newuser[idx] > 0):
          denom += S30.iloc[l][idx]

      if (denom == 0):
        movie_preds[l] = 0
        continue

      movie_preds[l] = num / denom

  top_preds = np.argsort(movie_preds)[-10:][::-1]
  print("Scores:", movie_preds[top_preds])
  top_movies = names[top_preds]
  top_movies_ = [int(t[1:]) for t in top_movies]
  moviz = movies[movies['MovieID'].isin(top_movies_)]
  print("Movie picks:")

  return moviz




IBCF Testing:

In [213]:
u1181 = np.array(R.loc['u1181'])
u1351 = np.array(R.loc['u1351'])

In [214]:
uhypothetical = R.loc['u1181'].copy()
uhypothetical[:]=0
uhypothetical['m1613']=5
uhypothetical['m1755']=4
uhypothetical=np.array(uhypothetical)

In [215]:
print(myIBCF(u1181))

Scores: [5.         4.52655916 4.52606601 4.         4.         4.
 4.         4.         4.         4.        ]
Movie picks:
      MovieID                                  Title             Genres
246       249                Immortal Beloved (1994)      Drama|Romance
500       504                       No Escape (1994)      Action|Sci-Fi
739       749        Man from Down Under, The (1943)              Drama
1233     1253  Day the Earth Stood Still, The (1951)       Drama|Sci-Fi
1845     1914                   Smoke Signals (1998)       Comedy|Drama
2013     2082               Mighty Ducks, The (1992)  Children's|Comedy
2724     2793  American Werewolf in Paris, An (1997)      Comedy|Horror
3663     3732                       Fury, The (1978)             Horror
3720     3789                 Pawnbroker, The (1965)              Drama
3829     3899                          Circus (2000)             Comedy


In [216]:
print(myIBCF(u1351))

Scores: [5. 5. 5. 5. 5. 5. 5. 5. 5. 5.]
Movie picks:
      MovieID                             Title         Genres
640       645    Nelly & Monsieur Arnaud (1995)          Drama
842       853                      Dingo (1992)          Drama
1479     1514  Temptress Moon (Feng Yue) (1996)        Romance
1802     1871  Friend of the Deceased, A (1997)   Comedy|Drama
1808     1877                 Little Men (1998)          Drama
1832     1901                 Dear Jesse (1997)    Documentary
1992     2061           Full Tilt Boogie (1997)    Documentary
2058     2127     First Love, Last Rites (1997)  Drama|Romance
2789     2858            American Beauty (1999)   Comedy|Drama
3163     3232              Seven Chances (1925)         Comedy


In [217]:
print(myIBCF(uhypothetical))

Scores: [5. 5. 5. 5. 5. 5. 5. 5. 5. 5.]
Movie picks:
      MovieID                                    Title  \
73         74                      Bed of Roses (1996)   
682       691                 Mrs. Winterbourne (1996)   
755       765                              Jack (1996)   
1004     1017             Swiss Family Robinson (1960)   
2649     2718                Drop Dead Gorgeous (1999)   
2736     2805                  Mickey Blue Eyes (1999)   
2777     2846  Adventures of Milo and Otis, The (1986)   
3185     3254                   Wayne's World 2 (1993)   
3200     3269                     Forever Young (1992)   
3751     3821    Nutty Professor II: The Klumps (2000)   

                        Genres  
73               Drama|Romance  
682             Comedy|Romance  
755               Comedy|Drama  
1004      Adventure|Children's  
2649                    Comedy  
2736            Comedy|Romance  
2777                Children's  
3185                    Comedy  
3200  Adven

In [None]:
%%shell
jupyter nbconvert --to html /PATH/TO/YOUR/NOTEBOOKFILE.ipynb