In [11]:
!pip install surprise



In [12]:
!pip install tqdm



In [73]:
!pip install -upgrade IProgress


Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: -u


In [74]:
!pip install --upgrade IProgress



In [1]:
from surprise import Dataset
from tqdm import tqdm_notebook
from surprise import Reader
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings = ratings[:10000]
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [6]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))
movie_genres = [change_string(g) for g in movies.genres.values]

In [7]:
ratings_and_tags = ratings.merge(tags, how='outer')

In [8]:
ratings_and_tags.head()

Unnamed: 0,userId,movieId,rating,timestamp,tag
0,1,1,4.0,964982703,
1,1,3,4.0,964981247,
2,1,6,4.0,964982224,
3,1,47,5.0,964983815,
4,1,50,5.0,964982931,


In [9]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [10]:
genres_dict = {}
for ind in tqdm_notebook(range(0,len(movies['movieId']))):
    genres_dict[movies['movieId'][ind]] = tfidf_transformer.transform(count_vect.transform([change_string(movies.genres[ind])]))
genres_dict

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for ind in tqdm_notebook(range(0,len(movies['movieId']))):


  0%|          | 0/9742 [00:00<?, ?it/s]

{1: <1x20 sparse matrix of type '<class 'numpy.float64'>'
 	with 5 stored elements in Compressed Sparse Row format>,
 2: <1x20 sparse matrix of type '<class 'numpy.float64'>'
 	with 3 stored elements in Compressed Sparse Row format>,
 3: <1x20 sparse matrix of type '<class 'numpy.float64'>'
 	with 2 stored elements in Compressed Sparse Row format>,
 4: <1x20 sparse matrix of type '<class 'numpy.float64'>'
 	with 3 stored elements in Compressed Sparse Row format>,
 5: <1x20 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 6: <1x20 sparse matrix of type '<class 'numpy.float64'>'
 	with 3 stored elements in Compressed Sparse Row format>,
 7: <1x20 sparse matrix of type '<class 'numpy.float64'>'
 	with 2 stored elements in Compressed Sparse Row format>,
 8: <1x20 sparse matrix of type '<class 'numpy.float64'>'
 	with 2 stored elements in Compressed Sparse Row format>,
 9: <1x20 sparse matrix of type '<class 'numpy.float64'>'
 	with

In [11]:
tags_doc = {}
for uniq in tags.movieId.unique():
    tags_doc[uniq] = ''.join(' '.join(tags[tags['movieId'] == uniq]['tag']).lower())

In [12]:
tags_doc

{60756: 'funny highly quotable will ferrell comedy funny will ferrell funny will ferrell',
 89774: 'boxing story mma tom hardy',
 106782: 'drugs leonardo dicaprio martin scorsese stock market wall street',
 48516: 'way too long leonardo dicaprio suspense twist ending undercover cop atmospheric jack nicholson leonardo dicaprio martin scorsese suspense',
 431: 'al pacino gangster mafia',
 1221: 'al pacino mafia mafia',
 5995: 'holocaust true story holocaust',
 44665: 'twist ending',
 52604: 'anthony hopkins courtroom drama twist ending',
 88094: 'britpop indie record label music',
 144210: 'dumpster diving sustainability',
 1569: 'romantic comedy wedding weddings',
 118985: 'painter',
 119141: 'bloody bromance comedy funny james franco seth rogen',
 109487: 'black hole sci-fi time-travel christopher nolan sci-fi time-travel bad dialogue philosophical issues thought-provoking visually appealing',
 2: 'fantasy magic board game robin williams game',
 110: 'beautiful scenery epic historical 

In [13]:
tags_count = CountVectorizer()
X_train_tags_vec = tags_count.fit_transform(tags_doc.values())
tfidf_tags = TfidfTransformer()
X_tags_train_tfidf = tfidf_tags.fit_transform(X_train_tags_vec)

In [14]:
print(X_tags_train_tfidf.shape, X_train_tfidf.shape)

(1572, 1744) (9742, 20)


In [15]:
ratings['rating'].unique()

array([4. , 5. , 3. , 2. , 1. , 4.5, 3.5, 2.5, 0.5, 1.5])

In [16]:
from sklearn.neighbors import NearestNeighbors

In [17]:
neigh = NearestNeighbors(n_neighbors=5, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)
neigh_tag = NearestNeighbors(n_neighbors=5, n_jobs=-1, metric='euclidean')
neigh_tag.fit(X_tags_train_tfidf)

In [18]:
X_tags_train_tfidf[0]

<1x1744 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [19]:
ratings[(ratings['movieId']==5)&(ratings['userId']==1)]['rating'].isnull

<bound method Series.isnull of Series([], Name: rating, dtype: float64)>

In [20]:
def find_rating(MovieId, UserId):
    mean_mov = np.sum(ratings[ratings['movieId']==MovieId]['rating'])
    user_rating = np.sum(ratings[(ratings['movieId']==MovieId)&(ratings['userId']==UserId)]['rating'])
    if ((user_rating==0)&(mean_mov!=0.)):
        mean_mov = (mean_mov+user_rating)/(len(ratings[ratings['movieId']==MovieId]['rating'])+
                                           len(ratings[(ratings['movieId']==MovieId)&(ratings['userId']==UserId)]['rating']))
    elif(mean_mov!=0.):
        mean_mov = (0.2*mean_mov+0.8*user_rating)/(len(ratings[ratings['movieId']==MovieId]['rating'])+
                                           len(ratings[(ratings['movieId']==MovieId)&(ratings['userId']==UserId)]['rating']))
    if (mean_mov - int(mean_mov)>=0.5):
        return np.round(mean_mov)
    elif ((mean_mov - int(mean_mov)<0.5)&(mean_mov - int(mean_mov)>=0.25)):
        return int(mean_mov)+0.5
    else:
        return int(mean_mov)

In [21]:
data = {'tags_near_1':[],
       'tags_near_2':[],
       'tags_near_3':[],
       'tags_near_4':[],
       'tags_near_5':[],
       'genres_near_1':[],
       'genres_near_2':[],
       'genres_near_3':[],
       'genres_near_4':[],
       'genres_near_5':[],
       'user':[]}
for i in tqdm_notebook(range(0,len(ratings['rating']))):
    for key in (data.keys()):
        if (key[0]=='t'):
            if (tags_doc.get(ratings['movieId'][i])==None):
                data.get(key).append(np.NaN)
            else:
                data.get(key).append(find_rating(ratings['userId'][i], ratings['movieId'][neigh_tag.kneighbors(
                    tfidf_tags.transform(tags_count.transform(
                     [tags_doc.get(ratings['movieId'][i])])))[1][0][int(key[-1])-1]]))
        elif (key[0]=='g'):
            data.get(key).append(find_rating(ratings['userId'][i], ratings['movieId'][neigh.kneighbors(
            genres_dict.get(ratings['movieId'][i]))[1][0][int(key[-1])-1]]))
        else:
            data.get(key).append(ratings['userId'][i])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(range(0,len(ratings['rating']))):


  0%|          | 0/10000 [00:00<?, ?it/s]

In [22]:
data['userId'] = data.pop('user')
near_df = pd.DataFrame(data = data )

In [23]:
near_df.head()

Unnamed: 0,tags_near_1,tags_near_2,tags_near_3,tags_near_4,tags_near_5,genres_near_1,genres_near_2,genres_near_3,genres_near_4,genres_near_5,userId
0,4.0,4.0,4.0,4.0,4.0,1.0,4.0,4.0,4.0,4.0,1
1,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1
2,,,,,,4.0,4.0,4.0,4.0,4.0,1
3,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1
4,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1


In [24]:
del(ratings['timestamp'])

In [25]:
rec_df = ratings.merge(near_df,how = 'outer', on='userId')

In [26]:
rec_df.head()

Unnamed: 0,userId,movieId,rating,tags_near_1,tags_near_2,tags_near_3,tags_near_4,tags_near_5,genres_near_1,genres_near_2,genres_near_3,genres_near_4,genres_near_5
0,1,1,4.0,4.0,4.0,4.0,4.0,4.0,1.0,4.0,4.0,4.0,4.0
1,1,1,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
2,1,1,4.0,,,,,,4.0,4.0,4.0,4.0,4.0
3,1,1,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
4,1,1,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0


In [27]:
rec_df.dropna(inplace=True)

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
Y = rec_df['rating']
X = rec_df
del(X['rating'])
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2)

In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [31]:
reg = RandomForestRegressor()
reg.fit(X_train, Y_train)
mean_squared_error(reg.predict(X_test),Y_test)

0.00017077294272660376

In [32]:
reg.score(X_test,Y_test)

0.9998420634390862