In [576]:
import numpy as np
import pandas as pd
from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import Reader
from surprise import NormalPredictor, KNNBaseline, KNNBasic, KNNWithZScore, BaselineOnly, CoClustering
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate, KFold

In [2]:
# 5-fold splitter with a fixed seed; it is passed to cross_validate in the
# model-comparison loop so every algorithm is evaluated with the same settings.
kf = KFold(n_splits=5, random_state=0)

In [8]:
# Algorithm classes to compare and their display labels. The two tuples are
# parallel (the comparison loop pairs them by position), so they must stay the
# same length and in the same order.
recommenders = (SVD, NormalPredictor, KNNBaseline, KNNBasic, KNNWithZScore, BaselineOnly, CoClustering)
recom_titles = ('SVD', 'Random', 'KNN Baselin', 'KNN Basic', 'KNN z Score', 'Baseline', 'Co-Cluster')

In [4]:
# Load the training ratings (path is relative to the notebook's working directory).
rating_df = pd.read_csv('../pda2019/train-PDA2019.csv')
# Preview the first rows.
rating_df.head()

Unnamed: 0,userID,itemID,rating,timeStamp
0,5,648,5,978297876
1,5,1394,5,978298237
2,5,3534,5,978297149
3,5,104,4,978298558
4,5,2735,5,978297919


In [6]:
# Ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1,5))
# Hand Surprise only the user, item and rating columns (in that order).
data = Dataset.load_from_df(rating_df[['userID','itemID','rating']], reader)

In [9]:
# Cross-validate every recommender with the shared splitter `kf` and collect
# one row per algorithm: [label, mean RMSE, mean MAE, mean FCP].
table = []
for title, rec in zip(recom_titles, recommenders):
    print(title, "started")
    out = cross_validate(rec(), data, measures=['rmse', 'mae', 'fcp'], cv=kf)
    # Average each test metric over the folds and format it to three decimals.
    fold_means = ['{:.3f}'.format(np.mean(out['test_' + metric]))
                  for metric in ('rmse', 'mae', 'fcp')]
    table.append([title] + fold_means)

SVD started
Random started
KNN Baselin started
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
KNN Basic started
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
KNN z Score started
Computing the msd similarity m

In [11]:
from tabulate import tabulate
# Column headers, in the same order as the entries of each row in `table`.
header = ['Recommender', 'Pred accuracy (RMSE)', 'Pred accuracy (MAE)', 'Pred accuracy (FCP)']
# Render the comparison as a pipe-delimited table.
print(tabulate(table, header, tablefmt="pipe"))

| Recommender   |   Pred accuracy (RMSE) |   Pred accuracy (MAE) |   Pred accuracy (FCP) |
|:--------------|-----------------------:|----------------------:|----------------------:|
| SVD           |                  0.886 |                 0.695 |                 0.72  |
| Random        |                  1.48  |                 1.184 |                 0.496 |
| KNN Baselin   |                  0.895 |                 0.704 |                 0.713 |
| KNN Basic     |                  0.932 |                 0.731 |                 0.715 |
| KNN z Score   |                  0.929 |                 0.734 |                 0.704 |
| Baseline      |                  0.903 |                 0.713 |                 0.707 |
| Co-Cluster    |                  0.909 |                 0.711 |                 0.713 |


In [12]:
# Coarse grid for SVD: two candidate values each for the epoch count,
# learning rate and regularisation term (8 combinations in total).
param_grid = {'n_epochs':[5,20], 'lr_all':[0.001, 0.05], 'reg_all': [0.01,0.5]}

# Evaluate every combination with cv=4 on all three measures.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae', 'fcp'], cv=4)
gs.fit(data)

# Report the best score and the parameters that achieved it for RMSE and FCP
# (MAE is computed above but not printed).
print('Best RMSE: ',gs.best_score['rmse'])
print('Best params for RMSE', gs.best_params['rmse'])
print('Best FCP: ',gs.best_score['fcp'])
print('Best params for FCP', gs.best_params['fcp'])

Best RMSE:  0.9132852798074944
Best params for RMSE {'n_epochs': 20, 'lr_all': 0.001, 'reg_all': 0.01}
Best FCP:  0.7085417082545122
Best params for FCP {'n_epochs': 20, 'lr_all': 0.001, 'reg_all': 0.5}


In [14]:
# Load the per-item content table (path is relative to the notebook's working directory).
movies_df = pd.read_csv('../pda2019/content-PDA2019.csv')
# Preview the first rows.
movies_df.head()

Unnamed: 0,itemID,visual_f1,visual_f2,visual_f3,visual_f4,visual_f5,visual_f6,visual_f7,title,genres,year,tag
0,89,0.558776,0.754717,0.762726,0.802614,0.789731,0.613746,0.749311,Nick of Time (1995),Action|Thriller,1995,"['Johnny Depp', 'Christopher Walken', 'Johnny ..."
1,93,0.581448,0.637809,0.616174,0.57305,0.609788,0.550628,0.727813,Vampire in Brooklyn (1995),Comedy|Horror|Romance,1995,"['Wes Craven', 'Angela Bassett', 'vampires', '..."
2,94,0.734501,0.632233,0.617406,0.548511,0.58014,0.516373,0.558006,Beautiful Girls (1996),Comedy|Drama|Romance,1996,"['beautiful', 'friendship relations', 'Matt Di..."
3,95,0.623188,0.708219,0.695922,0.527565,0.544942,0.569186,0.698792,Broken Arrow (1996),Action|Adventure|Thriller,1996,"['action packed', 'action packed', 'foqam', 'J..."
4,97,0.679606,0.637854,0.612771,0.778747,0.760236,0.46179,0.540122,"Hate (Haine, La) (1995)",Crime|Drama,1995,"['class conflict', 'angry', 'black and white',..."


In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# Encode each movie's genre string as a token-count vector and store that
# vector on the row as `genre_matrix`.
# NOTE(review): CountVectorizer's default token pattern only keeps runs of
# word characters, so a hyphenated genre would be split into two features -
# confirm whether the data contains such genres.
count = CountVectorizer()
genre_matrix = count.fit_transform(movies_df.loc[:,'genres'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases.
if hasattr(count, 'get_feature_names_out'):
    genre_list = list(count.get_feature_names_out())
else:
    genre_list = count.get_feature_names()
movies_df['genre_matrix'] = list(genre_matrix.toarray())
movies_df.head()

Unnamed: 0,itemID,visual_f1,visual_f2,visual_f3,visual_f4,visual_f5,visual_f6,visual_f7,title,genres,year,tag,genre_matrix
0,89,0.558776,0.754717,0.762726,0.802614,0.789731,0.613746,0.749311,Nick of Time (1995),Action|Thriller,1995,"['Johnny Depp', 'Christopher Walken', 'Johnny ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,93,0.581448,0.637809,0.616174,0.57305,0.609788,0.550628,0.727813,Vampire in Brooklyn (1995),Comedy|Horror|Romance,1995,"['Wes Craven', 'Angela Bassett', 'vampires', '...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
2,94,0.734501,0.632233,0.617406,0.548511,0.58014,0.516373,0.558006,Beautiful Girls (1996),Comedy|Drama|Romance,1996,"['beautiful', 'friendship relations', 'Matt Di...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
3,95,0.623188,0.708219,0.695922,0.527565,0.544942,0.569186,0.698792,Broken Arrow (1996),Action|Adventure|Thriller,1996,"['action packed', 'action packed', 'foqam', 'J...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,97,0.679606,0.637854,0.612771,0.778747,0.760236,0.46179,0.540122,"Hate (Haine, La) (1995)",Crime|Drama,1995,"['class conflict', 'angry', 'black and white',...","[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."


In [716]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from ast import literal_eval
import nltk
# WordNet backs the lemmatizer created below.
nltk.download('wordnet')
# The TF-IDF vectorizer below calls stopwords.words('english'), which needs
# the 'stopwords' corpus; fetch it too so a fresh environment can run the notebook.
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /Users/hadihs/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [717]:
# Shared lemmatizer instance, used by changeGenre_tags to normalise tags.
lemma = nltk.WordNetLemmatizer()

In [736]:
# Word-level, unigram-only TF-IDF vectorizer that drops NLTK's English stop words.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), stop_words=stopwords.words('english'))

In [863]:
def changeGenre_tags(genre, tags):
    """Build one space-separated, lower-cased description string for a movie.

    Parameters
    ----------
    genre : str
        Pipe-delimited genre string (split on '|').
    tags : str
        String holding a Python list literal of tag strings; parsed with
        ``ast.literal_eval``.

    Returns
    -------
    str
        The genres followed by the tags, joined by single spaces. Spaces
        inside a tag are removed so a multi-word tag becomes one token.
    """
    genre_tokens = [g.lower() for g in genre.split('|')]
    # Lower-case BEFORE lemmatizing: the WordNet lookup is case-sensitive, so
    # a capitalised tag would otherwise be returned unchanged.
    tag_tokens = [lemma.lemmatize(t.replace(' ', '').lower()) for t in literal_eval(tags)]
    return " ".join(genre_tokens + tag_tokens)

In [855]:
# Build one text document per movie from its genres and tags.
movies_df['descr'] = movies_df.apply(lambda x: changeGenre_tags(x['genres'],x['tag']), axis=1)
# NOTE(review): the recorded preview below starts with tag tokens rather than
# genre tokens, and this cell's execution count (855) precedes that of the
# changeGenre_tags definition (863) - the saved output may come from an
# earlier version of the function; re-run to confirm.
movies_df.head()

Unnamed: 0,itemID,visual_f1,visual_f2,visual_f3,visual_f4,visual_f5,visual_f6,visual_f7,title,genres,year,tag,genre_matrix,descr
0,89,0.558776,0.754717,0.762726,0.802614,0.789731,0.613746,0.749311,Nick of Time (1995),Action|Thriller,1995,"['Johnny Depp', 'Christopher Walken', 'Johnny ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",johnnydepp christopherwalken johnnydepp johnba...
1,93,0.581448,0.637809,0.616174,0.57305,0.609788,0.550628,0.727813,Vampire in Brooklyn (1995),Comedy|Horror|Romance,1995,"['Wes Craven', 'Angela Bassett', 'vampires', '...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",wescraven angelabassett vampire clv
2,94,0.734501,0.632233,0.617406,0.548511,0.58014,0.516373,0.558006,Beautiful Girls (1996),Comedy|Drama|Romance,1996,"['beautiful', 'friendship relations', 'Matt Di...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",beautiful friendshiprelations mattdillon natal...
3,95,0.623188,0.708219,0.695922,0.527565,0.544942,0.569186,0.698792,Broken Arrow (1996),Action|Adventure|Thriller,1996,"['action packed', 'action packed', 'foqam', 'J...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",actionpacked actionpacked foqam johntravolta u...
4,97,0.679606,0.637854,0.612771,0.778747,0.760236,0.46179,0.540122,"Hate (Haine, La) (1995)",Crime|Drama,1995,"['class conflict', 'angry', 'black and white',...","[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",classconflict angry blackandwhite bleak confro...


In [856]:
# Fit the TF-IDF vocabulary on the descriptions; row i of the matrix corresponds
# to the i-th row of movies_df.
tfidf_matrix = tf.fit_transform(movies_df['descr'])

In [857]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases.
if hasattr(tf, 'get_feature_names_out'):
    tf_feature_names = list(tf.get_feature_names_out())
else:
    tf_feature_names = tf.get_feature_names()
tfidf_matrix

<1832x10802 sparse matrix of type '<class 'numpy.float64'>'
	with 41760 stored elements in Compressed Sparse Row format>

In [104]:
def get_item_profile(item_id):
    """Return the TF-IDF row of `item_id` as a one-row slice of `tfidf_matrix`.

    The row is located by its position in `movies_df` (the order in which
    `tfidf_matrix` was fitted), not by the DataFrame's index labels, so the
    lookup stays correct even if `movies_df` does not carry a default 0..n-1 index.

    Raises
    ------
    IndexError
        If `item_id` does not occur in ``movies_df['itemID']``.
    """
    positions = np.flatnonzero(movies_df['itemID'].values == item_id)
    if positions.size == 0:
        raise IndexError('itemID {!r} not found in movies_df'.format(item_id))
    pos = int(positions[0])
    # Slice (rather than index) so the result keeps two dimensions.
    return tfidf_matrix[pos:pos+1]

In [110]:
from scipy.sparse import vstack

def get_item_profiles(item_ids):
    """Stack the one-row profiles of `item_ids`, in iteration order, with scipy's vstack."""
    return vstack(list(map(get_item_profile, item_ids)))

In [126]:
from sklearn.preprocessing import normalize

def build_user_profile(user_id):
    """Build a user's content profile as the rating-weighted mean of item profiles.

    Every item the user rated in `rating_df` contributes its TF-IDF row scaled
    by the rating; the sum is divided by the total of the ratings and then
    passed through ``normalize``.

    Returns
    -------
    numpy.ndarray
        A 2-D array with a single row (one column per TF-IDF feature).
    """
    user_rating_df = rating_df[rating_df['userID'] == user_id]
    user_item_profiles = get_item_profiles(user_rating_df['itemID'])
    # Column vector so each rating scales the matching profile row.
    user_item_rating = np.array(user_rating_df['rating']).reshape(-1,1)

    weighted_sum = user_item_profiles.multiply(user_item_rating).sum(axis=0)
    # Summing a sparse matrix yields np.matrix, which recent scikit-learn
    # releases refuse; convert to a plain 2-D ndarray before normalising.
    weighted_sum = np.asarray(weighted_sum).reshape(1, -1)
    user_item_rating_weighted_avg = weighted_sum / np.sum(user_item_rating)
    return normalize(user_item_rating_weighted_avg)

In [128]:
def build_user_profiles():
    """Map every distinct userID in `rating_df` to its profile from build_user_profile."""
    return {uid: build_user_profile(uid) for uid in np.unique(rating_df['userID'])}

In [858]:
# Despite the name this is a dict keyed by userID (see build_user_profiles).
user_profile_list = build_user_profiles()

In [135]:
from sklearn.metrics.pairwise import cosine_similarity

(1832, 14)

In [834]:
def cbf_recommend_items(user_id, top_n = 10, item_to_ignore = []):
    """Rank movies by cosine similarity between the user's profile and each item.

    Parameters
    ----------
    user_id : key of `user_profile_list` whose profile is compared.
    top_n : number of rows to return; ``None`` returns the full ranking.
    item_to_ignore : itemIDs to leave out of the result.

    Returns
    -------
    pandas.DataFrame with columns ``itemID`` and ``similarity``, sorted by
    descending similarity.
    """
    scores = cosine_similarity(user_profile_list[user_id], tfidf_matrix).flatten()
    ranked = pd.DataFrame({'itemID': movies_df['itemID'].tolist(), 'similarity': scores})
    ranked = ranked.sort_values(by=['similarity'], ascending=False)
    # Drop excluded items after sorting; the relative order of the rest is kept.
    ranked = ranked[~ranked['itemID'].isin(item_to_ignore)]

    result = ranked[['itemID','similarity']]
    return result if top_n is None else result.head(top_n)

In [196]:
# Final collaborative-filtering model: SVD with the lr_all/reg_all pair the grid
# search reported as best for RMSE (n_epochs is not passed here).
# NOTE(review): the comparison table reports RMSE 0.886 for SVD with default
# parameters versus 0.913 for this grid-search optimum (different fold counts) -
# confirm these hyperparameters are actually an improvement.
recommender = SVD(lr_all=0.001, reg_all=0.01)
# Train on every available rating (no hold-out).
trainset = data.build_full_trainset()
recommender.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1c4251f320>

In [197]:
# Predict a rating for every entry of the anti-testset; cf_recommend_items
# later filters this prediction list per user.
# NOTE(review): presumably this covers all user/item pairs without a training
# rating and may be large in memory - confirm it is acceptable for this data.
testset = trainset.build_anti_testset()
svd_pred = recommender.test(testset)

In [833]:
def cf_recommend_items(user_id, top_n=10, items_to_ignore=[]):
    """Rank items for a user by the SVD rating estimates stored in `svd_pred`.

    Parameters
    ----------
    user_id : user whose predictions are selected from `svd_pred`.
    top_n : number of rows to return; ``None`` returns the full ranking.
    items_to_ignore : itemIDs to leave out of the result (never mutated).

    Returns
    -------
    pandas.DataFrame with columns ``userID``, ``itemID`` and ``rating`` (the
    estimated rating), sorted by descending rating.
    """
    rows = [(p.uid, p.iid, p.est) for p in svd_pred if p.uid == user_id]
    # Build the frame in one go; filling it row by row with .loc is quadratic.
    pred_items = pd.DataFrame(rows, columns=['userID', 'itemID', 'rating'])

    pred_items = pred_items[~pred_items['itemID'].isin(items_to_ignore)]
    pred_items = pred_items.sort_values(by=['rating'], ascending=False)
    # Built-in int instead of the np.int alias, which NumPy 1.24 removed.
    pred_items = pred_items.astype({'userID': int, 'itemID': int})
    if top_n is None:
        return pred_items
    return pred_items.head(top_n)

In [822]:
from sklearn.preprocessing import MinMaxScaler

In [860]:
def hybrid_recommend(user_id, top_n = 10, items_to_ignore=[], cf_weight=0.7, cbf_weight=0.3):
    """Blend collaborative-filtering and content-based scores into one ranking.

    Both score columns are min-max scaled over the user's candidate items and
    combined as ``cf_weight * rating + cbf_weight * similarity``.

    Parameters
    ----------
    user_id : user to recommend for.
    top_n : number of rows to return.
    items_to_ignore : itemIDs to exclude. When empty, the items the user
        already rated in `rating_df` are excluded instead. Never mutated.
    cf_weight, cbf_weight : weights of the normalised CF rating and the
        normalised content similarity (defaults keep the original 0.7 / 0.3).

    Returns
    -------
    pandas.DataFrame with columns ``userID``, ``itemID``, ``rating``,
    ``similarity``, ``normalized_rating``, ``normalized_similarity`` and
    ``weighted``, sorted by descending ``weighted``. Empty (same columns)
    when the two recommenders share no candidate item.
    """
    if len(items_to_ignore) == 0:
        items_to_ignore = rating_df[rating_df['userID'] == user_id]['itemID'].tolist()

    cf_recom = cf_recommend_items(user_id, None, items_to_ignore)
    cbf_recom = cbf_recommend_items(user_id, None, items_to_ignore)

    # Only items scored by both recommenders can be blended. Both inputs are
    # already filtered by items_to_ignore, so no second filter is needed.
    hybrid_df = cf_recom.merge(cbf_recom, how='inner', on='itemID')

    if hybrid_df.empty:
        # MinMaxScaler raises on zero rows; return an empty, fully-shaped frame.
        no_scores = pd.Series(dtype=float)
        return hybrid_df.assign(normalized_rating=no_scores,
                                normalized_similarity=no_scores,
                                weighted=no_scores)

    hybrid_scaled = MinMaxScaler().fit_transform(hybrid_df[['rating','similarity']])
    # Assign by position so the result does not depend on index alignment.
    hybrid_df['normalized_rating'] = hybrid_scaled[:, 0]
    hybrid_df['normalized_similarity'] = hybrid_scaled[:, 1]
    hybrid_df['weighted'] = (cf_weight*hybrid_df['normalized_rating']
                             + cbf_weight*hybrid_df['normalized_similarity'])

    hybrid_df = hybrid_df.sort_values(by=['weighted'], ascending=False)
    return hybrid_df.head(top_n)

In [842]:
# Spot-check: default top-10 hybrid recommendations for user 1.
hybrid_recommend(1)

Unnamed: 0,userID,itemID,rating,similarity,normalized_rating,normalized_similarity,weighted
65,1,1197,4.357986,0.31662,0.889518,0.965861,0.904787
0,1,922,4.687611,0.162035,1.0,0.494295,0.898859
35,1,912,4.434823,0.244539,0.915272,0.745976,0.881412
7,1,750,4.571167,0.159851,0.960971,0.487631,0.866303
6,1,1207,4.5951,0.146338,0.968993,0.446412,0.864477
44,1,3683,4.404503,0.222138,0.905109,0.677641,0.859615
169,1,1394,4.18532,0.285322,0.831644,0.870387,0.839393
10,1,858,4.534956,0.122011,0.948834,0.372198,0.833507
1,1,318,4.683438,0.053519,0.998601,0.163263,0.831534
3,1,1212,4.619059,0.077648,0.977023,0.236867,0.828992


In [187]:
def print_item_id(top_10_df):
    """Return the frame's ``itemID`` values as one string.

    Each id is preceded by a single space, so the result always starts with
    a space (and is just " " for an empty frame).
    """
    id_strings = map(str, top_10_df['itemID'].values)
    return " " + " ".join(id_strings)

In [864]:
def predict_test(test_filepath, saved_filepath = '', method=0):
    """Write top-10 recommendations for every user listed in a test CSV.

    Parameters
    ----------
    test_filepath : path of a CSV with a ``userID`` column.
    saved_filepath : output path prefix; the file written is
        ``saved_filepath + "_" + <method title> + ".csv"``.
    method : index into the list of recommenders defined in this function
        (only 0, the hybrid recommender, exists).

    The recommendations are stored in a ``recommended_itemIDs`` column as a
    space-separated string. A user with fewer than 10 recommendations is
    reported and left without a value; processing continues with the next
    user instead of aborting all remaining users.
    """
    method_titles = ['HYBRID']
    methods = [hybrid_recommend]

    test = pd.read_csv(test_filepath)
    user_ids = test['userID']

    # row_label addresses the row being filled; i only drives the progress output.
    for i, (row_label, user_id) in enumerate(user_ids.items()):
        if i % 20 == 0:
            print('page: ', i // 20, ' of ', len(user_ids)//20)

        # Items the user already rated are excluded from the recommendations.
        item_ids = rating_df[rating_df['userID'] == user_id]['itemID']
        recommendations = methods[method](user_id,10,item_ids)
        if recommendations.shape[0] < 10:
            print('fewer than 10 recommendations for user', user_id, '- skipped')
            continue
        test.loc[row_label, 'recommended_itemIDs'] = print_item_id(recommendations)
    test.to_csv(saved_filepath+"_"+method_titles[method]+".csv", index=False)

In [862]:
# Generate the submission with the hybrid recommender (method 0); the output is
# written to "../test/test-weight7-3_HYBRID.csv".
predict_test('../pda2019/test-PDA2019.csv', "../test/test-weight7-3", 0)

page:  0  of  99
page:  1  of  99
page:  2  of  99
page:  3  of  99
page:  4  of  99
page:  5  of  99
page:  6  of  99
page:  7  of  99
page:  8  of  99
page:  9  of  99
page:  10  of  99
page:  11  of  99
page:  12  of  99
page:  13  of  99
page:  14  of  99
page:  15  of  99
page:  16  of  99
page:  17  of  99
page:  18  of  99
page:  19  of  99
page:  20  of  99
page:  21  of  99
page:  22  of  99
page:  23  of  99
page:  24  of  99
page:  25  of  99
page:  26  of  99
page:  27  of  99
page:  28  of  99
page:  29  of  99
page:  30  of  99
page:  31  of  99
page:  32  of  99
page:  33  of  99
page:  34  of  99
page:  35  of  99
page:  36  of  99
page:  37  of  99
page:  38  of  99
page:  39  of  99
page:  40  of  99
page:  41  of  99
page:  42  of  99
page:  43  of  99
page:  44  of  99
page:  45  of  99
page:  46  of  99
page:  47  of  99
page:  48  of  99
page:  49  of  99
page:  50  of  99
page:  51  of  99
page:  52  of  99
page:  53  of  99
page:  54  of  99
page:  55  of  99
pa