In [1]:
import os
import pandas as pd
import numpy as np
from math import sqrt
from collections import Counter
from ast import literal_eval

import sqlite3

In [2]:
from sklearn.metrics.pairwise import cosine_similarity

def cossim_matrix(a, b):
    cossim_values = cosine_similarity(a.values, b.values)
    cossim_df = pd.DataFrame(data=cossim_values, columns = a.index.values, index=a.index)

    return cossim_df

In [59]:
ratings_df = pd.read_csv('../media/ratings.csv')
movies_df = pd.read_csv('../media/result_movie.csv')

In [4]:
new_ratings = ratings_df.copy()

In [5]:
user_ids = sorted(list(set(new_ratings['userId'].values)))
movie_ids = sorted(list(set(new_ratings['movieId'].values)))

In [6]:
sparse_matrix = pd.DataFrame(index=movie_ids, columns=user_ids)
sparse_matrix = new_ratings.pivot(index='movieId',columns='userId',values='rating')

In [7]:
item_sparse_matrix = sparse_matrix.fillna(0)
item_sparse_matrix.shape

(1323, 610)

In [8]:
item_cossim_df = cossim_matrix(item_sparse_matrix, item_sparse_matrix)
item_cossim_df

Unnamed: 0_level_0,1,2,3,5,6,7,9,10,11,14,...,134368,134853,138036,139385,142488,148626,152081,164179,166528,176371
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.410562,0.296917,0.308762,0.376316,0.277491,0.232586,0.395573,0.323976,0.178144,...,0.093093,0.306236,0.164749,0.190320,0.187171,0.195092,0.192923,0.188209,0.174998,0.140250
2,0.410562,1.000000,0.282438,0.287795,0.297009,0.228576,0.044835,0.417693,0.322252,0.099373,...,0.133123,0.214336,0.169038,0.186162,0.140917,0.166895,0.215054,0.194696,0.233863,0.203355
3,0.296917,0.282438,1.000000,0.417802,0.284257,0.402831,0.304840,0.242954,0.249568,0.176544,...,0.023965,0.047096,0.015985,0.065771,0.075475,0.048176,0.039672,0.053996,0.056977,0.078187
5,0.308762,0.287795,0.417802,1.000000,0.298969,0.474002,0.335058,0.218061,0.272182,0.255333,...,0.077755,0.091252,0.052843,0.073468,0.046137,0.102923,0.045206,0.055172,0.061720,0.045416
6,0.376316,0.297009,0.284257,0.298969,1.000000,0.244105,0.214088,0.386414,0.289365,0.287064,...,0.052090,0.153317,0.099389,0.167784,0.126655,0.150291,0.095166,0.213196,0.177203,0.174638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148626,0.195092,0.166895,0.048176,0.102923,0.150291,0.044639,0.016727,0.083328,0.074458,0.000000,...,0.348503,0.431625,0.336723,0.450637,0.543839,1.000000,0.260862,0.471627,0.312948,0.485044
152081,0.192923,0.215054,0.039672,0.045206,0.095166,0.032749,0.000000,0.075879,0.047165,0.000000,...,0.400781,0.601073,0.382704,0.343479,0.291736,0.260862,1.000000,0.365101,0.443488,0.360249
164179,0.188209,0.194696,0.053996,0.055172,0.213196,0.052497,0.016702,0.126859,0.066301,0.000000,...,0.216599,0.344605,0.299578,0.474764,0.427710,0.471627,0.365101,1.000000,0.504255,0.508258
166528,0.174998,0.233863,0.056977,0.061720,0.177203,0.055538,0.016429,0.127093,0.072004,0.000000,...,0.145697,0.374077,0.426355,0.393330,0.307928,0.312948,0.443488,0.504255,1.000000,0.453601


In [10]:
userId_grouped = new_ratings.groupby('userId')
item_prediction_result_df = pd.DataFrame(index=list(userId_grouped.indices.keys()),
                                         columns=item_sparse_matrix.index)
item_prediction_result_df

movieId,1,2,3,5,6,7,9,10,11,14,...,134368,134853,138036,139385,142488,148626,152081,164179,166528,176371
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,,,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [11]:
for userId, group in userId_grouped:
    user_sim = item_cossim_df.loc[group['movieId']]
    user_rating = group['rating']
    sim_sum = user_sim.sum(axis=0).map(lambda x : 1 if x==0 else x)
    pred_ratings = np.matmul(user_sim.T.to_numpy(), user_rating) / (sim_sum)
    item_prediction_result_df.loc[userId] = pred_ratings

In [12]:
item_prediction_result_df

movieId,1,2,3,5,6,7,9,10,11,14,...,134368,134853,138036,139385,142488,148626,152081,164179,166528,176371
1,4.343411,4.318948,4.335208,4.293005,4.340020,4.271416,4.139427,4.303626,4.318285,4.262924,...,4.356490,4.411819,4.404259,4.380452,4.383746,4.391647,4.414105,4.371888,4.392515,4.369639
2,3.976596,3.999494,3.972689,3.998880,3.941156,3.907336,3.923428,3.958293,3.924116,3.564516,...,4.118106,4.007901,4.086450,4.002762,4.038687,4.063200,4.069322,4.018382,4.054624,4.052331
3,1.467675,1.407807,1.553380,1.254897,1.709500,1.273850,1.574655,1.571123,1.235117,0.975641,...,1.380429,1.454222,1.495425,1.521055,1.538234,1.556870,1.476172,1.741167,1.829577,1.514969
4,3.433412,3.412214,3.391481,3.460489,3.416928,3.415055,3.222381,3.344855,3.429022,3.439116,...,3.446253,3.401095,3.406571,3.386933,3.473988,3.491963,3.369666,3.443570,3.379383,3.400270
5,3.626970,3.571471,3.558690,3.569244,3.567086,3.572594,3.471979,3.488956,3.529407,3.571533,...,3.525721,3.662113,3.565946,3.650733,3.671484,3.706497,3.657735,3.652010,3.622451,3.672374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.624248,3.561109,3.529423,3.556385,3.630716,3.600376,3.455706,3.558511,3.613917,3.760268,...,3.536729,3.618839,3.598472,3.608312,3.610133,3.621646,3.590760,3.580269,3.567836,3.589145
607,3.732352,3.717863,3.730316,3.715388,3.772683,3.720095,3.746685,3.729695,3.769887,3.680142,...,3.656055,3.704786,3.654509,3.760306,3.740525,3.737516,3.662066,3.755283,3.749723,3.723745
608,3.222037,3.186715,3.072645,3.040187,3.296021,3.048633,2.915328,3.215543,3.118271,3.123985,...,3.428042,3.405529,3.477620,3.437118,3.351844,3.389049,3.419062,3.390804,3.385552,3.342094
609,3.319207,3.310582,3.290287,3.289242,3.331396,3.289580,3.223021,3.338518,3.312419,3.284783,...,3.407052,3.372934,3.391989,3.382959,3.337419,3.365241,3.354598,3.383243,3.354406,3.335957


In [16]:
a = item_prediction_result_df.loc[1].sort_values(ascending=False)[:6]

In [21]:
a

movieId
96821     4.442533
3435      4.433549
101       4.426744
112552    4.424879
3504      4.423323
2146      4.422930
Name: 1, dtype: object

In [24]:
user_sparse_matrix = sparse_matrix.fillna(0).transpose()
user_sparse_matrix.reset_index(inplace=True)
user_sparse_matrix.set_index('userId',inplace=True)

In [25]:
user_cossim_df = cossim_matrix(user_sparse_matrix, user_sparse_matrix)
user_cossim_df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.037869,0.100261,0.223379,0.163606,0.175822,0.184722,0.144432,0.099703,0.022937,...,0.089545,0.188110,0.353374,0.088316,0.194791,0.244831,0.302467,0.343953,0.123091,0.204099
2,0.037869,1.000000,0.000000,0.005731,0.020680,0.039334,0.036683,0.033332,0.000000,0.107118,...,0.252059,0.022381,0.022257,0.000000,0.000000,0.053098,0.018158,0.065925,0.040379,0.166747
3,0.100261,0.000000,1.000000,0.006076,0.010961,0.010722,0.000000,0.010601,0.000000,0.000000,...,0.011860,0.011388,0.079182,0.000000,0.029323,0.037624,0.047353,0.052881,0.000000,0.071728
4,0.223379,0.005731,0.006076,1.000000,0.160527,0.109622,0.117678,0.067370,0.019533,0.049605,...,0.100738,0.122595,0.415711,0.053860,0.105461,0.297748,0.157972,0.181060,0.035484,0.158953
5,0.163606,0.020680,0.010961,0.160527,1.000000,0.400008,0.131564,0.479141,0.000000,0.044396,...,0.083210,0.461289,0.145935,0.311521,0.212313,0.162884,0.180409,0.172473,0.349434,0.098930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.244831,0.053098,0.037624,0.297748,0.162884,0.160673,0.287317,0.151689,0.119193,0.149795,...,0.262156,0.174857,0.471086,0.101970,0.231771,1.000000,0.224499,0.407753,0.107218,0.366555
607,0.302467,0.018158,0.047353,0.157972,0.180409,0.207452,0.210317,0.214378,0.018566,0.012994,...,0.102961,0.233052,0.290518,0.142287,0.172670,0.224499,1.000000,0.312274,0.224847,0.198808
608,0.343953,0.065925,0.052881,0.181060,0.172473,0.246527,0.368084,0.220108,0.128477,0.105790,...,0.164856,0.233383,0.355384,0.183694,0.235092,0.407753,0.312274,1.000000,0.171446,0.440799
609,0.123091,0.040379,0.000000,0.035484,0.349434,0.345728,0.129822,0.526249,0.000000,0.037151,...,0.051307,0.467669,0.096345,0.328547,0.163953,0.107218,0.224847,0.171446,1.000000,0.093558


In [26]:
movieId_grouped = new_ratings.groupby('movieId')
user_prediction_result_df = pd.DataFrame(index=list(movieId_grouped.indices.keys()), columns=user_sparse_matrix.index)
user_prediction_result_df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148626,,,,,,,,,,,...,,,,,,,,,,
152081,,,,,,,,,,,...,,,,,,,,,,
164179,,,,,,,,,,,...,,,,,,,,,,
166528,,,,,,,,,,,...,,,,,,,,,,


In [27]:
for movieId, group in movieId_grouped:
    user_sim = user_cossim_df.loc[group['userId']]
    user_rating = group['rating']
    sim_sum = user_sim.sum(axis=0).map(lambda x : 1 if x==0 else x)

    pred_ratings = np.matmul(user_sim.T.to_numpy(), user_rating) / sim_sum
    user_prediction_result_df.loc[movieId] = pred_ratings

In [29]:
user_prediction_result_df = user_prediction_result_df.transpose()

In [30]:
user_prediction_result_df.loc[1].sort_values(ascending=False)[:6]

318     4.441352
1104    4.426312
954     4.350363
1204    4.342251
3435    4.340618
475     4.326285
Name: 1, dtype: object

In [31]:
user_prediction_result_df

Unnamed: 0_level_0,1,2,3,5,6,7,9,10,11,14,...,134368,134853,138036,139385,142488,148626,152081,164179,166528,176371
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.901921,3.334011,3.193222,2.922471,3.924899,3.048158,2.844205,3.468803,3.636654,3.745484,...,3.356958,3.904251,3.400093,3.922006,4.172604,3.931175,3.774030,3.856597,4.009528,3.896172
2,3.828239,3.324737,3.063525,2.863332,4.130055,3.052164,2.377468,3.654257,3.704707,3.656155,...,3.604239,3.893725,3.533928,4.013309,4.248927,4.033828,3.911077,3.953735,3.914429,3.967427
3,3.885143,3.335005,3.200749,2.813341,3.852242,3.095697,2.948430,3.465785,3.545310,3.534891,...,3.452940,3.891903,3.565731,3.981294,4.027469,3.804938,3.898560,3.736494,4.110705,3.940361
4,3.869918,3.290680,3.158796,2.829189,3.908600,2.988762,2.641890,3.401809,3.497219,3.776814,...,3.510150,3.873218,3.373344,3.902571,4.133856,3.952834,3.711962,3.841968,3.986779,3.932973
5,3.972586,3.555856,3.382955,3.138725,3.890152,3.379569,2.926794,3.362127,3.801883,3.945112,...,3.592818,3.934533,3.469585,3.977779,4.215588,3.983206,3.809146,3.962758,3.929912,3.640171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.843565,3.311770,3.130336,2.783598,3.947296,3.030592,2.596818,3.442842,3.534902,3.701363,...,3.494572,3.883051,3.395335,3.874148,4.136436,3.960492,3.813598,3.842762,4.002910,3.907411
607,3.946745,3.403096,3.235424,2.972794,3.947695,3.123401,2.869927,3.462220,3.675232,3.780814,...,3.389390,3.915275,3.480078,3.922777,4.195258,3.919769,3.813152,3.858087,4.029893,3.855878
608,3.876340,3.333110,3.133298,2.921633,3.962439,3.080622,2.829083,3.514012,3.655044,3.731129,...,3.475941,3.907021,3.421046,3.899773,4.157780,3.982646,3.843584,3.838312,4.062023,3.919767
609,3.951461,3.586767,3.406254,3.177583,3.937690,3.388286,2.993890,3.468880,3.867718,3.921592,...,3.465998,3.931121,3.529142,3.916066,4.221592,3.906108,3.867753,3.914526,3.894801,3.657729
