In [53]:
import pandas as pd
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
import csv

https://www.youtube.com/watch?v=d7iIb_XVkZs

In [17]:
movielens_df = pd.read_csv('reviews.training.csv')

In [18]:
movielens_df.head()

Unnamed: 0,reviewerID,asin,overall
0,AMFIPCYDYWGVT,B0090SI56Y,4
1,A3G602Z4DWDZKS,B00005JL99,5
2,A33BOYMVG3U58Y,B00109KN0M,5
3,ANEDXRFDZDL18,B00005JMPT,5
4,A1VN7IS16PY024,B00005AAA9,4


### Create a reader.

In [19]:
reader = Reader(rating_scale=(1,5))

### Create new dataset instance with dataframe and reader.

In [20]:
data = Dataset.load_from_df(movielens_df, reader)

### Set aside part of dataset for training.

In [21]:
trainset, testset = train_test_split(data, test_size=.25)

### Train new SVD with 100 latent features.

In [22]:
model = SVD(n_factors=100)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10db847f0>

In [23]:
model.qi.shape

(49976, 100)

### Map vector back to movie.

In [26]:
item_to_row_idx: dict() = model.trainset._raw2inner_id_items

In [29]:
display(item_to_row_idx)

{'B000GLL1U6': 0,
 '6304923082': 1,
 'B001FB55MG': 2,
 'B0002I831S': 3,
 'B009GUSWDM': 4,
 'B0090SI53C': 5,
 'B005LAIIQC': 6,
 'B00FZM8Z7I': 7,
 'B000AYEL60': 8,
 '079074841X': 9,
 'B005FITIK0': 10,
 'B000DZ95IK': 11,
 '6304366078': 12,
 'B00005JLGJ': 13,
 'B0000WN0ZU': 14,
 'B006KCZBK6': 15,
 'B00E8RK5OC': 16,
 'B000AM6MVO': 17,
 '6303305520': 18,
 'B000E33VWW': 19,
 'B0001ME57Q': 20,
 'B000KGGZZ8': 21,
 '1558803564': 22,
 '0767808460': 23,
 'B000CC1TNI': 24,
 'B0002DB5N6': 25,
 'B000BC8SXS': 26,
 'B004H0M2R8': 27,
 'B003L20IEW': 28,
 'B005GYSUYI': 29,
 'B002VPE1AW': 30,
 'B000068CNX': 31,
 'B00005JKQZ': 32,
 'B00008DDXB': 33,
 'B001TLZ2ZW': 34,
 '6305428115': 35,
 'B001JXPC64': 36,
 '6301967690': 37,
 'B00007JMEA': 38,
 'B007K3JCAE': 39,
 'B000055ZF1': 40,
 'B000GUJYZQ': 41,
 'B00003CXXM': 42,
 'B00BNAE6M4': 43,
 'B0001Z51LC': 44,
 'B000F4PDF8': 45,
 'B00BB8LU5Y': 46,
 'B009AMAKWM': 47,
 '0792837614': 48,
 'B0017VG65E': 49,
 'B000UNYJWW': 50,
 '0792838416': 51,
 'B0053O8A8M': 52,
 '6

### Identifying an example movie and latent features.

In [30]:
B00FZM8Z7I_row_idx: int = item_to_row_idx['B00FZM8Z7I']

In [31]:
model.qi[B00FZM8Z7I_row_idx]

array([ 0.22723818,  0.03547674,  0.08222737, -0.02776564, -0.11109725,
       -0.22744303,  0.09128558, -0.4081204 ,  0.26679917,  0.00638147,
       -0.2590243 ,  0.21716366, -0.05737435, -0.13442479,  0.1695284 ,
        0.05806044, -0.30081283,  0.07022995, -0.0550971 ,  0.33080868,
       -0.03539563,  0.24895535,  0.04214402,  0.04287681, -0.16299343,
       -0.34940498,  0.07830562,  0.00980316,  0.3059433 ,  0.04829765,
       -0.19751752, -0.34173203,  0.2326833 , -0.23383251, -0.06625271,
        0.10461521, -0.18487226,  0.16170697, -0.03964814,  0.01053875,
       -0.19340957, -0.00912545,  0.11963934,  0.05546998,  0.08627149,
        0.03910747, -0.2794061 , -0.06461855, -0.09479248,  0.12874337,
       -0.22814214, -0.1337726 , -0.00667403, -0.00494587, -0.0762251 ,
        0.10079028, -0.07030648,  0.02949861,  0.13653273,  0.27588332,
       -0.04842619, -0.18095592,  0.09683885, -0.02882972,  0.27630064,
        0.04118419, -0.12907916,  0.00640151,  0.08538978,  0.21

### Get a test recommendation for a given user and product.

In [32]:
a_user = 'AMFIPCYDYWGVT'

In [33]:
a_product = 'B0090SI56Y'

In [57]:
prediction = model.predict(a_user, a_product)
prediction[3]

3.681817330816828

In [58]:
with open('reviews.test.unlabeled.csv', 'r') as test_file:
    test_reader = csv.reader(test_file, delimiter=',')
    next(test_reader, None)
    with open('reviews.test.labeled.csv', 'w') as outfile:
        outfile_reader = csv.writer(outfile, delimiter=',')
        outfile_reader.writerow(['datapointID', 'overall'])
        
        for row in test_reader:
            prediction = model.predict(row[1], row[2])
            outfile_reader.writerow([row[0], prediction[3]])

### Make recommendations via cosine similarity.

In [38]:
# Get indices for three movies.
movie_1_idx = model.trainset._raw2inner_id_items['B0090SI56Y']
movie_2_idx = model.trainset._raw2inner_id_items['6304923082']
movie_3_idx = model.trainset._raw2inner_id_items['B001FB55MG']

In [39]:
# Get vectors for three movies.
movie_1_idx = model.qi[movie_1_idx]
movie_2_idx = model.qi[movie_2_idx]
movie_3_idx = model.qi[movie_3_idx]

In [50]:
def get_top_similarities(movie_title: str, model: SVD) -> pd.DataFrame:
    ...

In [51]:
B0090SI56Y_similarities = get_top_similarities('B0090SI56Y', model)
print(B0090SI56Y_similarities)

None
