In [1]:
import pandas as pd
from surprise import SVDpp
from surprise import SVD
from surprise import Dataset
from surprise import accuracy, Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
import random
import numpy as np
random.seed(256)
np.random.seed(256)

### Step 1: Clean the dataset

In [None]:
import re
def clean_data():

    with open("/Users/akshay/Downloads/australian_users_items 2.json", "r") as raw_input:
        with open("/Users/akshay/Downloads/australian_users_items_cleaned.json", "w") as cleaned_file:
            try:
                for line in raw_input:
                    withoutSingleQuotes = line.replace('\'', '\"')
                    main_item_name_indexes = [m.start() for m in re.finditer('item_name', withoutSingleQuotes)]
                    for main_item_name_index in main_item_name_indexes:
                        main_item_name_index = main_item_name_index + 13
                        current_play_index = withoutSingleQuotes[main_item_name_index:].find("playtime_forever")
                        temp = withoutSingleQuotes[main_item_name_index: main_item_name_index + current_play_index - 4 ]
                        repeatingDoubleQuotesIndexes = [m.start() for m in re.finditer("\"", temp)]
                        for i in repeatingDoubleQuotesIndexes:
                            toReplaceIndex = main_item_name_index + i
                            withoutSingleQuotes = withoutSingleQuotes[:toReplaceIndex] + "'" + withoutSingleQuotes[toReplaceIndex + 1:]
                    cleaned_file.write(withoutSingleQuotes)
            except:
                pass
if __name__ == "__main__":
    clean_data()

### Step 2: Prepare the dataset

In [None]:
import json
import sys
import random

def create_dataset():
    resultRows = []
    with open("/Users/akshay/Downloads/australian_users_items_cleaned.json", "r") as cleaned_file:
        for line in cleaned_file:
            try:
                json_obj = json.loads(line)
                user_id = str(json_obj['user_id'])
                items = json_obj['items']
                max_playtime = 0
                min_playtime = sys.maxsize
                for item in items:
                    playtime = item['playtime_forever']
                    if playtime > 0:
                        if playtime > max_playtime:
                            max_playtime = playtime
                        if playtime < min_playtime:
                            min_playtime = playtime
                for item in items:
                    item_id = str(item['item_id'])
                    playtime = item['playtime_forever']
                    if playtime > 0:
                        #scaled_playtime = round((playtime - min_playtime)/(max_playtime - min_playtime), 10)
                        scaled_playtime = (1-(playtime-min_playtime)/(max_playtime-min_playtime)) + 100*((playtime-min_playtime)/(max_playtime-min_playtime))
                        resultRows.append([user_id,item_id,scaled_playtime])
            except:
                pass
    random.shuffle(resultRows)
    with open("/Users/akshay/Downloads/experiment_1_dataset.csv", "w") as experiment_dataset:
        experiment_dataset.write("user_id,item_id,rating\n")
        for entry in resultRows:
            experiment_dataset.write(entry[0] + "," + entry[1] + "," + str(entry[2]) + "\n")
if __name__ == "__main__":
    create_dataset()

In [4]:
users = pd.read_csv('/Users/akshay/Downloads/experiment_1_dataset_train.csv')
reader = Reader(rating_scale=(1, 100))
data = Dataset.load_from_df(users[['user_id', 'item_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.20)

In [5]:
param_grid = {
    "n_factors": [0,1,2],
    "n_epochs": [10, 50, 60],
    "lr_all": [0.0005, 0.0007],
    "reg_all": [0.01, 0.09]
}

gs = GridSearchCV(SVD, param_grid, n_jobs=-1, measures=["rmse"], cv=3, joblib_verbose=10)
gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   26.2s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   52.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed: 11.7min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done  96 out of 108 | elapsed: 16.4min remaining:  2.0min


15.121329780506976
{'n_factors': 0, 'n_epochs': 60, 'lr_all': 0.0005, 'reg_all': 0.09}


[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 19.3min finished


In [6]:
svd2 = SVD(n_factors=0, n_epochs=60, lr_all=0.0005, reg_all=0.09, verbose=True)
svd2.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49
Processing

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10f028730>

In [7]:
predictions = svd2.test(testset)
print(accuracy.rmse(predictions))

RMSE: 15.1563
15.156332979086843


In [17]:
user_id = "76561198107409283"
item_id = 730
pred = svd2.predict(user_id, item_id)
print(pred.est)

58.55933242841777


In [31]:
import csv
resultRows = []
with open('/Users/akshay/Downloads/experiment_1_dataset_test_with_class.csv') as csvfile:
    fail = 0
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        try:
            user_id = row[0]
            item_id = int(row[1])
            actual_class = row[2]
            if actual_class == "False":
                actual_class = False
            else:
                actual_class = True
            pred = svd2.predict(user_id, item_id)
            predicted_rating = float(pred.est)
            if predicted_rating > 50.0:
                predicted_class = True
            else:
                predicted_class = False
            
            if predicted_class == True and actual_class == True:
                #print(user_id, item_id, predicted_rating, predicted_class, actual_class)
                true_positive = true_positive + 1
            elif predicted_class == False and actual_class == False:
                true_negative = true_negative + 1
            elif predicted_class == True and actual_class == False:
                false_positive = false_positive + 1
            elif predicted_class == False and actual_class == True:
                false_negative = false_negative + 1
        except Exception as e:
            pass
       
print("True Positive: ", true_positive)
print("True Negative: ", true_negative)
print("False Positive: ", false_positive)
print("False Negative: ", false_negative)

True Positive:  4280
True Negative:  620834
False Positive:  4321
False Negative:  20985
