# Generate Game Bundle Recommendations for Users

In [3]:
import pandas as pd
from surprise import SVDpp
from surprise import SVD
from surprise import Dataset
from surprise import accuracy, Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
import random
import numpy as np
# Seed Python's and NumPy's global random number generators with a fixed value
# so that randomized steps in this notebook (e.g. random.shuffle) are repeatable.
random.seed(256)
np.random.seed(256)

### Step 1: Clean the dataset

The raw dataset 'australian_users_items.json' is not in valid JSON format. Following steps are required to clean the raw file:
- For each line:
  - replace the single quotes with double quotes
  - After the first step, the field "item_name" has double quotes which should be single quote. So to tackle this, find those indexes and replace the double quotes with single quotes
  
- Example raw line:
  - {'user_id': 'evcentric', 'items_count': 137, 'steam_id': '76561198007712555', 'user_url': 'http://steamcommunity.com/id/evcentric', 'items': [{'item_id': '454060', 'item_name': 'Blueprint"s Tycoon', 'playtime_forever': 23, 'playtime_2weeks': 0}, {'item_id': '466170', 'item_name': 'Idling to Rule the Gods', 'playtime_forever': 28545, 'playtime_2weeks': 1554}]}

- Example cleaned line:
  - {"user_id": "evcentric", "items_count": 137, "steam_id": "76561198007712555", "user_url": "http://steamcommunity.com/id/evcentric", "items": [{"item_id": "454060", "item_name": "Blueprint's Tycoon", "playtime_forever": 23, "playtime_2weeks": 0}, {"item_id": "466170", "item_name": "Idling to Rule the Gods", "playtime_forever": 28545, "playtime_2weeks": 1554}]}

In [13]:
import re
def _fix_item_name_quotes(line):
    """Return *line* with its quotes rewritten so the record can parse as JSON.

    Every single quote is first turned into a double quote. Any double quote
    that then sits *inside* an ``item_name`` value is turned back into a single
    quote, so the value stays one JSON string.
    """
    cleaned = line.replace("'", '"')
    # Replacements below are one-character-for-one-character, so these offsets
    # stay valid while `cleaned` is being rebuilt.
    name_starts = [match.start() for match in re.finditer("item_name", cleaned)]
    for name_start in name_starts:
        # Skip `item_name": "` (9 + 4 characters) to reach the value itself.
        value_start = name_start + 13
        playtime_at = cleaned.find("playtime_forever", value_start)
        if playtime_at == -1:
            continue
        # The value is followed by `", "` (4 characters) before the next key.
        value_end = playtime_at - 4
        if value_end <= value_start:
            continue
        value = cleaned[value_start:value_end].replace('"', "'")
        cleaned = cleaned[:value_start] + value + cleaned[value_end:]
    return cleaned


def clean_data(
    raw_path="/Users/akshay/Downloads/australian_users_items.json",
    cleaned_path="/Users/akshay/Downloads/australian_users_items_cleaned.json",
):
    """Rewrite the raw users/items dump line by line into *cleaned_path*.

    Args:
        raw_path: File with one Python-repr style record per line.
        cleaned_path: Destination file; it is overwritten.

    A read/write or decoding error stops the conversion. It is reported on
    stdout instead of being silently swallowed, and the lines written so far
    are kept.
    """
    with open(raw_path, "r") as raw_input, open(cleaned_path, "w") as cleaned_file:
        lines_written = 0
        try:
            for line in raw_input:
                cleaned_file.write(_fix_item_name_quotes(line))
                lines_written += 1
        except (OSError, UnicodeError) as err:
            print(f"clean_data stopped after {lines_written} lines: {err!r}")


if __name__ == "__main__":
    clean_data()

### Step 3: Prepare the dataset

- To predict the ratings for users, we need the dataset in the format of <b>user_id,item_id,rating</b> <br>
Since the dataset does not have an explicit "rating" value, we will use implicit ratings to estimate the user's likeliness towards the game. <br>
- Each user entry JSON has a "playtime_forever" value. We can use this value to estimate the rating the user would give this game. For example, if the user has been playing a particular game for a long time, we can reasonably assume the user would rate this game highly. <br>
- The "playtime_forever" values vary a lot between users, so we cannot use the raw value as it would skew our results. For example, if a user's average playtime is 100 hours, and they have spent 5 hours playing a game, then the rating shouldn't be that high. But if a user's average playtime is 10 hours, and they have spent 5 hours playing a game, then the rating should be high. So we normalize the rating between 1 and 100 by taking into consideration the minimum playtime and maximum playtime of each user.
- The playtime is scaled to a rating between 1 and 100 using the following formula:
  - <b>scaled_playtime = (1-(playtime-min_playtime)/(max_playtime-min_playtime)) + 100*((playtime-min_playtime)/(max_playtime-min_playtime))</b>
- Another decision made is to ignore items if the user has a playtime of 0 for that item, because there can be several reasons for a playtime of 0. For example, the user may have only recently purchased the game, or may not have gotten around to playing it yet. Since we don't have any extra information to infer such reasons, it's better to not include 0 values, as they would significantly skew the scaled rating.

In [5]:
import json
import sys
import random

def _scale_user_playtimes(items):
    """Return ``[item_id, rating]`` pairs for the played items of one user.

    Items with a playtime of 0 are ignored. The remaining playtimes are
    min-max scaled per user onto the 1..100 range. A user whose played items
    all share one playtime (including a single played item) has no spread to
    scale against and yields no rows.
    """
    played = []
    for item in items:
        item_id = str(item['item_id'])
        playtime = item['playtime_forever']
        if playtime > 0:
            played.append((item_id, playtime))
    if not played:
        return []

    playtimes = [playtime for _, playtime in played]
    min_playtime = min(playtimes)
    max_playtime = max(playtimes)
    spread = max_playtime - min_playtime
    if spread == 0:
        # Min-max scaling is undefined here. Previously this case raised
        # ZeroDivisionError, which a bare `except` swallowed; the user is
        # still skipped, but now deliberately.
        return []

    rows = []
    for item_id, playtime in played:
        fraction = (playtime - min_playtime) / spread
        rows.append([item_id, (1 - fraction) + 100 * fraction])
    return rows


def create_dataset(
    cleaned_path="/Users/akshay/Downloads/australian_users_items_cleaned.json",
    dataset_path="/Users/akshay/Downloads/experiment_1_dataset.csv",
):
    """Build the shuffled ``user_id,item_id,rating`` CSV from the cleaned dump.

    Args:
        cleaned_path: File with one JSON user record per line.
        dataset_path: Destination CSV; it is overwritten.

    Lines that are not valid JSON or lack the expected fields are skipped as a
    whole, so a user never contributes a partial set of rows.
    """
    resultRows = []
    with open(cleaned_path, "r") as cleaned_file:
        for line in cleaned_file:
            try:
                json_obj = json.loads(line)
                user_id = str(json_obj['user_id'])
                user_rows = _scale_user_playtimes(json_obj['items'])
            except (ValueError, KeyError, TypeError):
                # Malformed record: best-effort processing, move on.
                continue
            resultRows.extend([user_id, item_id, rating] for item_id, rating in user_rows)

    # Uses the global `random` state, so a prior random.seed() fixes the order.
    random.shuffle(resultRows)
    with open(dataset_path, "w") as experiment_dataset:
        experiment_dataset.write("user_id,item_id,rating\n")
        experiment_dataset.writelines(
            f"{user_id},{item_id},{rating}\n" for user_id, item_id, rating in resultRows
        )


if __name__ == "__main__":
    create_dataset()

The final dataset for analysis looks like this: <br>
It has 3252104 rows

In [30]:
# Load the generated user_id/item_id/rating CSV and preview its first rows.
users_full = pd.read_csv('/Users/akshay/Downloads/experiment_1_dataset.csv')
users_full.head()

Unnamed: 0,user_id,item_id,rating
0,Lulllabi,220860,1.154185
1,76561198076749377,312280,1.490666
2,76561198040566460,341720,1.613636
3,cyan101,346900,1.416118
4,MRmeohme,57300,2.038601


In [31]:
# Shape of the full dataset loaded into `users_full`. The original inspected
# `users`, which is only defined in a later cell.
users_full.shape

(3252104, 3)

### Step 4: Training the Model

- To perform model training, first we need to split the dataset generated in the previous step into training and testing subsets. As the dataset has 3252104 rows, roughly 80%, i.e 2601685 rows will be part of the training subset, and the rest as testing subset

  - Split the experiment_1_dataset.csv using the following command on the terminal:
    - split -l 2601685 experiment_1_dataset.csv train
    - Rename the traina and trainb files as experiment_1_dataset_train.csv and experiment_1_dataset_test.csv

In [6]:
# Read the training dataset as a DataFrame and convert it to a Dataset object for the surprise library.
# The rating scale 1 - 100 is declared when initializing the Reader.

users = pd.read_csv('/Users/akshay/Downloads/experiment_1_dataset_train.csv')
reader = Reader(rating_scale=(1, 100))
data = Dataset.load_from_df(users[['user_id', 'item_id', 'rating']], reader)
# Hold out a shuffled 20% of `data` as a test split.
# NOTE(review): `trainset` / `testset` are not referenced again in the cells
# visible here — confirm whether this split is still needed.
trainset, testset = train_test_split(data, test_size=.20, shuffle=True)

- We will use SVD algorithm to extract information from latent factors. The only information in the dataset about the relationship between games and users is whether the user has purchased the game and the time played. This is not enough information to build models using decision trees or KNN. <br>
We perform a Grid Search to find the most suitable hyperparameters as follows:

In [7]:
# First hyperparameter grid: 3 * 3 * 2 * 2 = 36 combinations.
param_grid = {
    "n_factors": [0,1,2],
    "n_epochs": [10, 50, 60],
    "lr_all": [0.0005, 0.0007],
    "reg_all": [0.01, 0.09]
}

# Evaluate every combination with 3-fold cross-validation on RMSE
# (36 * 3 = 108 fits); n_jobs=-1 runs the fits in parallel workers and
# joblib_verbose=10 prints progress while they run.
gs = GridSearchCV(SVD, param_grid, n_jobs=-1, measures=["rmse"], cv=3, joblib_verbose=10)
gs.fit(data)

# Best RMSE score found and the parameter combination that achieved it.
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   27.2s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   54.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed: 11.6min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done  96 out of 108 | elapsed: 16.3min remaining:  2.0min


15.118828593579039
{'n_factors': 0, 'n_epochs': 60, 'lr_all': 0.0005, 'reg_all': 0.09}


[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 19.3min finished



- We run the Grid Search again with a different set of parameters. Note the "-1" passed for the n_jobs parameter. This instructs the Python runtime to utilize all the cores of the machine and run concurrent workers for doing the Grid Search


In [8]:
# Second hyperparameter grid (again 3 * 3 * 2 * 2 = 36 combinations), using
# more latent factors, more epochs and stronger regularization than the first.
param_grid_2 = {
    "n_factors": [5,7,9],
    "n_epochs": [60,70,90],
    "lr_all": [0.0001, 0.0006],
    "reg_all": [0.12, 0.15]
}

# Same search setup as before: 3-fold CV on RMSE, run in parallel workers.
# Note that `gs` is rebound, so the first search's results are no longer
# reachable through this name.
gs = GridSearchCV(SVD, param_grid_2, n_jobs=-1, measures=["rmse"], cv=3, joblib_verbose=10)
gs.fit(data)

# Best RMSE score found and the parameter combination that achieved it.
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed: 11.7min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 15.6min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed: 42.6min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed: 46.5min
[Parallel(n_jobs=-1)]: Done  96 out of 108 | elapsed: 52.0min remaining:  6.5min


15.682724470001697
{'n_factors': 9, 'n_epochs': 60, 'lr_all': 0.0001, 'reg_all': 0.12}


[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 57.2min finished


- Our earlier Grid Search hyperparameters had a lower RMSE, so we will train the model on the full trainset using those hyperparameters

In [9]:
# Build the SVD model with the hyperparameters hard-coded below and fit it on
# every rating in `data` (no hold-out). verbose=True prints a line per epoch.
svd_model = SVD(n_factors=0, n_epochs=60, lr_all=0.0005, reg_all=0.09, verbose=True)
full_trainset = data.build_full_trainset()
svd_model.fit(full_trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49
Processing

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11a0435e0>

- Perform K-Fold Cross Validation to measure the accuracy of the model on randomized K=5 splits of data

In [10]:
# 5-fold cross-validation of the chosen configuration, reporting RMSE per fold.
# NOTE(review): the already-fitted `svd_model` instance is passed in. If
# cross_validate refits that instance in place, predictions made afterwards
# come from the last fold's fit rather than from a fit on all of `data` —
# confirm, or pass a fresh SVD(...) with the same parameters here.
cross_validate(svd_model, data, measures=['RMSE'], cv=5, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49
Processing

{'test_rmse': array([15.12304094, 15.12905844, 15.11311565, 15.1183519 , 15.14055615]),
 'fit_time': (92.7524209022522,
  95.74028491973877,
  93.7231879234314,
  95.69433236122131,
  91.7487223148346),
 'test_time': (8.085649967193604,
  7.969265937805176,
  8.369022846221924,
  7.3205320835113525,
  7.356119871139526)}

### Step 6: Make Predictions using the SVD Model

- Now that the model is trained, we can make predictions of a user rating for a particular game (item_id)

In [11]:
# Predict the rating for a single (user, game) pair and print the estimate.
# Presumably the ids must match the raw ids the model was trained on — verify
# their types (str user id, int item id) against the training DataFrame.
user_id = "deathfatel"
item_id = 377160
pred = svd_model.predict(user_id, item_id)
print(pred.est)

24.283107543017255


### Step 7: Generate dataset for the Recommendation classifier

- We can now use this ratings predictor model to recommend games as well as game bundles to user. For performing that task, we need to be able to predict whether to recommend a particular game to a user or not. We need a new prepared dataset to perform this hypothesis. The dataset should have the following schema:
  - <b>user_id,item_id,class</b>
  
- This dataset can be generated from experiment_1_dataset_test.csv. If the predicted rating is above 50, we recommend the game to the user
- For each row, if the actual rating in the test file is above 50, set the class as True, else False

In [None]:
def generate_test_data(
    test_path="/Users/akshay/Downloads/experiment_1_dataset_test.csv",
    class_test_path="/Users/akshay/Downloads/experiment_1_dataset_test_with_class.csv",
    threshold=50,
):
    """Label every ``user_id,item_id,rating`` row with a True/False class.

    Args:
        test_path: CSV whose first three columns are user id, item id, rating.
        class_test_path: Destination ``user_id,item_id,class`` file; overwritten.
        threshold: A row is labelled ``True`` when its rating is strictly
            above this value, otherwise ``False``.

    Lines with fewer than three fields or a non-numeric rating (for example a
    header or a blank line) are skipped instead of raising.
    """
    with open(test_path, "r") as test_file, open(class_test_path, "w") as class_test_file:
        for line in test_file:
            splits = line.rstrip("\n").split(",")
            if len(splits) < 3:
                continue
            user_id, item_id, rating = splits[0], splits[1], splits[2]
            try:
                is_positive = float(rating) > threshold
            except ValueError:
                # Not a data row (e.g. the "user_id,item_id,rating" header).
                continue
            class_test_file.write(f"{user_id},{item_id},{is_positive}\n")


if __name__ == "__main__":
    generate_test_data()

### Step 8: Evaluate the Recommendation classifier

- We use the generated model to predict the user rating for an item, and subsequently the class. If the predicted class and actual class is the same, we have made a successful recommendation. 
- Based on the above technique of evaluation, we calculate the True Positive, True Negative, False Positive and False Negative metrics

In [34]:
import csv

# Score the threshold classifier against the labelled test file: each row is
# `user_id,item_id,class`, and the model's prediction counts as positive when
# the estimated rating is above 50.
resultRows = []  # not used by this cell; kept so the module-level name still exists
fail = 0  # rows that could not be parsed and were therefore skipped
true_positive = 0
true_negative = 0
false_positive = 0
false_negative = 0
with open('/Users/akshay/Downloads/experiment_1_dataset_test_with_class.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        # Only the parsing can legitimately fail, so only that is guarded.
        try:
            user_id = row[0]
            item_id = int(row[1])
            # Anything other than the literal "False" counts as a positive.
            actual_class = row[2] != "False"
        except (IndexError, ValueError):
            fail = fail + 1
            continue

        pred = svd_model.predict(user_id, item_id)
        predicted_rating = float(pred.est)
        predicted_class = predicted_rating > 50.0

        if predicted_class and actual_class:
            true_positive = true_positive + 1
        elif not predicted_class and not actual_class:
            true_negative = true_negative + 1
        elif predicted_class:
            false_positive = false_positive + 1
        else:
            false_negative = false_negative + 1

print("True Positive: ", true_positive)
print("True Negative: ", true_negative)
print("False Positive: ", false_positive)
print("False Negative: ", false_negative)
if fail:
    # Surface skipped rows instead of hiding them.
    print("Skipped rows: ", fail)

True Positive:  4324
True Negative:  620739
False Positive:  4416
False Negative:  20941


In [35]:
# Precision = TP / (TP + FP); recall = TP / (TP + FN).
# When a denominator is zero (no predicted positives / no actual positives)
# the metric is reported as 0.0 instead of raising ZeroDivisionError.
predicted_positives = true_positive + false_positive
actual_positives = true_positive + false_negative
precision = true_positive/predicted_positives if predicted_positives else 0.0
recall = true_positive/actual_positives if actual_positives else 0.0

print(precision)
print(recall)

0.49473684210526314
0.1711458539481496


### Step 9: Get Recommendation for a user_id for any item_id

- We can get recommendation values for each user_id - item_id combination as follows:

In [71]:
def to_recommend(user_id, item_id):
    """Print the model's estimated rating and report whether it exceeds 50.

    Args:
        user_id: User id passed to ``svd_model.predict``.
        item_id: Item id passed to ``svd_model.predict``.

    Returns:
        ``True`` when the estimated rating is strictly above 50.0, else ``False``.
    """
    estimate = float(svd_model.predict(user_id, item_id).est)
    print(estimate)
    return estimate > 50.0


In [80]:
# Example: prints the estimated rating, then the cell shows the True/False result.
to_recommend("RhinoSquad",377160)

23.65173220261376


False

### Step 10: Make bundle predictions for users

- Now for each user, we can calculate the average predicted rating of each game bundle. This function, given a user_id, will predict the rating for each item in each game bundle. If the average predicted rating of the bundle is above a threshold, the whole bundle is recommended to the user
- The threshold selected for this analysis is 25, i.e. if the average predicted rating for the bundle by the user is above 25, then the bundle is recommended to the user

In [68]:
def _load_bundle_item_ids(bundle_path):
    """Parse the bundle file once into ``(bundle_name, [item_id, ...])`` pairs.

    Lines that are not valid JSON, lack the expected keys, or list no items
    are skipped (an empty bundle has no average rating).
    """
    bundles = []
    with open(bundle_path, "r") as cleaned_file:
        for line in cleaned_file:
            try:
                json_obj = json.loads(line)
                item_ids = [int(item["item_id"]) for item in json_obj["items"]]
                bundle_name = json_obj["bundle_name"]
            except (ValueError, KeyError, TypeError):
                continue
            if item_ids:
                bundles.append((bundle_name, item_ids))
    return bundles


def _bundles_above_threshold(user_id, bundles, threshold=25.0):
    """Return names of bundles whose mean predicted rating exceeds *threshold*."""
    recommended = []
    for bundle_name, item_ids in bundles:
        bundle_rating = 0
        for item_id in item_ids:
            bundle_rating = bundle_rating + float(svd_model.predict(user_id, item_id).est)
        if bundle_rating / len(item_ids) > threshold:
            recommended.append(bundle_name)
    return recommended


def get_bundles_for_all_users():
    """Print bundle recommendations for every distinct user in the test CSV.

    The bundle file is parsed once up front rather than once per CSV row, and
    each user id is scored only the first time it appears, so a user with many
    rows is neither recomputed nor printed repeatedly. A line is printed only
    for users with more than 1 and fewer than 10 recommended bundles.
    """
    bundles = _load_bundle_item_ids("/Users/akshay/Downloads/bundle_data_cleaned.json")
    seen_user_ids = set()
    with open("/Users/akshay/Downloads/experiment_1_dataset_test.csv", "r") as users_file:
        readCSV = csv.reader(users_file, delimiter=',')
        for row in readCSV:
            if not row:
                # Blank line in the CSV: nothing to score.
                continue
            user_id = row[0]
            if user_id in seen_user_ids:
                continue
            seen_user_ids.add(user_id)
            bundle_reccos = _bundles_above_threshold(user_id, bundles)
            if 1 < len(bundle_reccos) < 10:
                print("User: " + user_id + "  Bundles: " + str(bundle_reccos) + "\n")

# Print bundle recommendations for the users listed in the test CSV.
get_bundles_for_all_users()

User: nosferatuzodd  Bundles: ['The Binding of Isaac : Rebirth + Afterbirth Bundle', 'Left 4 Dead Bundle']

User: 76561198043626578  Bundles: ['The Binding of Isaac : Rebirth + Afterbirth Bundle', 'Left 4 Dead Bundle']

User: 76561198033718152  Bundles: ['The Elder Scrolls V: Skyrim + Add-Ons', 'The Binding of Isaac : Rebirth + Afterbirth Bundle', 'Left 4 Dead Bundle']



- To generate bundle recommendation for a specific user, the following code snippet can be used:

In [67]:
def get_bundles_for_user(user_id):
    """Print the bundles recommended to *user_id*.

    Each line of the bundle file is one JSON bundle. A bundle is recommended
    when the mean of the model's estimated ratings over its items is above
    25.0. Any bundle that fails to parse or score is skipped. Nothing is
    printed unless more than 1 and fewer than 10 bundles qualify. Returns None.
    """
    recommended = []
    with open("/Users/akshay/Downloads/bundle_data_cleaned.json", "r") as bundle_file:
        for raw_line in bundle_file:
            try:
                bundle = json.loads(raw_line)
                bundle_items = bundle["items"]
                name = bundle["bundle_name"]
                total = 0
                for entry in bundle_items:
                    game_id = int(entry["item_id"])
                    total += float(svd_model.predict(user_id, game_id).est)
                mean_rating = float(total / len(bundle_items))
                if mean_rating > 25.0:
                    recommended.append(name)
            except Exception:
                # Best effort: a bad bundle line must not stop the others.
                pass
    if 1 < len(recommended) < 10:
        print("User: " + user_id + "  Bundles: " + str(recommended) + "\n")

# Example: print the recommended bundles for one specific user id.
get_bundles_for_user("76561198033718152")

User: 76561198033718152  Bundles: ['The Elder Scrolls V: Skyrim + Add-Ons', 'The Binding of Isaac : Rebirth + Afterbirth Bundle', 'Left 4 Dead Bundle']

