In [2]:
# useful packages
import numpy as np
import csv
import math
import pandas as pd
from sklearn import preprocessing
from sklearn import datasets
from sklearn import cluster
import matplotlib.pyplot as plt
import itertools

In [4]:
# load data as a dataframe with pandas
user_history = pd.read_csv("user_history.csv")
user_history_without_user_ID = user_history.drop(['USER ID'],axis=1)
user_ratings = pd.read_csv("user_ratings.csv")

In [5]:
user_history.shape

(4500, 101)

In [6]:
user_ratings.shape

(33725, 3)

To get a better sense of the data, using pandas, you can look at the top 5 rows using the head function:

In [7]:
user_history.head()

Unnamed: 0,USER ID,alpine kimono,sweden kansas,student icon,supreme ivan,albert charlie,heavy trapeze,fabric tokyo,brother robin,tiger catalog,...,cigar lagoon,equal comedy,bombay podium,helena robot,prodigy rhino,jumbo gray,radius wizard,fame quiz,bazaar complex,glass slogan
0,100950,2.192897,0.361397,0.531663,0.016105,0.694338,1.250662,1.483259,1.799682,0.664616,...,0.309236,2.622033,1.242243,2.94756,0.693319,0.625303,2.352295,1.548417,1.517709,1.508219
1,100956,2.776597,0.788821,1.187149,0.473049,2.245112,1.998881,0.072812,1.44197,2.264368,...,2.304845,1.498307,0.319484,0.089212,3.157167,2.789594,1.003377,1.141516,2.011509,0.377898
2,100962,0.281717,5.046727,4.407484,2.138591,1.075562,0.385842,0.626482,0.026648,1.949374,...,1.463952,0.601814,1.98313,2.364877,0.429133,2.75807,0.563619,0.271453,0.579626,1.785609
3,100969,0.943147,1.165713,2.016138,1.236626,0.973435,2.514205,0.022476,1.091282,1.320748,...,1.428416,3.791742,1.10207,3.250911,1.209403,0.246261,0.558631,1.163652,1.922758,1.00804
4,100974,0.485729,4.633607,4.120416,1.497073,1.463875,0.867737,0.877514,0.019603,1.751483,...,1.179954,1.031741,1.935182,2.012611,0.553173,2.614605,0.313479,0.143246,0.914407,2.011048


In [8]:
user_ratings.head()

Unnamed: 0,USER ID,PRODUCT,RATING
0,100950,secret postage,4
1,100950,violet saga,4
2,100950,pepper chicago,3
3,100950,nina sailor,2
4,100950,front salami,3


In [9]:
user_ratings_table = pd.pivot_table(user_ratings,values='RATING', index='USER ID',
                    columns='PRODUCT')
user_ratings_table.head()

PRODUCT,adrian crater,anagram mentor,apropos pizza,bandit anatomy,banjo ladder,barcode arnold,bazooka diagram,bronze mystic,calypso zigzag,casino unit,...,tourist micro,tripod stand,update lola,ventura puzzle,viking llama,violet saga,vista queen,viva avalon,voodoo planet,wheel gibson
USER ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100950,,,,,8.0,,,,5.0,,...,,,,,,4.0,,,,
100956,,,,,,5.0,,,,,...,,,,,,,,,,
100962,,,,,,,,,,,...,,,,,,,,,,
100969,,10.0,1.0,,5.0,,,,,,...,,,5.0,,,0.0,,,8.0,2.0
100974,,,,9.0,,5.0,,1.0,,,...,,,,,,,,9.0,,


In [10]:
np.sum(~np.isnan(user_ratings_table.to_numpy()))

33725

In [11]:
# encode users as indices
le_user = preprocessing.LabelEncoder()
le_user.fit(user_history['USER ID'])

user_with_rating_index = le_user.transform(user_ratings['USER ID'])
all_user_index = le_user.transform(user_history['USER ID'])
user_ratings['USER INDEX'] = user_with_rating_index
user_ratings.head()

# encode products as indices
le_product = preprocessing.LabelEncoder()
le_product.fit(user_ratings['PRODUCT'])

product_ID = le_product.transform(user_ratings['PRODUCT'])
user_ratings['PRODUCT ID'] = product_ID
user_ratings.head()

Unnamed: 0,USER ID,PRODUCT,RATING,USER INDEX,PRODUCT ID
0,100950,secret postage,4,0,54
1,100950,violet saga,4,0,70
2,100950,pepper chicago,3,0,41
3,100950,nina sailor,2,0,35
4,100950,front salami,3,0,18


We can take the data in user_history and turn it into a matrix using the following command:

In [12]:
X = user_history.to_numpy()

For an introduction and short tutorial on pandas, you are invited to read over https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html.

In [13]:
history_users = set(user_history['USER ID'].unique())

In [14]:
user_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33725 entries, 0 to 33724
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   USER ID     33725 non-null  int64 
 1   PRODUCT     33725 non-null  object
 2   RATING      33725 non-null  int64 
 3   USER INDEX  33725 non-null  int64 
 4   PRODUCT ID  33725 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 1.3+ MB


In [15]:
user_ratings

Unnamed: 0,USER ID,PRODUCT,RATING,USER INDEX,PRODUCT ID
0,100950,secret postage,4,0,54
1,100950,violet saga,4,0,70
2,100950,pepper chicago,3,0,41
3,100950,nina sailor,2,0,35
4,100950,front salami,3,0,18
...,...,...,...,...,...
33720,117449,garden jimmy,4,2999,19
33721,117449,evita soviet,8,2999,16
33722,117449,tourist micro,7,2999,65
33723,117449,diploma moment,7,2999,14


In [59]:
# use KNN on user_history data and set the classification as the average 
# from the user_ratings tables of the nearest neighbors
import math
import numpy as np
import scipy.stats
# Calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1)-1):
        distance += (row1[i] - row2[i])**2
    return math.sqrt(distance)
 
def get_dist(user_history,user_id):
    train = user_history.to_numpy()
    #test row will be the row from user_history where USER ID == user_id
    test_row = user_history.to_numpy()
    distances = list()
    for train_row in train:
        dist = euclidean_distance(test_row[0][1:], train_row[1:])
        user_id = int(train_row[0])
        distances.append((user_id, dist))
    distances.sort(key=lambda tup: tup[1])
    return distances[1:]
    

def get_product_neighbors(user_ratings,distances,product,user_id,num_neighbors=2):
    product_users = list(user_ratings[user_ratings.PRODUCT == product]['USER ID'])
    #find the subset of user_history for those users with product rating
    
    neighbors = list()
    n = 0
    i=0
    while n<num_neighbors:
        if distances[i][0] in product_users:
            neighbors.append(distances[i])
            n+=1
        i+=1
    return neighbors


    
# Make a prediction with neighbors
def predict_classification(user_ratings,product,product_neighbors):
    # get_product_neighbors returns the n nearest neighbors that have a rating for given product
    # so make_prediction() should find the ratings for the n neighbors for that product
    # and return the average rating as the prediction 
    # for the original user_id
    # given a list of distances, find the average distance and set that =1
    # calculate the standard deviation of the set of ratings
    # find the z-value (how many std.dev's away) the single distance is from the average
    # e.g. if the average distance is 40, and a distance is 42, this will be a value less than 1
    # for a distance of 38, this will be a value greater than 1
    
    ratings = []
    dists = []
    for neighbor in product_neighbors:
        user_id = neighbor[0]
        dist = neighbor[1]
        rating = int(user_ratings[(user_ratings['USER ID']==int(user_id)) & (user_ratings['PRODUCT'] ==product)]['RATING'])
        #print('user id {} had a rating of {} for product {}'.format(user_id,rating,product))
        dists.append(dist)
        ratings.append(rating)
    
    #find the weighted average
    zscores = scipy.stats.zscore(dists)
    weights = [abs(x) for x in zscores]
    rating_sum = 0
    for i in range(len(dists)):
        rating_sum += ratings[i]*weights[i]
    final_rating = rating_sum/len(dists)
    
    if final_rating >10:
        return 5
    elif final_rating <0:
        return 0
    else:
        return final_rating    


In [None]:
i=0
list_of_users = list(user_history['USER ID'].unique())
list_of_products = list(user_ratings['PRODUCT'].unique())
#create a dictionary of new ratings
updated_ratings = {}
#create a copy of the user ratings dataframe to append new ratings to
copy_user_ratings = user_ratings
print('Starting predictings for {} users and {} products.'.format(len(list_of_users),len(list_of_products)))
for user_id in list_of_users:
    distances = get_dist(user_history,user_id)
    user_index = list(user_ratings[user_ratings['USER ID']==user_id]['USER INDEX'].unique())[0]
    for product in list_of_products:
        product_neighbors = get_product_neighbors(user_ratings,distances,product,user_id,num_neighbors=5)
        product_id = list(user_ratings[user_ratings.PRODUCT==product]['PRODUCT ID'].unique())[0]
        if i%1000==0:
            print('{} new users have predicted ratings'.format(i))
        #print('    - Product: ',product)
        rating = user_ratings[(user_ratings['USER ID']==int(user_id)) & (user_ratings['PRODUCT'] ==product)]['RATING']
        if len(rating) ==0:
            user_product_rating = predict_classification(user_ratings,product,product_neighbors)
            #print('    -User ID {} Product {} has a predicted rating of {}'.format(user_id,product,user_product_rating))
            updated_ratings[(user_id,product)] = user_product_rating
            row = {'USER ID':user_id,'PRODUCT':product,'RATING':user_product_rating,'USER INDEX':user_index,'PRODUCT ID':product_id}
            copy_user_ratings=copy_user_ratings.append(row, ignore_index=True)
        i+=1  
copy_user_ratings.to_csv('user_ratings.csv')

Starting predictings for 4500 users and 75 products.
0 new users have predicted ratings
1000 new users have predicted ratings
2000 new users have predicted ratings
3000 new users have predicted ratings


In [None]:
'''~~~~~~NOT NEEDED CODE~~~~~~~~~~~~~~~'''
#find the subset of user_history for those users with product rating
train = user_history[user_history['USER ID'].isin(product_users)]
#train should just be an nd-array
train = train.to_numpy()
#test row will be the row from user_history where USER ID == user_id
test_row = user_history[user_history['USER ID']==user_id].to_numpy()
test_row

distances = list()
for train_row in train:
    print('train_row: ',train_row)
    print('test_row', test_row)
    dist = euclidean_distance(test_row[0], train_row)
    print(dist)
    distances.append((train_row, dist))
distances.sort(key=lambda tup: tup[1])

import numpy as np
import scipy.stats

dist = np.array([143.3,32.5,57])
ratings = np.array([5,2,4])
print(dist.mean())
print('std',np.std(dist))
zscores = scipy.stats.zscore(dist)
weights = [abs(x) for x in zscores]

rating_sum = 0
for i in range(len(dist)):
    rating_sum += ratings[i]*weights[i]
final_rating = rating_sum/len(dist)
if final_rating >5:
    return 5
elif final_rating <0:
    return 0
else:
    return final_rating


In [16]:
user_ratings_table = pd.pivot_table(user_ratings,values='RATING', index='USER ID',
                    columns='PRODUCT')
user_ratings_table.head()

PRODUCT,adrian crater,anagram mentor,apropos pizza,bandit anatomy,banjo ladder,barcode arnold,bazooka diagram,bronze mystic,calypso zigzag,casino unit,...,tourist micro,tripod stand,update lola,ventura puzzle,viking llama,violet saga,vista queen,viva avalon,voodoo planet,wheel gibson
USER ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100950,,,,,8.0,,,,5.0,,...,,,,,,4.0,,,,
100956,,,,,,5.0,,,,,...,,,,,,,,,,
100962,,,,,,,,,,,...,,,,,,,,,,
100969,,10.0,1.0,,5.0,,,,,,...,,,5.0,,,0.0,,,8.0,2.0
100974,,,,9.0,,5.0,,1.0,,,...,,,,,,,,9.0,,


In [22]:
user_results = pd.read_csv("results.csv")
user_results.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,USER ID,PRODUCT,RATING,USER INDEX,PRODUCT ID
0,0,0,100950,secret postage,4.0,0,54
1,1,1,100950,violet saga,4.0,0,70
2,2,2,100950,pepper chicago,3.0,0,41
3,3,3,100950,nina sailor,2.0,0,35
4,4,4,100950,front salami,3.0,0,18


In [18]:
user_results_table = pd.pivot_table(user_results,values='RATING', index='USER ID',
                    columns='PRODUCT')
user_results_table.head()

PRODUCT,adrian crater,anagram mentor,apropos pizza,bandit anatomy,banjo ladder,barcode arnold,bazooka diagram,bronze mystic,calypso zigzag,casino unit,...,tourist micro,tripod stand,update lola,ventura puzzle,viking llama,violet saga,vista queen,viva avalon,voodoo planet,wheel gibson
USER ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100950,4.083149,7.080706,4.928562,6.287717,8.0,6.632329,6.10855,3.973406,5.0,2.348351,...,0.744634,4.154948,3.376589,0.212029,0.265492,4.0,4.318837,2.883574,6.106494,5.653546
100956,4.083149,7.080706,4.928562,6.287717,7.555244,5.0,6.10855,3.973406,4.63912,2.348351,...,0.744634,4.154948,3.376589,0.212029,0.265492,4.324377,4.318837,2.883574,6.106494,5.653546
100962,4.083149,7.080706,4.928562,6.287717,7.555244,6.632329,6.10855,3.973406,4.63912,2.348351,...,0.744634,4.154948,3.376589,0.212029,0.265492,4.324377,4.318837,2.883574,6.106494,5.653546
100969,4.083149,10.0,1.0,6.287717,5.0,6.632329,6.10855,3.973406,4.63912,2.348351,...,0.744634,4.154948,5.0,0.212029,0.265492,0.0,4.318837,2.883574,8.0,2.0
100974,4.083149,7.080706,4.928562,9.0,7.555244,5.0,6.10855,1.0,4.63912,2.348351,...,0.744634,4.154948,3.376589,0.212029,0.265492,4.324377,4.318837,9.0,6.106494,5.653546


In [21]:
user_results_table.to_csv("final_results.csv")