In [12]:
import numpy as np
import pandas as pd

import math


In [13]:
items_pd = pd.read_table(".\\ml-100k\\u.item", header=None, names=['item id', 'title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy'
    , 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'], delimiter='|', encoding = "ISO-8859-1")
items_pd.head()
ITEMS = items_pd.shape[0]

In [14]:
users_pd = pd.read_table(".\\ml-100k\\u.user", header=None, names=['user id', 'age', 'gender', 'occupation', 'zip code'], delimiter='|', encoding = "ISO-8859-1")
users_pd.head()
USERS = users_pd.shape[0]

In [15]:
u1_base_pd = pd.read_table(".\\ml-100k\\u1.base", header=None, names=['user id', 'item id', 'rating', 'timestamp'])
u1_base_pd.head()

Unnamed: 0,user id,item id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [16]:
def to_data_pd(input):
    data_array = np.empty((USERS, ITEMS))
    for _, rating in input.iterrows():
        data_array[rating['user id'] - 1][rating['item id'] - 1] = rating['rating']
    output_pd = pd.DataFrame(data_array, index=range(1,USERS+1), columns=range(1,ITEMS+1))
    return output_pd
    

In [17]:
u1_data_pd = to_data_pd(u1_base_pd)
u1_data_pd.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
1,5.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
def find_average(data_pd):
    avgs = []
    for _, user in data_pd.iterrows():
        avgs.append(user.sum() / user.astype(bool).sum())

    return avgs
    

In [19]:
averages = find_average(u1_data_pd)


In [20]:
#This implements the Pearson metric as found in equation (2.2)
def pearson(user1, user2):
    intersection = [k for k in range(1,ITEMS) if u1_data_pd.iloc[user1-1][k] > 0 and u1_data_pd.iloc[user2-1][k] > 0]
    if len(intersection) <= 1:
        return 0

    bottom1 = math.sqrt( sum([(u1_data_pd.iloc[user1-1][k] - averages[user1-1]) ** 2 for k in intersection]))
    bottom2 = math.sqrt( sum([(u1_data_pd.iloc[user2-1][k] - averages[user2-1]) ** 2 for k in intersection]))

    if bottom1 * bottom2 == 0:
        return 0
        
    top = sum([(u1_data_pd.iloc[user1-1][k] - averages[user1-1]) * (u1_data_pd.iloc[user2-1][k] - averages[user2-1]) for k in intersection])

    return top / (bottom1 * bottom2)

In [27]:
# I don't know if this is the way to go about this.
# It takes a long time to run. However, I think it'set
# necessary because we'll have to calculate these values
# sooner or later anyways.
'''
pearson_matrix = np.empty((USERS,USERS))
for i in range(1, USERS):
    for j in range(1, USERS):
        pm = pearson(i,j)
        print(i, j, pm)
        pearson_matrix[i - 1][j - 1] = pm
print(pearson_matrix)
'''

# okay let's try to multithread this

import threading

THREADCOUNT = 4

def pearson_matrix_threaded(outputs, i_range):
    output_np = np.zeros((USERS,USERS))
    for i in i_range:
        for j in range(1, USERS):
            pm = pearson(i,j)
            #print(i, j, pm)
            output_np[i - 1][j - 1] = pm
    outputs.append(pd.DataFrame(output_np))

outputs = []
threads = [threading.Thread(target=pearson_matrix_threaded, args=(outputs, range(k, USERS, THREADCOUNT),)) for k in range(THREADCOUNT)]
for thread in threads:
    thread.start()

for thread in threads:
    thread.join()


df = pd.concat(outputs)
df.head()



In [26]:
print(df)

          0    1         2         3    4         5         6    7    \
0    0.000000  0.0  0.000000  0.000000  0.0  0.000000  0.000000  0.0   
1    0.521808  1.0  0.140028 -0.514496  0.0  0.385681  0.775999  0.0   
2    0.000000  0.0  0.000000  0.000000  0.0  0.000000  0.000000  0.0   
3    0.000000  0.0  0.000000  0.000000  0.0  0.000000  0.000000  0.0   
4    0.000000  0.0  0.000000  0.000000  0.0  0.000000  0.000000  0.0   
..        ...  ...       ...       ...  ...       ...       ...  ...   
938  0.000000  0.0  0.000000  0.000000  0.0  0.000000  0.000000  0.0   
939  0.000000  0.0  0.000000  0.000000  0.0  0.000000  0.000000  0.0   
940  0.000000  0.0  0.000000  0.000000  0.0  0.000000  0.000000  0.0   
941  0.000000  0.0  0.000000  0.000000  0.0  0.000000  0.000000  0.0   
942  0.000000  0.0  0.000000  0.000000  0.0  0.000000  0.000000  0.0   

          8    9    ...  933  934  935  936  937  938  939  940  941  942  
0    0.000000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0 