In [1]:
import numpy as np
import pandas as pd

import math


In [2]:
items_pd = pd.read_table(".\\ml-100k\\u.item", header=None, names=['item id', 'title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy'
    , 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'], delimiter='|', encoding = "ISO-8859-1")
items_pd.head()
ITEMS = items_pd.shape[0]

In [3]:
users_pd = pd.read_table(".\\ml-100k\\u.user", header=None, names=['user id', 'age', 'gender', 'occupation', 'zip code'], delimiter='|', encoding = "ISO-8859-1")
users_pd.head()
USERS = users_pd.shape[0]

In [4]:
u1_base_pd = pd.read_table(".\\ml-100k\\u1.base", header=None, names=['user id', 'item id', 'rating', 'timestamp'])
u1_base_pd.head()

Unnamed: 0,user id,item id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [5]:
def to_data_pd(input):
    data_array = np.empty((USERS, ITEMS))
    for _, rating in input.iterrows():
        data_array[rating['user id'] - 1][rating['item id'] - 1] = rating['rating']
    output_pd = pd.DataFrame(data_array, index=range(1,USERS+1), columns=range(1,ITEMS+1))
    return output_pd
    

In [6]:
u1_data_pd = to_data_pd(u1_base_pd)
u1_data_pd.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
1,5.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
def find_average(data_pd):
    avgs = []
    for _, user in data_pd.iterrows():
        avgs.append(user.sum() / user.astype(bool).sum())

    return avgs
    

In [8]:
averages = find_average(u1_data_pd)


In [15]:
#This implements the Pearson metric as found in equation (2.2)
def pearson(user1, user2):
    intersection = [k for k in range(1,ITEMS) if u1_data_pd.iloc[user1-1][k] > 0 and u1_data_pd.iloc[user2-1][k] > 0]
    if intersection == []:
        return 0
    top = sum([(u1_data_pd.iloc[user1-1][k] - averages[user1-1]) * (u1_data_pd.iloc[user2-1][k] - averages[user2-1]) for k in intersection])
    bottom1 = math.sqrt( sum([(u1_data_pd.iloc[user1-1][k] - averages[user1-1]) ** 2 for k in intersection]))
    bottom2 = math.sqrt( sum([(u1_data_pd.iloc[user2-1][k] - averages[user2-1]) ** 2 for k in intersection]))
    return top / (bottom1 * bottom2)

In [16]:
# I don't know if this is the way to go about this.
# It takes a long time to run. However, I think it'set
# necessary because we'll have to calculate these values
# sooner or later anyways.
pearson_matrix = np.empty((USERS,USERS))
for i in range(1, USERS):
    for j in range(1, USERS):
        pm = pearson(i,j)
        print(i, j, pm)
        pearson_matrix[i - 1][j - 1] = pm
print(pearson_matrix)

1 1 1.0
1 2 0.5218078002649392
1 3 0.525128582870241
1 4 1.0
1 5 0.14781879428629585
1 6 0.29825794052773624
1 7 0.2275490939066833
1 8 0.8749808291580777
1 9 0.9071112056717623
1 10 -0.04460129214089602
1 11 0.03458369967893123
1 12 -0.12243110202220245
1 13 0.44370311570471577
1 14 -0.028697021430308108
1 15 0.3426794164092612
1 16 0.1302344821048268
1 17 0.17912266303852167
1 18 -0.10302479389214055
1 19 -0.9254978854313478
1 20 0.49896353357120166
1 21 0.6115933690120704
1 22 0.5615883911489011
1 23 0.15321426418797823
1 24 -0.003701193289200385
1 25 0.30333498826775557
1 26 0.3286007649963092
1 27 0.32946441102055024
1 28 0.5191410658836928
1 29 0.9687147417999895
1 30 0.37557256456833155
1 31 -0.14183485774379395
1 32 0.44484045102351627
1 33 0.2747211278973779
1 34 0
1 35 nan
1 36 -1.0


  return top / (bottom1 * bottom2)


1 37 0.8191753801274676
1 38 -0.5201291143713745
1 39 0.9999999999999999
1 40 -0.5347831422844271
1 41 0.1854630656663775
1 42 0.14161544315851396
1 43 0.24899683130869102
1 44 0.6089650211880977
1 45 0.7683836404684555
1 46 0.4964929839539766
1 47 1.0
1 48 -0.33057992405859526
1 49 0.052610009173701164
1 50 -0.6948679946972984
1 51 0.6900834831111966
1 52 0.25520085992071656
1 53 0.358849696904746
1 54 0.3262012563120302
1 55 0.28107350220037686
1 56 0.17874464805972493
1 57 0.4285106468054251
1 58 0.36274456979357644
1 59 0.2805350913473126
1 60 0.4959500623469498
1 61 0.8395533159254396
1 62 0.49475059023546886
1 63 0.2816136015742617
1 64 0.3273623556279753
1 65 0.5387329808659453
1 66 0.4423150879462328
1 67 -0.5631468724409736
1 68 0.7500473626063132
1 69 0.8769651680025043
1 70 0.3703910581196436
1 71 0.4362174833790708
1 72 0.4746505849722753
1 73 0.010287546727457948
1 74 0.233207651466341
1 75 0.7287663144472973
1 76 0.25607820728765646
1 77 0.29712091006903596
1 78 -0.436965

KeyboardInterrupt: 