In [46]:
import numpy as np
import pandas as pd

In [47]:
ratings_df = pd.read_csv('ratings.csv')

In [48]:
ratings_df.head()

Unnamed: 0,userId,outfitID,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [49]:
userInput = [
            {'outfitID':1, 'rating':5},
            {'outfitID':2, 'rating':3.5},
            {'outfitID':296, 'rating':2},
            {'outfitID':1274, 'rating':5},
            {'outfitID':1968, 'rating':4.5}
         ] 
inputOutfit = pd.DataFrame(userInput)
inputOutfit

Unnamed: 0,outfitID,rating
0,1,5.0
1,2,3.5
2,296,2.0
3,1274,5.0
4,1968,4.5


In [50]:
inputOutfit.dtypes

outfitID      int64
rating      float64
dtype: object

In [51]:
userSubset = ratings_df[ratings_df['outfitID'].isin(inputOutfit['outfitID'].tolist())]
userSubset.head()

Unnamed: 0,userId,outfitID,rating
0,1,1,4.0
16,1,296,3.0
320,4,296,1.0
422,4,1968,4.0
516,5,1,4.0


In [52]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

In [53]:
userSubsetGroup.get_group(4)

Unnamed: 0,userId,outfitID,rating
320,4,296,1.0
422,4,1968,4.0


In [54]:
len(userSubsetGroup.get_group(4))

2

In [55]:
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)

In [56]:
userSubsetGroup[0]

(91,
        userId  outfitID  rating
 14121      91         1     4.0
 14122      91         2     3.0
 14173      91       296     4.5
 14316      91      1274     5.0
 14383      91      1968     3.0)

In [57]:
userSubsetGroup[0][0]

91

In [58]:
userSubsetGroup[0][1]

Unnamed: 0,userId,outfitID,rating
14121,91,1,4.0
14122,91,2,3.0
14173,91,296,4.5
14316,91,1274,5.0
14383,91,1968,3.0


In [59]:
userSubsetGroup = userSubsetGroup[0:100]

In [60]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
    
    #sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='outfitID')
    inputOutfit = inputOutfit.sort_values(by='outfitID')
    

    #Get the N (total similar movies watched) for the formula 
    nRatings = len(group)
    
    #Get the review scores for the movies that they both have in common
    temp_df = inputOutfit[inputOutfit['outfitID'].isin(group['outfitID'].tolist())]
    
 
    #store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    
    #put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    
    #calculate the pearson correlation between two users, so called, x and y

    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/np.sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

       


In [61]:
pearsonCorrelationDict.items()


dict_items([(91, 0.0), (177, -0.4688072309384956), (219, -0.526449732896663), (274, -0.626600514784503), (298, -0.17986335575472315), (414, -0.23440361546924784), (474, -0.4102063270711837), (477, 0.0), (480, -0.29417420270727607), (483, -0.8807048459279793), (599, -0.25556221638599463), (608, -0.40032038451271784), (50, -0.44486512077567225), (57, 0.5151515151515151), (68, 0.7715167498104595), (103, -0.8819171036881969), (135, 0.1259881576697424), (182, -0.4061811972299616), (202, 0.3779644730092272), (217, 0.8728715609439696), (226, -0.4879500364742666), (288, -0.2516299559794226), (307, -0.033277916281986085), (318, 0.0419960525565808), (322, -0.76539207448568), (330, 0.2320551167590648), (357, 0.18442777839082938), (434, -0.03683547343418786), (448, -0.2182178902359924), (469, -0.7035264706814485), (561, -0.8819171036881969), (600, -0.8664002254439634), (606, -0.7740339400213734), (610, -0.17407765595569785), (18, -0.5), (19, 0.3273268353539889), (21, 0), (45, -0.9878291611472653),

In [62]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.head()

Unnamed: 0,0
91,0.0
177,-0.468807
219,-0.52645
274,-0.626601
298,-0.179863


In [63]:
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.0,91
1,-0.468807,177
2,-0.52645,219
3,-0.626601,274
4,-0.179863,298


In [64]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
99,1.0,4
98,1.0,1
88,1.0,559
81,0.995871,484
96,0.960769,605


In [65]:
topUsersRating = topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,outfitID,rating
0,1.0,4,21,3.0
1,1.0,4,32,2.0
2,1.0,4,45,3.0
3,1.0,4,47,2.0
4,1.0,4,52,3.0


In [66]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,outfitID,rating,weightedRating
0,1.0,4,21,3.0,3.0
1,1.0,4,32,2.0,2.0
2,1.0,4,45,3.0,3.0
3,1.0,4,47,2.0,2.0
4,1.0,4,52,3.0,3.0


In [67]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('outfitID').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
outfitID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,13.453042,55.998242
2,9.74796,33.758295
3,5.043536,15.181155
4,0.0,0.0
5,1.804616,5.25704


In [68]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['outfitID'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,outfitID
outfitID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.162497,1
2,3.463114,2
3,3.010022,3
4,,4
5,2.913107,5


In [69]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,outfitID
outfitID,Unnamed: 1_level_1,Unnamed: 2_level_1
143355,81.561044,143355
91658,79.855626,91658
69526,75.412918,69526
109487,63.743717,109487
61132,54.14349,61132


In [70]:
recommendation_df.dtypes

weighted average recommendation score    float64
outfitID                                   int64
dtype: object

In [71]:
ratings_df.dtypes

userId        int64
outfitID      int64
rating      float64
dtype: object

In [72]:

outfits=ratings_df.loc[ratings_df['outfitID'].isin(recommendation_df['outfitID'].tolist())]

In [76]:
#Racommended top 10 outfits
outfits[["outfitID"]].head(10)


Unnamed: 0,outfitID
0,1
1,3
2,6
3,47
4,50
5,70
6,101
7,110
8,151
9,157
