In [3]:
import numpy as np
import pandas as pd

In [4]:
ratings_df = pd.read_csv('ratings.csv')

In [5]:
ratings_df.head()

Unnamed: 0,userId,outfitID,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [6]:
userInput = [
            {'outfitID':5, 'rating':5},
            {'outfitID':2, 'rating':3.5},
            {'outfitID':296, 'rating':2},
            {'outfitID':1274, 'rating':5},
            {'outfitID':1938, 'rating':4.5}
         ] 
inputOutfit = pd.DataFrame(userInput)
inputOutfit

Unnamed: 0,outfitID,rating
0,5,5.0
1,2,3.5
2,296,2.0
3,1274,5.0
4,1938,4.5


In [7]:
inputOutfit.dtypes

outfitID      int64
rating      float64
dtype: object

In [8]:
userSubset = ratings_df[ratings_df['outfitID'].isin(inputOutfit['outfitID'].tolist())]
userSubset.head()

Unnamed: 0,userId,outfitID,rating
16,1,296,3.0
320,4,296,1.0
533,5,296,5.0
560,6,2,4.0
563,6,5,5.0


In [9]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

In [10]:
userSubsetGroup.get_group(4)

Unnamed: 0,userId,outfitID,rating
320,4,296,1.0


In [11]:
len(userSubsetGroup.get_group(4))

1

In [12]:
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)

In [13]:
userSubsetGroup[0]

(474,
        userId  outfitID  rating
 73093     474         2     3.0
 73094     474         5     1.5
 73172     474       296     4.0
 73466     474      1274     2.0
 73618     474      1938     4.0)

In [14]:
userSubsetGroup[0][0]

474

In [15]:
userSubsetGroup[0][1]

Unnamed: 0,userId,outfitID,rating
73093,474,2,3.0
73094,474,5,1.5
73172,474,296,4.0
73466,474,1274,2.0
73618,474,1938,4.0


In [16]:
userSubsetGroup = userSubsetGroup[0:100]

In [17]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
    
    #sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='outfitID')
    inputOutfit = inputOutfit.sort_values(by='outfitID')
    

    #Get the N (total similar movies watched) for the formula 
    nRatings = len(group)
    
    #Get the review scores for the movies that they both have in common
    temp_df = inputOutfit[inputOutfit['outfitID'].isin(group['outfitID'].tolist())]
    
 
    #store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    
    #put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    
    #calculate the pearson correlation between two users, so called, x and y

    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/np.sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

       


In [18]:
pearsonCorrelationDict.items()


dict_items([(474, -0.6880209161537812), (177, -0.8315218406202999), (414, -0.674199862463242), (483, -0.9198662110077999), (600, -0.8664002254439634), (6, 0.981980506061966), (68, 0.0), (91, 0.24019223070763082), (103, -0.8660254037844402), (107, 0.5), (117, -0.8660254037844402), (219, -0.8660254037844386), (274, -0.6546536707079778), (288, -0.8660254037844386), (298, -0.11470786693528089), (318, 0.3273268353539889), (353, -1.0), (434, 0.0), (448, -0.866025403784439), (458, -0.8660254037844402), (470, 0), (477, 0.5), (480, 0.0), (489, -0.9607689228305225), (561, -0.9819805060619667), (590, -0.944911182523068), (599, 0.0), (604, -0.866025403784439), (608, -0.32732683535398865), (8, 0), (18, -1.0), (21, 0), (23, 1.0), (43, 1.0), (45, -1.0), (50, 0), (57, 0), (58, -1.0), (62, -1.0), (66, -1.0), (84, -1.0), (94, 0), (105, -1.0), (121, -1.0), (122, -1.0), (125, -1.0), (135, -1.0), (140, -1.0), (144, -1.0), (149, 0), (153, -1.0), (160, -1.0), (169, 1.0), (170, 1.0), (181, 1.0), (182, 0), (18

In [19]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.head()

Unnamed: 0,0
474,-0.688021
177,-0.831522
414,-0.6742
483,-0.919866
600,-0.8664


In [20]:
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,-0.688021,474
1,-0.831522,177
2,-0.6742,414
3,-0.919866,483
4,-0.8664,600


In [21]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
66,1.0,240
90,1.0,426
52,1.0,169
97,1.0,476
53,1.0,170


In [22]:
topUsersRating = topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,outfitID,rating
0,1.0,240,1,5.0
1,1.0,240,2,5.0
2,1.0,240,3,4.0
3,1.0,240,10,3.0
4,1.0,240,16,3.0


In [23]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,outfitID,rating,weightedRating
0,1.0,240,1,5.0,5.0
1,1.0,240,2,5.0,5.0
2,1.0,240,3,4.0,4.0
3,1.0,240,10,3.0,3.0
4,1.0,240,16,3.0,3.0


In [24]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('outfitID').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
outfitID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-1.380366,-4.632579
2,-1.803109,1.442261
3,1.12257,9.118772
4,0.981981,2.945942
5,-1.076393,4.941014


In [25]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['outfitID'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,outfitID
outfitID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.356052,1
2,-0.799875,2
3,8.123122,3
4,3.0,4
5,-4.590345,5


In [26]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,outfitID
outfitID,Unnamed: 1_level_1,Unnamed: 2_level_1
8529,109.304456,8529
2840,58.495454,2840
54256,52.34994,54256
160,39.685979,160
2431,35.889246,2431


In [27]:
recommendation_df.dtypes

weighted average recommendation score    float64
outfitID                                   int64
dtype: object

In [28]:
ratings_df.dtypes

userId        int64
outfitID      int64
rating      float64
dtype: object

In [29]:

outfits=ratings_df.loc[ratings_df['outfitID'].isin(recommendation_df['outfitID'].tolist())]

In [30]:
#Racommended top 10 outfits
outfits[["outfitID"]].head(10)


Unnamed: 0,outfitID
0,1
1,3
2,6
3,47
4,50
5,70
6,101
7,110
8,151
9,157
