# Simple Approaches to Recommender Systems
## 2. Making Recommendations Based on Correlation

In [1]:
# import libraries
import numpy as np
import pandas as pd

These datasets are hosted on: https://archive.ics.uci.edu/ml/datasets/Restaurant+%26+consumer+data

They were originally published by: Blanca Vargas-Govea, Juan Gabriel GonzÃ¡lez-Serna, Rafael Ponce-MedellÃ­n. Effects of relevant contextual features in the performance of a restaurant recommender system. In RecSysâ€™11: Workshop on Context Aware Recommender Systems (CARS-2011), Chicago, IL, USA, October 23, 2011.

In [2]:
# import data
frame =  pd.read_csv('rating_final.csv')
cuisine = pd.read_csv('chefmozcuisine.csv')
geodata = pd.read_csv('geoplaces.csv')


In [3]:
# explore data 
frame.head()

Unnamed: 0,userID,placeID,rating,food_rating,service_rating
0,U1077,135085,2,2,2
1,U1077,135038,2,2,1
2,U1077,132825,2,2,2
3,U1077,135060,1,2,2
4,U1068,135104,1,1,2


In [4]:
# explore data
geodata.head()

Unnamed: 0,placeID,name,country
0,134999,Kiku Cuernavaca,Mexico
1,132825,puesto de tacos,mexico
2,135106,El Rincn de San Francisco,Mexico
3,132667,little pizza Emilio Portes Gil,Mexico
4,132613,carnitas_mata,mexico


In [5]:
# explore data
places =  geodata[['placeID', 'name']]
places.head()

Unnamed: 0,placeID,name
0,134999,Kiku Cuernavaca
1,132825,puesto de tacos
2,135106,El Rincn de San Francisco
3,132667,little pizza Emilio Portes Gil
4,132613,carnitas_mata


In [6]:
# explore data
cuisine.head()

Unnamed: 0,placeID,Rcuisine
0,135110,Spanish
1,135109,Italian
2,135107,Latin_American
3,135106,Mexican
4,135105,Fast_Food


## Grouping and Ranking Data

In [8]:
# rating = pandas dataframe having 2 columns 'placeID' and 'rating' (mean of rating given to each place)
rating = pd.DataFrame(frame.groupby('placeID')['rating'].mean())
rating.head(10)

Unnamed: 0_level_0,rating
placeID,Unnamed: 1_level_1
132560,0.5
132561,0.75
132564,1.25
132572,1.0
132583,1.0
132584,1.333333
132594,0.6
132608,1.0
132609,0.6
132613,1.166667


In [11]:
# rating = pandas dataframe having 3 columns 'placeID', 'rating', 'rating_count' (how many review each place got)
rating['rating_count'] = pd.DataFrame(frame.groupby('placeID')['rating'].count())
rating

Unnamed: 0_level_0,rating,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
132560,0.500000,4
132561,0.750000,4
132564,1.250000,4
132572,1.000000,15
132583,1.000000,4
132584,1.333333,6
132594,0.600000,5
132608,1.000000,6
132609,0.600000,5
132613,1.166667,6


In [12]:
# describe rating
rating.describe()
# mean of rating is 1.18

Unnamed: 0,rating,rating_count
count,130.0,130.0
mean,1.179622,8.930769
std,0.349354,6.124279
min,0.25,3.0
25%,1.0,5.0
50%,1.181818,7.0
75%,1.4,11.0
max,2.0,36.0


In [13]:
# rating.sort_values of 'rating_count' and display in decending order
rating.sort_values('rating_count', ascending=False)

Unnamed: 0_level_0,rating,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
135085,1.333333,36
132825,1.281250,32
135032,1.178571,28
135052,1.280000,25
132834,1.000000,25
135038,1.208333,24
135060,1.136364,22
135062,1.238095,21
135042,1.250000,20
132862,1.388889,18


In [14]:
# display detail of places where 'placeID'=135085
places[places['placeID'] == 135085]

Unnamed: 0,placeID,name
121,135085,Tortas Locas Hipocampo


In [16]:
# display detail of cuisine where 'placeID'=135085
cuisine[cuisine['placeID'] == 135085]

Unnamed: 0,placeID,Rcuisine
44,135085,Fast_Food


## Preparing Data For Analysis

In [17]:
places_crosstab = pd.pivot_table(data=frame, values='rating', index='userID', columns='placeID')
places_crosstab.head()

placeID,132560,132561,132564,132572,132583,132584,132594,132608,132609,132613,...,135080,135081,135082,135085,135086,135088,135104,135106,135108,135109
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U1001,,,,,,,,,,,...,,,,0.0,,,,,,
U1002,,,,,,,,,,,...,,,,1.0,,,,1.0,,
U1003,,,,,,,,,,,...,2.0,,,,,,,,,
U1004,,,,,,,,,,,...,,,,,,,,2.0,,
U1005,,,,,,,,,,,...,,,,,,,,,,


In [23]:
Tortas_ratings = places_crosstab[135085]
print(Tortas_ratings)
#Tortas_ratings[Tortas_ratings>=0]

userID
U1001    0.0
U1002    1.0
U1003    NaN
U1004    NaN
U1005    NaN
U1006    NaN
U1007    1.0
U1008    NaN
U1009    NaN
U1010    NaN
U1011    NaN
U1012    NaN
U1013    1.0
U1014    NaN
U1015    NaN
U1016    2.0
U1017    NaN
U1018    NaN
U1019    NaN
U1020    NaN
U1021    NaN
U1022    NaN
U1023    NaN
U1024    NaN
U1025    NaN
U1026    NaN
U1027    1.0
U1028    NaN
U1029    1.0
U1030    NaN
        ... 
U1109    2.0
U1110    NaN
U1111    NaN
U1112    NaN
U1113    1.0
U1114    NaN
U1115    NaN
U1116    2.0
U1117    NaN
U1118    NaN
U1119    NaN
U1120    0.0
U1121    NaN
U1122    2.0
U1123    NaN
U1124    NaN
U1125    NaN
U1126    NaN
U1127    NaN
U1128    NaN
U1129    NaN
U1130    NaN
U1131    NaN
U1132    2.0
U1133    NaN
U1134    2.0
U1135    0.0
U1136    NaN
U1137    2.0
U1138    NaN
Name: 135085, Length: 138, dtype: float64


## Evaluating Similarity Based on Correlation

In [21]:
similar_to_Tortas = places_crosstab.corrwith(Tortas_ratings)
corr_Tortas = pd.DataFrame(similar_to_Tortas, columns=['PearsonR'])
corr_Tortas.dropna(inplace=True)
corr_Tortas.head()

  c = cov(x, y, rowvar)
  c *= 1. / np.float64(fact)


Unnamed: 0_level_0,PearsonR
placeID,Unnamed: 1_level_1
132572,-0.428571
132723,0.301511
132754,0.930261
132825,0.700745
132834,0.814823


In [24]:
Tortas_corr_summary = corr_Tortas.join(rating['rating_count'])
Tortas_corr_summary[Tortas_corr_summary['rating_count']>=10].sort_values('PearsonR', ascending=False).head(10)

Unnamed: 0_level_0,PearsonR,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
135076,1.0,13
135085,1.0,36
135066,1.0,12
132754,0.930261,13
135045,0.912871,13
135062,0.898933,21
135028,0.892218,15
135042,0.881409,20
135046,0.867722,11
132872,0.840168,12


In [30]:
places_corr_Tortas = pd.DataFrame([135076,135085, 132754, 135045, 135062, 135028, 135042, 135046], index = np.arange(8), columns=['placeID'])
summary = pd.merge(places_corr_Tortas, cuisine,on='placeID')
summary

Unnamed: 0,placeID,Rcuisine
0,135085,Fast_Food
1,132754,Mexican
2,135028,Mexican
3,135042,Chinese
4,135046,Fast_Food


In [28]:
places[places['placeID']==135076]

Unnamed: 0,placeID,name
13,135076,Restaurante Pueblo Bonito


In [27]:
cuisine['Rcuisine'].describe()

count         916
unique         59
top       Mexican
freq          239
Name: Rcuisine, dtype: object