In [4]:
import pandas as pd
import numpy as np
from random import sample 
from scipy.sparse import issparse
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [4]:
# Need to first import the demolib file
from demolib import mapdata
from demolib import getRecommendations_UU
from demolib import getRecommendations_II
from demolib import sparsity
from demolib import predictRatings
from demolib import getitemsimsmatrix
from demolib import pearsonsim
from demolib import cosinesim
from demolib import euclidsim
from demolib import wtavg
from demolib import makeRatingsMatrix

In [5]:
file = "df_CA_Summarization_filtered.csv"
ratings_dfFull = pd.read_csv(file)


In [11]:
ratings_dfFull.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'gPlusUserId', 'closed', 'gps', 'name',
       'phone', 'price', 'gPlusPlaceId', 'rating', 'reviewText', 'reviewCount',
       'userCount', 'loc', 'cleaned_review', 'cleaned_summary',
       'cleaned_education', 'cleaned_address', 'cleaned_hours',
       'cleaned_categories', 'cleaned_jobs', 'sentiment_VADER'],
      dtype='object')

In [6]:
ratings_df=ratings_dfFull[['gPlusUserId','name','rating']]
ratings_df.columns = ['user_id','item_id','rating']
print(ratings_df.shape)
ratings_df[0:6]

(954, 3)


Unnamed: 0,user_id,item_id,rating
0,1.002152e+20,Tamarine,5.0
1,1.019804e+20,BAR CRUDO,5.0
2,1.029165e+20,Bar Agricole,5.0
3,1.060633e+20,Cat and Fiddle Pub & Restaurant,4.0
4,1.074869e+20,Cafe Flore,4.0
5,1.038638e+20,Stout Burgers & Beers,5.0


In [19]:
def mapdata(ratings_df):
  ratings_df["item_id"] = ratings_df["item_id"].astype(str)
  ratings_df["user_id"] = ratings_df["user_id"].astype(str)
  ratings_df["rating"]  = ratings_df["rating"].values.astype(np.float32)
  user_ids = np.sort(ratings_df["user_id"].unique()).tolist()
  umap = {x: i for i, x in enumerate(user_ids)}
  item_ids = np.sort(ratings_df["item_id"].unique()).tolist()
  imap = {x: i for i, x in enumerate(item_ids)}
  ratings_df["user_id"] = ratings_df["user_id"].map(umap) # swap userid for user index
  ratings_df["item_id"] = ratings_df["item_id"].map(imap) # swap itemid for item index
  return ratings_df, umap, imap

In [20]:
# define function to convert the rating events into a (dense) ratings matrix
def makeRatingsMatrix(ratings_df):
  ratings_df, umap, imap = mapdata(ratings_df)
  ratmatrix = pd.pivot_table(ratings_df, index=['user_id'], columns=['item_id'], values=['rating'],aggfunc=[np.mean]).values
  return ratmatrix, umap, imap

In [21]:
# create the (dense) ratings matrix 
ratmatrix, umap, imap = makeRatingsMatrix(ratings_df)
print("user map=", umap)
print("item map=", imap)
print("mapped rating events=\n",ratings_df[0:5])  # to show that the ratings events have been mapped
print("ratings matrix=\n",ratmatrix)

user map= {'100027298100000000000': 0, '100064964300000000000': 1, '100071173100000000000': 2, '100149874600000000000': 3, '100167615300000000000': 4, '100187800700000000000': 5, '100188791600000000000': 6, '100191427200000000000': 7, '100195583100000000000': 8, '100203282600000000000': 9, '100215158900000000000': 10, '100219691400000000000': 11, '100226390700000000000': 12, '100247204500000000000': 13, '100330736800000000000': 14, '100334069800000000000': 15, '100343120700000000000': 16, '100365917800000000000': 17, '100392313900000000000': 18, '100399962200000000000': 19, '100433794300000000000': 20, '100445685400000000000': 21, '100452823500000000000': 22, '100462082800000000000': 23, '100462360000000000000': 24, '100487661000000000000': 25, '100496607900000000000': 26, '100508952600000000000': 27, '100516137400000000000': 28, '100516882800000000000': 29, '100555162400000000000': 30, '100587872100000000000': 31, '100641666600000000000': 32, '100643056300000000000': 33, '100646294100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [27]:
# select a specific user and get their ratings
targetname = "100215158900000000000"
targetrats = ratmatrix[umap[targetname],] 
targetrats

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na

In [7]:
# Find the number users with different reviews
numberCountUser=ratings_df.groupby(["user_id"])["user_id"].count().rename('count')
countDf=ratings_df[["user_id"]]
countDf["count"]=numberCountUser

#numberCountUser.columns=["UserId","Count"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [8]:
countDf=pd.DataFrame(numberCountUser)

In [9]:
countDf.sort_values(by='count',ascending=False)

Unnamed: 0_level_0,count
user_id,Unnamed: 1_level_1
1.104490e+20,7
1.088672e+20,7
1.138995e+20,6
1.111152e+20,6
1.096830e+20,6
...,...
1.052848e+20,1
1.052965e+20,1
1.053083e+20,1
1.053103e+20,1


In [10]:
countDf[countDf["count"]>1].count()

count    94
dtype: int64