In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [34]:
pd_data = pd.read_csv("google_review_ratings.csv")
pd_data = pd_data.iloc[: , :-1]
pd_data.set_index("User",inplace = True)

headers = ["churches","resorts","beaches","parks","theaters","museums","malls","zoo","restaurants","pubs/bars",
           "local_services","burger/pizza_shop","hotels/other_lodgings","juice bars","art_galleries","dance_clubs",
           "swimming_pools","gyms","bakeries","beauty_&_spas","cafes","view_points","monuments","gardens"]

pd_data.columns = headers

pd_data

Unnamed: 0_level_0,churches,resorts,beaches,parks,theaters,museums,malls,zoo,restaurants,pubs/bars,...,art_galleries,dance_clubs,swimming_pools,gyms,bakeries,beauty_&_spas,cafes,view_points,monuments,gardens
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
User 1,0.00,0.00,3.63,3.65,5.00,2.92,5.00,2.35,2.33,2.64,...,1.74,0.59,0.50,0.00,0.50,0.00,0.00,0.0,0.0,0.00
User 2,0.00,0.00,3.63,3.65,5.00,2.92,5.00,2.64,2.33,2.65,...,1.74,0.59,0.50,0.00,0.50,0.00,0.00,0.0,0.0,0.00
User 3,0.00,0.00,3.63,3.63,5.00,2.92,5.00,2.64,2.33,2.64,...,1.74,0.59,0.50,0.00,0.50,0.00,0.00,0.0,0.0,0.00
User 4,0.00,0.50,3.63,3.63,5.00,2.92,5.00,2.35,2.33,2.64,...,1.74,0.59,0.50,0.00,0.50,0.00,0.00,0.0,0.0,0.00
User 5,0.00,0.00,3.63,3.63,5.00,2.92,5.00,2.64,2.33,2.64,...,1.74,0.59,0.50,0.00,0.50,0.00,0.00,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
User 5452,0.91,5.00,4.00,2.79,2.77,2.57,2.43,1.09,1.77,1.04,...,5.00,0.66,0.65,0.66,0.69,5.00,1.05,5.0,5.0,1.56
User 5453,0.93,5.00,4.02,2.79,2.78,2.57,1.77,1.07,1.76,1.02,...,0.89,0.65,0.64,0.65,1.59,1.62,1.06,5.0,5.0,1.09
User 5454,0.94,5.00,4.03,2.80,2.78,2.57,1.75,1.05,1.75,1.00,...,0.87,0.65,0.63,0.64,0.74,5.00,1.07,5.0,5.0,1.11
User 5455,0.95,4.05,4.05,2.81,2.79,2.44,1.76,1.03,1.74,0.98,...,5.00,0.64,0.63,0.64,0.75,5.00,1.08,5.0,5.0,1.12


### pd_data.describe()

In [35]:
pd_data.isnull().sum()

churches                 0
resorts                  0
beaches                  0
parks                    0
theaters                 0
museums                  0
malls                    0
zoo                      0
restaurants              0
pubs/bars                0
local_services           0
burger/pizza_shop        1
hotels/other_lodgings    0
juice bars               0
art_galleries            0
dance_clubs              0
swimming_pools           0
gyms                     0
bakeries                 0
beauty_&_spas            0
cafes                    0
view_points              0
monuments                0
gardens                  1
dtype: int64

In [36]:
# replace the missing values with the mean
burger_pizza = pd_data["burger/pizza_shop"].astype("float64").mean()
gardens = pd_data["gardens"].astype("float64").mean()

# replace the missing value by the mean
pd_data["burger/pizza_shop"].replace(np.nan,burger_pizza,inplace = True)
pd_data["gardens"].replace(np.nan, gardens, inplace = True)


In [37]:
# the local services is object we have to convert it to float
pd_data.dtypes

churches                 float64
resorts                  float64
beaches                  float64
parks                    float64
theaters                 float64
museums                  float64
malls                    float64
zoo                      float64
restaurants              float64
pubs/bars                float64
local_services            object
burger/pizza_shop        float64
hotels/other_lodgings    float64
juice bars               float64
art_galleries            float64
dance_clubs              float64
swimming_pools           float64
gyms                     float64
bakeries                 float64
beauty_&_spas            float64
cafes                    float64
view_points              float64
monuments                float64
gardens                  float64
dtype: object

In [38]:
### check the duplication 

In [39]:
pd_data[pd_data.duplicated()]

Unnamed: 0_level_0,churches,resorts,beaches,parks,theaters,museums,malls,zoo,restaurants,pubs/bars,...,art_galleries,dance_clubs,swimming_pools,gyms,bakeries,beauty_&_spas,cafes,view_points,monuments,gardens
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
User 5,0.0,0.0,3.63,3.63,5.0,2.92,5.0,2.64,2.33,2.64,...,1.74,0.59,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0
User 675,0.0,0.0,1.45,1.43,1.42,1.43,1.48,1.48,2.72,2.73,...,5.0,0.81,0.8,5.0,0.92,0.0,0.0,0.0,0.0,0.0
User 1347,1.06,1.1,5.0,3.28,5.0,5.0,5.0,1.84,1.81,5.0,...,1.77,1.8,0.0,0.0,0.0,0.0,0.0,1.05,1.03,1.04


In [40]:
pd_data.drop_duplicates(inplace = True)

In [41]:
## when we tried to convert the local services type we got an error idicate that there are a string '2\t2.' on the columns
## let's check how much rows have that value

In [42]:
## good there is just one instance let's remove it
pd_data[pd_data["local_services"] == '2\t2.'] 

Unnamed: 0_level_0,churches,resorts,beaches,parks,theaters,museums,malls,zoo,restaurants,pubs/bars,...,art_galleries,dance_clubs,swimming_pools,gyms,bakeries,beauty_&_spas,cafes,view_points,monuments,gardens
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
User 2713,1.71,1.68,1.46,1.13,1.12,1.15,1.26,1.17,1.59,0.85,...,5.0,1.08,1.1,1.04,5.0,4.43,5.0,5.0,5.0,2.57


In [43]:
pd_data = pd_data[pd_data["local_services"] != '2\t2.']

In [44]:
pd_data[["local_services"]] = pd_data[["local_services"]].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_data[["local_services"]] = pd_data[["local_services"]].astype(float)


In [45]:
## now we have finished the data cleaning phase let's get into exploratory data analysis

In [46]:
pd_data.to_csv('cleand_data.csv')