In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('dataframes/google_places_data.csv')

In [5]:
train_df, test1_df = train_test_split(df, test_size=0.3, random_state=42)

test_df, val_df= train_test_split(test1_df, test_size=0.4, random_state=42)

# Feature engineering

In this part we will focus on:
* dealing with missing data
* normalizing numerical features
* selecting relevant columns
* adding new features

In [6]:
def check_for_missing(df):
    for column in df.columns:
        missing_count = df[column].isnull().sum()
        print("Missing values in column: ", column, "=", missing_count)

check_for_missing(train_df)

Missing values in column:  business_id = 0
Missing values in column:  phone_number = 0
Missing values in column:  name = 0
Missing values in column:  full_address = 0
Missing values in column:  latitude = 5
Missing values in column:  longitude = 5
Missing values in column:  review_count = 0
Missing values in column:  rating = 0
Missing values in column:  timezone = 0
Missing values in column:  website = 0
Missing values in column:  place_id = 0
Missing values in column:  place_link = 0
Missing values in column:  types = 0
Missing values in column:  Friday = 0
Missing values in column:  Saturday = 0
Missing values in column:  Sunday = 0
Missing values in column:  Monday = 0
Missing values in column:  Tuesday = 0
Missing values in column:  Wednesday = 0
Missing values in column:  Thursday = 0
Missing values in column:  city = 0
Missing values in column:  verified = 0
Missing values in column:  state = 0
Missing values in column:  Friday_morning = 0
Missing values in column:  Friday_after

We have only 5 missing values in columns latitude, longitude, geo_cluster, 
but there are also "hidden" missing values like "Unknown" in website, phone number etc

In [8]:
train_df[train_df['latitude'].isna() | train_df['longitude'].isna()]

Unnamed: 0,business_id,phone_number,name,full_address,latitude,longitude,review_count,rating,timezone,website,place_id,place_link,types,Friday,Saturday,Sunday,Monday,Tuesday,Wednesday,Thursday,city,verified,state,Friday_morning,Friday_afternoon,Friday_evening,Saturday_morning,Saturday_afternoon,Saturday_evening,Sunday_morning,Sunday_afternoon,Sunday_evening,Monday_morning,Monday_afternoon,Monday_evening,Tuesday_morning,Tuesday_afternoon,Tuesday_evening,Wednesday_morning,Wednesday_afternoon,Wednesday_evening,Thursday_morning,Thursday_afternoon,Thursday_evening,geo_cluster,country
5368,0x549175bad87e4499:0xe474185a7c812203,13608880460,IJB Painting Company,Unknown,,,32.0,4.4,America/Los_Angeles,Unknown,ChIJmUR-2Lp1kVQRAyKBfFoYdOQ,https://www.google.com/maps/place/data=!3m1!4b...,"Painter, Service establishment, Painting",8 AM-5 PM,Closed,Closed,8 AM-5 PM,8 AM-5 PM,8 AM-5 PM,8 AM-5 PM,Dubai,True,Closed ⋅ Opens 8 AM Mon,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,United Arab Emirates
5355,0x80dd4a6f6bc3c0d1:0xa0dbcc9e52e67a7d,13105300994,"R Painting,License #752512",Unknown,,,32.0,4.4,America/Los_Angeles,Unknown,ChIJ0cDDa29K3YARfXrmUp7M26A,https://www.google.com/maps/place/data=!3m1!4b...,"Painter, Service establishment",9 AM-5 PM,10 AM-3 PM,Closed,9 AM-5 AM,9 AM-5 PM,9 AM-5 PM,9 AM-5 PM,Dubai,True,Closed ⋅ Opens 9 AM Mon,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,United Arab Emirates
12398,0x886b5c66fee9323d:0x3aa2c26684beba68,18137481822,Election Roofing and Construction LLC,Unknown,,,1.0,4.0,America/New_York,https://election-roofing-and-construction-llc....,ChIJPTLp_mZca4gRaLq-hGbCojo,https://www.google.com/maps/place/data=!3m1!4b...,"Roofing contractor, Service establishment",7 AM-6 PM,10 AM-4:30 PM,Closed,7 AM-6 PM,7 AM-6 PM,7 AM-6 PM,7 AM-6 PM,Dubai,True,Closed ⋅ Opens 7 AM Mon,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,United Arab Emirates
7683,0x3e5f43a21c43cf97:0x5ab0bd09eee41976,971524175130,S&Y Travel Agency,Unknown,,,32.0,4.4,Asia/Dubai,Unknown,ChIJl89DHKJDXz4Rdhnk7gm9sFo,https://www.google.com/maps/place/data=!3m1!4b...,"Travel agency, Service establishment",9 AM-10 PM,9 AM-10 PM,9 AM-10 PM,9 AM-10 PM,9 AM-10 PM,9 AM-10 PM,9 AM-10 PM,Dubai,True,Open ⋅ Closes 10 PM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,United Arab Emirates
8571,0xb2afb323ceda5e5:0x9616560c124aa170,971509014567,Luxury District Real Estate,Unknown,,,32.0,4.4,Asia/Dubai,Unknown,ChIJ5aXtPDL7KgsRcKFKEgxWFpY,https://www.google.com/maps/place/data=!3m1!4b...,"Real estate agency, Service establishment",Open 24 hours,Open 24 hours,9 AM-2 PM,Open 24 hours,Open 24 hours,Open 24 hours,Open 24 hours,Dubai,True,Closed ⋅ Opens 12 AM Mon,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,,United Arab Emirates


Observations with missing latitude and longitude have also missing full address so we can't fill in the data. This is only 5 rows so let's delete this obseravtions, as we will want to use geospacial clusterich for which we need geographical coordinates

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class MissingGeographRemover(BaseEstimator,TransformerMixin ): #removing rows with missing location
    def __init__(self, columns):
        self.columns=columns

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.dropna(subset=self.columns)
missing_remover = Pipeline(steps=[
    ('missing_values_remover', MissingGeographRemover(columns=['latitude', 'longitude']))
])


Now let's try to get some more information from the fact that a certain business has website and official phone or not. We will create a new features website_known, phone_known with values 0-1, 0 if the website/phone is unknown or 1 if known

In [16]:

class ContactKnownAdder(BaseEstimator, TransformerMixin):
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        X['Website_known'] = X['website'].apply(lambda x: 0 if x.lower()=='uknown' else 1)
        X['Phone_known'] = X['phone_number'].apply(lambda x: 0 if x.lower()=='uknown' else 1)
        return X
    
contact_known_adder = Pipeline(steps=[ ('new_contact_columns_adder', ContactKnownAdder())])

In [19]:
pipes = [missing_remover, contact_known_adder]

for pipe in pipes:
    pipe.fit(train_df)
    preprocessed_train_df = pipe.transform(train_df)

In [21]:
preprocessed_train_df.shape

(10642, 48)