In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the Google Places dataset from a CSV file (path is relative to the working directory).
df = pd.read_csv('dataframes/google_places_data.csv')

In [3]:
# First split: 70 % of the rows for training, 30 % held out.
train_df, test1_df = train_test_split(df, random_state=42, test_size=0.3)

# Second split of the held-out rows: 60 % test, 40 % validation.
test_df, val_df = train_test_split(test1_df, random_state=42, test_size=0.4)

# Feature engineering

In this part we will focus on:
* dealing with missing data
* normalizing numerical features
* selecting relevant columns
* adding new features

In [4]:
def check_for_missing(df):
    """Print the number of missing (null) values found in each column of ``df``.

    One line is printed per column, in column order; nothing is returned.
    """
    for col in df.columns:
        print("Missing values in column: ", col, "=", df[col].isnull().sum())

# Report the per-column missing-value counts for the training split.
check_for_missing(train_df)

Missing values in column:  business_id = 0
Missing values in column:  phone_number = 0
Missing values in column:  name = 0
Missing values in column:  full_address = 0
Missing values in column:  latitude = 5
Missing values in column:  longitude = 5
Missing values in column:  review_count = 0
Missing values in column:  rating = 0
Missing values in column:  timezone = 0
Missing values in column:  website = 0
Missing values in column:  place_id = 0
Missing values in column:  place_link = 0
Missing values in column:  types = 0
Missing values in column:  Friday = 0
Missing values in column:  Saturday = 0
Missing values in column:  Sunday = 0
Missing values in column:  Monday = 0
Missing values in column:  Tuesday = 0
Missing values in column:  Wednesday = 0
Missing values in column:  Thursday = 0
Missing values in column:  city = 0
Missing values in column:  verified = 0
Missing values in column:  state = 0
Missing values in column:  Friday_morning = 0
Missing values in column:  Friday_after

We have only 5 missing values in the columns latitude, longitude and geo_cluster,
but there are also "hidden" missing values, such as "Unknown" in website, phone_number, etc.

In [5]:
# Rows of the training split where at least one of the two coordinates is missing.
train_df[train_df[['latitude', 'longitude']].isna().any(axis=1)]

Unnamed: 0,business_id,phone_number,name,full_address,latitude,longitude,review_count,rating,timezone,website,...,Tuesday_afternoon,Tuesday_evening,Wednesday_morning,Wednesday_afternoon,Wednesday_evening,Thursday_morning,Thursday_afternoon,Thursday_evening,geo_cluster,country
5368,0x549175bad87e4499:0xe474185a7c812203,13608880460,IJB Painting Company,Unknown,,,32.0,4.4,America/Los_Angeles,Unknown,...,0,0,0,0,0,0,0,0,,United Arab Emirates
5355,0x80dd4a6f6bc3c0d1:0xa0dbcc9e52e67a7d,13105300994,"R Painting,License #752512",Unknown,,,32.0,4.4,America/Los_Angeles,Unknown,...,0,0,0,0,0,0,0,0,,United Arab Emirates
12398,0x886b5c66fee9323d:0x3aa2c26684beba68,18137481822,Election Roofing and Construction LLC,Unknown,,,1.0,4.0,America/New_York,https://election-roofing-and-construction-llc....,...,0,0,0,0,0,0,0,0,,United Arab Emirates
7683,0x3e5f43a21c43cf97:0x5ab0bd09eee41976,971524175130,S&Y Travel Agency,Unknown,,,32.0,4.4,Asia/Dubai,Unknown,...,0,0,0,0,0,0,0,0,,United Arab Emirates
8571,0xb2afb323ceda5e5:0x9616560c124aa170,971509014567,Luxury District Real Estate,Unknown,,,32.0,4.4,Asia/Dubai,Unknown,...,1,1,1,1,1,1,1,1,,United Arab Emirates


Observations with missing latitude and longitude also have a missing full address ("Unknown"), so we can't fill in the data. There are only 5 such rows, so let's delete these observations, since we want to use geospatial clustering, which requires geographical coordinates.

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class MissingGeographRemover(BaseEstimator, TransformerMixin):
    """Transformer that drops every row with a missing value in the given columns.

    Parameters
    ----------
    columns : column label(s)
        Columns checked for missing values; forwarded to ``DataFrame.dropna``
        as its ``subset`` argument.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` without the rows that have a null in any of ``self.columns``."""
        return X.dropna(axis=0, how='any', subset=self.columns)


# Single-step pipeline that removes rows lacking latitude or longitude.
missing_remover = Pipeline(
    steps=[
        (
            'missing_values_remover',
            MissingGeographRemover(columns=['latitude', 'longitude']),
        ),
    ]
)


Now let's extract some information from whether a business has a website and an official phone number. We will create two new binary features, Website_known and Phone_known: 0 if the website/phone is unknown, 1 if it is known.

In [7]:

class ContactKnownAdder(BaseEstimator, TransformerMixin):
    """Add binary ``Website_known`` and ``Phone_known`` columns.

    A value counts as unknown (flag 0) when it is missing or when its text,
    ignoring case and surrounding whitespace, equals ``"unknown"``; every other
    value gets flag 1.
    """

    # Lower-cased placeholder marking an unavailable contact detail.
    UNKNOWN_PLACEHOLDER = 'unknown'

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def _known_flag(self, values):
        """Return a 0/1 integer Series: 1 where ``values`` holds a real entry."""
        # astype(str) keeps the comparison working for non-string cells
        # (e.g. a phone column parsed as numbers); nulls are masked separately.
        text = values.astype(str).str.strip().str.lower()
        return (values.notna() & (text != self.UNKNOWN_PLACEHOLDER)).astype(int)

    def transform(self, X):
        """Add the two flag columns to ``X`` (modified in place) and return it."""
        X['Website_known'] = self._known_flag(X['website'])
        X['Phone_known'] = self._known_flag(X['phone_number'])
        return X


contact_known_adder = Pipeline(steps=[('new_contact_columns_adder', ContactKnownAdder())])

Now we will create groups from the 'types' column. We used ChatGPT to determine some of the most important and most frequent categories, and then we classify each row using regular expressions.

In [8]:
#checking for keywords in types column
import re
from collections import Counter

# Collect every word that appears in the distinct, non-null 'types' values.
keywords = [
    word
    for line in train_df['types'].unique()
    if pd.notna(line)
    for word in re.findall(r'\b\w+\b', line.lower())
]

# Word frequencies over the distinct 'types' strings (not over rows).
word_counts = Counter(keywords)
print(word_counts)


Counter({'store': 1771, 'service': 1181, 'shop': 622, 'restaurant': 619, 'dealer': 468, 'agency': 452, 'car': 424, 'repair': 302, 'clothing': 295, 'auto': 291, 'attorney': 289, 'clinic': 255, 'center': 253, 'tourist': 235, 'dental': 233, 'estate': 221, 'supplier': 218, 'attraction': 213, 'park': 207, 'jewelry': 205, 'real': 204, 'bar': 196, 'contractor': 181, 'used': 181, 's': 180, 'and': 179, 'furniture': 171, 'dentist': 170, 'pet': 165, 'hotel': 161, 'tour': 156, 'insurance': 149, 'museum': 144, 'company': 142, 'home': 134, 'designer': 124, 'school': 119, 'consultant': 119, 'law': 115, 'office': 109, 'medical': 109, 'salon': 108, 'church': 103, 'cafe': 98, 'bakery': 97, 'establishment': 97, 'rental': 96, 'supply': 95, 'coffee': 92, 'club': 91, 'flower': 91, 'beauty': 90, 'delivery': 89, 'painter': 88, 'maintenance': 82, 'venue': 79, 'firm': 78, 'art': 78, 'care': 77, 'hair': 75, 'health': 75, 'hospital': 74, 'accessories': 73, 'physical': 73, 'shoe': 72, 'cleaning': 72, 'interior': 6

In [9]:
# Determined categories. Order matters: the classifier returns the first entry
# that is found in the 'types' text.
# NOTE(review): "RealEstate" contains no space and "Jeweler" differs from
# "jewelry", so neither can match text such as "real estate" or
# "jewelry store" — confirm whether that is intended.
main_categories = [
    "Hotel", "Restaurant", "Shop", "Service", "Agency",
    "Dealer", "Repair", "Car", "Travel", "Fitness",
    "Health", "Dental", "RealEstate", "Office", "Tourist",
    "Attraction", "Park", "Attorney", "Bank", "Pharmacy",
    "Supermarket", "Bakery", "Barbershop", "Hospital", "Library",
    "Museum", "Cinema", "Store", "Jeweler", "Beauty",
]

In [15]:
class ClassifyType(BaseEstimator, TransformerMixin):
    """Add a ``main_category`` column derived from the ``types`` column.

    Each ``types`` value is assigned the first category whose name is found in
    it by a case-insensitive regex search. Values matching no category, and
    values that are not strings (e.g. NaN), are labelled ``'Other'``.

    Parameters
    ----------
    categories : sequence of str, optional
        Category names to search for, in priority order. When ``None`` (the
        default) the module-level ``main_categories`` list is used.
    """

    def __init__(self, categories=None):
        self.categories = categories

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Add ``main_category`` to ``X`` (modified in place) and return it."""
        categories = main_categories if self.categories is None else self.categories
        # Build the compiled patterns up front rather than per row.
        patterns = [
            (category, re.compile(category, re.IGNORECASE))
            for category in categories
        ]

        def classify(text):
            # Missing or non-text entries cannot be searched; treat as unmatched.
            if not isinstance(text, str):
                return 'Other'
            for category, pattern in patterns:
                if pattern.search(text):
                    return category
            return 'Other'

        X['main_category'] = X['types'].apply(classify)
        return X


main_category_adder = Pipeline(steps=[('add_grouped_category', ClassifyType())])

In [17]:
# Show the column labels of the full (unsplit) dataframe.
df.columns

Index(['business_id', 'phone_number', 'name', 'full_address', 'latitude',
       'longitude', 'review_count', 'rating', 'timezone', 'website',
       'place_id', 'place_link', 'types', 'Friday', 'Saturday', 'Sunday',
       'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'city', 'verified',
       'state', 'Friday_morning', 'Friday_afternoon', 'Friday_evening',
       'Saturday_morning', 'Saturday_afternoon', 'Saturday_evening',
       'Sunday_morning', 'Sunday_afternoon', 'Sunday_evening',
       'Monday_morning', 'Monday_afternoon', 'Monday_evening',
       'Tuesday_morning', 'Tuesday_afternoon', 'Tuesday_evening',
       'Wednesday_morning', 'Wednesday_afternoon', 'Wednesday_evening',
       'Thursday_morning', 'Thursday_afternoon', 'Thursday_evening',
       'geo_cluster', 'country'],
      dtype='object')

Now let's remove irrelevant columns such as business_id, phone_number and website, as well as geographical columns other than latitude and longitude, because they may contain incorrect data or be unique per row (we are not sure about 'name', so we leave it for now).

In [21]:
class IrrelevantColumnRemover(BaseEstimator, TransformerMixin):
    """Transformer that drops a fixed set of columns from a dataframe.

    Parameters
    ----------
    irrelevant_columns : column label(s)
        Columns to remove; forwarded to ``DataFrame.drop``.
    """

    def __init__(self, irrelevant_columns):
        self.irrelevant_columns = irrelevant_columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` without ``self.irrelevant_columns``."""
        return X.drop(labels=self.irrelevant_columns, axis=1)


# Identifier, contact, address and raw-type columns removed before modelling.
irrelevant_cols = [
    'business_id',
    'phone_number',
    'full_address',
    'website',
    'country',
    'city',
    'place_id',
    'place_link',
    'types',
]

noise_remover = Pipeline(
    steps=[
        ('remove_irrelevant_or_mistaken_columns', IrrelevantColumnRemover(irrelevant_cols)),
    ]
)

In [22]:
# make all transformations
pipes = [missing_remover, contact_known_adder, main_category_adder, noise_remover]

# Feed each pipeline the output of the previous one. Applying every pipeline to
# the raw train_df would discard the result of all steps but the last (for
# example, the rows dropped by missing_remover would come back).
# Work on a copy so the raw training split itself is left untouched.
preprocessed_train_df = train_df.copy()
for pipe in pipes:
    preprocessed_train_df = pipe.fit_transform(preprocessed_train_df)

# Sanity check: no row without coordinates may survive the preprocessing.
assert preprocessed_train_df[['latitude', 'longitude']].notna().all().all()

In [23]:
# Preview the first five rows of the preprocessed training data.
preprocessed_train_df.head(5)

Unnamed: 0,name,latitude,longitude,review_count,rating,timezone,Friday,Saturday,Sunday,Monday,...,Wednesday_morning,Wednesday_afternoon,Wednesday_evening,Thursday_morning,Thursday_afternoon,Thursday_evening,geo_cluster,Website_known,Phone_known,main_category
656,Lead Handyman Services,25.324796,55.417346,13.0,5.0,Asia/Dubai,8-11 AM,8 AM-6 PM,8 AM-6 PM,8 AM-6 AM,...,0,0,0,0,0,0,0.0,1,1,Service
3254,Sunil Ambalavelil - Lawyer and Legal Consultan...,25.18819,55.271341,6.0,5.0,Asia/Dubai,Hours not available,Hours not available,Hours not available,Hours not available,...,0,0,0,0,0,0,0.0,1,1,Attorney
8116,Action,52.186431,5.293758,966.0,4.1,Europe/Amsterdam,8:30 AM-9 PM,8:30 AM-6 PM,10 AM-6 PM,8:30 AM-8 PM,...,0,0,0,0,0,0,2.0,1,1,Shop
1187,LIFE Pharmacy - Hala 7,25.319895,55.385263,136.0,4.5,Asia/Dubai,Open 24 hours,Open 24 hours,Open 24 hours,Open 24 hours,...,1,1,1,1,1,1,0.0,1,1,Pharmacy
15104,درب لتاجير السيارات,25.384023,55.461907,149.0,4.9,Asia/Dubai,Open 24 hours,Open 24 hours,Open 24 hours,Open 24 hours,...,1,1,1,1,1,1,0.0,1,1,Service
