# Project: Geo-pulse - Prediction of number of interests

---

## 1. Importing Libraries

In [1]:
# General Library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sci-kit learn Library
# machine learning models
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# preprocess
from sklearn.model_selection import train_test_split

# metrics
from sklearn.metrics import mean_squared_error

sns.set_palette('colorblind')

In [2]:
df = pd.read_csv('../data/geo_cleaned.csv', index_col=[0])

In [3]:
df.head()

Unnamed: 0,area_code,hour,segment,density,prepaid,postpaid,gender_male,gender_female,gender_unknown,age_1_19,...,online_shopping_user,inte_travel_and_information,inte_investment,inte_realestate,inte_insurance,inte_automobile,inte_fitness_and_wellness,par_day,area_code_length,time_of_day
0,BAAACBIH,18,visitor,28,9,19,5,11,12,6,...,21,4,3,2,0,3,3,20201104,8,Afterwork
1,BAAACBIH,17,visitor,21,10,11,6,7,8,3,...,11,1,2,0,0,0,0,20201104,8,Work
2,BAAHEBJC,1,visitor,37,17,20,10,8,19,8,...,20,1,2,0,0,4,3,20201104,8,Sleep
3,BAEFBBAC,16,worker,43,21,22,11,9,23,5,...,24,3,3,4,3,5,4,20201104,8,Work
4,BAJKABJB,12,visitor,21,12,9,8,6,7,3,...,12,0,0,0,0,1,1,20201104,8,Work


---

## 2. Data Preparation

In [4]:
# drop unused columns
df.drop(columns=['area_code','par_day','area_code_length','hour'], 
        inplace = True)

In [5]:
# defining column groups
GENDER = df.columns[df.columns.str.contains('gender')]
AGE = df.columns[df.columns.str.contains('age')]
PAID = df.columns[df.columns.str.contains('paid')]
PLAN = df.columns[df.columns.str.contains('pay')]
USER = df.columns[df.columns.str.contains('user')]
INTE = df.columns[df.columns.str.contains('inte')]

col_cat = [GENDER,AGE,PAID,PLAN,USER,INTE]
col_cat_name = ['Gender','Age','Post/Pre Paid','Package Plan',
                'Usage','Interests']

---

## 3. Predict number of interests

In [6]:
def get_metrics(model,X_train=1,X_test=1,y_train=1,y_test=1):
    
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_r2 = model.score(X_train, y_train).round(4)
    test_r2 = model.score(X_test, y_test).round(4)
    train_rmse = mean_squared_error(y_train,y_train_pred).round(2)
    test_rmse = mean_squared_error(y_test,y_test_pred).round(2)
    
    return train_r2,test_r2,train_rmse,test_rmse

In [7]:
def predict_category(model_input,category,X_train=1, X_test=1,
                     y_train=1, y_test=1):
    
    results_dict = {}
    
    for col in category:
        
        model = model_input
        
        X = df.drop(columns=category)
        y = df[col]
        X = pd.get_dummies(X,columns=['segment','time_of_day'])

        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size = 0.2,
                                                        random_state=42)
        metrics = []

        train_r2,test_r2,train_rmse,test_rmse = \
        get_metrics(DummyRegressor(),X_train, X_test, y_train, y_test)
        metrics.extend([train_r2,test_r2,train_rmse,test_rmse])


        train_r2,test_r2,train_rmse,test_rmse = \
        get_metrics(model,X_train, X_test, y_train, y_test)
        metrics.extend([train_r2,test_r2,train_rmse,test_rmse])
    
        results_dict[col] = metrics
    
    return pd.DataFrame(results_dict,
                        index=['base_train_r2','base_test_r2','base_train_rmse',
                               'base_test_rmse','model_train_r2',
                               'model_test_r2','model_train_rmse', 
                               'model_test_rmse'])

### 3.1 Linear Regression

#### 3.1.1 Predict number of interests in certain grid

In [8]:
predict_category(LinearRegression(),INTE)

Unnamed: 0,inte_coffee_lover,inte_travel_and_information,inte_investment,inte_realestate,inte_insurance,inte_automobile,inte_fitness_and_wellness
base_train_r2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
base_test_r2,-0.0097,-0.0006,-0.0047,-0.0005,-0.0004,-0.0014,-0.0024
base_train_rmse,124.18,25.36,28.56,10.14,9.25,29.09,31.5
base_test_rmse,155.16,27.65,28.71,8.29,7.63,30.78,48.25
model_train_r2,0.9247,0.8519,0.8478,0.748,0.7449,0.8403,0.8058
model_test_r2,0.9247,0.7202,0.8136,0.5425,0.6971,0.8332,0.9117
model_train_rmse,9.35,3.75,4.35,2.56,2.36,4.65,6.12
model_test_rmse,11.58,7.73,5.33,3.79,2.31,5.13,4.25


### 3.1.2 Predict number of app users in certain grid

In [9]:
predict_category(LinearRegression(),USER)

Unnamed: 0,music_streamer_user,video_streaming_app_user,merchant_app_user,bank_app_user,fastfood_app_user,food_delivery_app_user,grocery_delivery_app_user,online_shopping_user
base_train_r2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
base_test_r2,-0.0019,-0.0017,-0.0024,-0.0024,-0.0,-0.002,-0.0008,-0.0052
base_train_rmse,59.96,325.2,3.34,503.57,4.61,226.18,87.58,741.48
base_test_rmse,183.31,537.37,4.28,549.31,4.67,270.82,80.95,1088.88
model_train_r2,0.9024,0.9644,0.5732,0.9701,0.6445,0.9488,0.9102,0.9825
model_test_r2,0.7261,0.948,0.5863,0.9345,0.6306,0.9483,0.8494,0.9855
model_train_rmse,5.85,11.57,1.43,15.08,1.64,11.58,7.86,12.94
model_test_rmse,50.12,27.89,1.77,35.88,1.73,13.96,12.18,15.67


### 3.2 XGBoost

#### 3.2.1 Predict number of interests in certain grid

In [10]:
predict_category(XGBRegressor(),INTE)

Unnamed: 0,inte_coffee_lover,inte_travel_and_information,inte_investment,inte_realestate,inte_insurance,inte_automobile,inte_fitness_and_wellness
base_train_r2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
base_test_r2,-0.0097,-0.0006,-0.0047,-0.0005,-0.0004,-0.0014,-0.0024
base_train_rmse,124.18,25.36,28.56,10.14,9.25,29.09,31.5
base_test_rmse,155.16,27.65,28.71,8.29,7.63,30.78,48.25
model_train_r2,1.0,0.9999,0.9999,0.9999,0.9999,0.9999,0.9999
model_test_r2,0.8689,0.8042,0.7729,0.6142,0.5834,0.7942,0.6672
model_train_rmse,0.0,0.0,0.0,0.0,0.0,0.0,0.0
model_test_rmse,20.15,5.41,6.49,3.2,3.18,6.33,16.02


#### 3.2.2 Predict number of app users in certain grid

In [11]:
predict_category(XGBRegressor(),USER)

Unnamed: 0,music_streamer_user,video_streaming_app_user,merchant_app_user,bank_app_user,fastfood_app_user,food_delivery_app_user,grocery_delivery_app_user,online_shopping_user
base_train_r2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
base_test_r2,-0.0019,-0.0017,-0.0024,-0.0024,-0.0,-0.002,-0.0008,-0.0052
base_train_rmse,59.96,325.2,3.34,503.57,4.61,226.18,87.58,741.48
base_test_rmse,183.31,537.37,4.28,549.31,4.67,270.82,80.95,1088.88
model_train_r2,0.9999,1.0,0.9998,1.0,0.9998,1.0,1.0,1.0
model_test_r2,0.4682,0.7764,0.5124,0.9431,0.4926,0.8897,0.8605,0.8715
model_train_rmse,0.0,0.01,0.0,0.01,0.0,0.01,0.0,0.0
model_test_rmse,97.31,119.96,2.08,31.21,2.37,29.8,11.29,139.18
