# Linear Regression in Pandas

Testing basic linear and logistic regression with one-hot encoding (OHE) of some variables from the dataset.

### Setup

In [1]:
# Extension reloader to import a function again when re-running cell 
%load_ext autoreload
%autoreload 2

### Load Configuration

In [2]:
"""
Loads common configuration parameters
"""
from src.utils.configuration_manager import Config
from pathlib import Path, PurePath
from os import getcwd

# Look for config.ini in the current working directory and load it.
config_path = PurePath(getcwd()) / 'config.ini'
config = Config(config_path)

Loading configuration from: /home/justin/Code/interpretability_experiment/config.ini


### Start local Dask Client

In [3]:
from dask.distributed import Client, LocalCluster
# Reuse the client left over from a previous run of this cell, if there is one.
# On a fresh kernel `client` is undefined, and that NameError is the only
# condition that should lead to a new cluster being created. Catching just
# NameError lets genuine failures from restart() (and KeyboardInterrupt)
# surface instead of silently spawning a second cluster.
try:
    client
except NameError:
    cluster = LocalCluster(dashboard_address=':20100', memory_limit='4G')
    # Alternative without a memory limit:
    # cluster = LocalCluster(dashboard_address=':20100')
    print('Setting new client')
    client = Client(cluster)
    print(client)
else:
    print('Restarting client')
    client.restart()
client

Setting new client
<Client: 'tcp://127.0.0.1:42759' processes=5 threads=10, memory=20.00 GB>


0,1
Client  Scheduler: tcp://127.0.0.1:42759  Dashboard: http://127.0.0.1:20100/status,Cluster  Workers: 5  Cores: 10  Memory: 20.00 GB


## Prepare / Load dataset

In [4]:
from src.preprocessing.dataset_manager import DatasetManager
dataset_manager = DatasetManager(config)

In [5]:
%time dataset_manager.write_dataset(overwrite=False)

Not overwriting existing training and test sets
CPU times: user 498 µs, sys: 83 µs, total: 581 µs
Wall time: 381 µs


In [6]:
X_train, y_train = dataset_manager.get_training_set()
X_test, y_test = dataset_manager.get_test_set()

### Additional Preprocessing Step

In [7]:
categorical_columns_to_transform = ['payment_type']
datetime_columns_to_transform = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
# datetime_columns_to_transform = []

In [8]:
from dask_ml.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)

In [9]:
import src.preprocessing.preprocessing_pipelines as preprocessing

def preprocess(ddf):
    """
    Run the OHE preprocessing pipeline on ``ddf``.

    Uses the notebook-level ``encoder`` together with the notebook-level
    ``categorical_columns_to_transform`` and ``datetime_columns_to_transform``
    lists, and returns whatever ``ohe_preprocessing_pipeline`` returns.
    """
    column_options = dict(
        categorical_columns_to_transform=categorical_columns_to_transform,
        datetime_columns_to_transform=datetime_columns_to_transform,
    )
    return preprocessing.ohe_preprocessing_pipeline(encoder, ddf, **column_options)

# Rebind X_train / X_test to their preprocessed versions. Because the inputs are
# overwritten, re-running this cell feeds the already-preprocessed frames back
# into preprocess(); reload the datasets first if the cell has to be re-run.
X_train = preprocess(X_train)
X_test = preprocess(X_test)

In [10]:
input_columns = X_train.columns.values

In [11]:
X_train.head()

Unnamed: 0_level_0,passenger_count,trip_distance,fare_amount,extra,mta_tax,tolls_amount,improvement_surcharge,total_amount_wo_tip,payment_type_1,payment_type_2,...,tpep_dropoff_datetime_hourslot_21,tpep_dropoff_datetime_hourslot_22,tpep_dropoff_datetime_hourslot_23,tpep_dropoff_datetime_day_of_week_0,tpep_dropoff_datetime_day_of_week_1,tpep_dropoff_datetime_day_of_week_2,tpep_dropoff_datetime_day_of_week_3,tpep_dropoff_datetime_day_of_week_4,tpep_dropoff_datetime_day_of_week_5,tpep_dropoff_datetime_day_of_week_6
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,1.2,11.0,0.0,0.5,0.0,0.3,11.8,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,12.03,39.0,0.0,0.5,5.76,0.3,45.56,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.86,5.5,0.0,0.5,0.0,0.3,6.3,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,1.09,6.0,0.0,0.5,0.0,0.3,6.8,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2.0,1.32,10.0,0.0,0.5,0.0,0.3,10.8,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### Training Linear Regression with Pandas

In [12]:
target = 'tip_fraction'
partitions_to_concat = 20 
import src.preprocessing.dask_to_pandas as dtp

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso


import joblib

def train_linear_regression(X_ddf, y_ddf, target, partitions_to_concat, save_to):
    """
    In-memory training of a ridge-regularised linear regression.

    The inputs are converted with ``dtp.dask_Xy_to_df``, a ``Ridge`` estimator
    with default settings is fitted on the result, and the fitted estimator is
    persisted with ``joblib.dump`` before being returned.

    Parameters
    ----------
    X_ddf, y_ddf :
        Feature and label collections, forwarded unchanged to
        ``dtp.dask_Xy_to_df``.
    target :
        Target name, forwarded unchanged to ``dtp.dask_Xy_to_df``.
    partitions_to_concat :
        Forwarded unchanged to ``dtp.dask_Xy_to_df``.
    save_to :
        Destination handed to ``joblib.dump``. When it is a path (``str`` or
        ``pathlib`` object) its parent directory is created if missing.

    Returns
    -------
    The fitted ``Ridge`` estimator.
    """
    # Create the output directory before the fit so that a missing
    # directory cannot make the final dump fail after all the work is done.
    # Non-path destinations (e.g. open file objects) are left alone.
    if isinstance(save_to, (str, PurePath)):
        Path(save_to).parent.mkdir(parents=True, exist_ok=True)

    X, y = dtp.dask_Xy_to_df(X_ddf, y_ddf, target, partitions_to_concat)

    # Using ridge to penalize large weights
    estimator = Ridge()
    estimator.fit(X, y=y)

    joblib.dump(estimator, save_to)
    return estimator

In [14]:
linear_reg_save = 'trained_models/linr_estimator_w_cat.sav'
lr_estimator = train_linear_regression(X_train, y_train, target, partitions_to_concat, linear_reg_save)

### Test Linear Regression

In [15]:
import numpy as np
import eli5



In [16]:
def test_regression(X_ddf, y_ddf, target, estimator):
    """
    In memory testing of regression model
    """
    X, y = dtp.dask_Xy_to_df(X_ddf, y_ddf, target, partitions_to_concat)
    
    predictions = estimator.predict(X)
    print('MAE: ' + str(abs(predictions-y).mean(axis=0)))
    print('RMSE: ' + str(np.sqrt(((predictions-y)**2).mean(axis=0))))
    

In [17]:
# Normal linear regression
test_regression(X_test, y_test, target, lr_estimator)

MAE: 0.046468316938262644
RMSE: 0.5313604117005752


In [31]:
eli5.explain_weights(lr_estimator, feature_names=input_columns, top=(10,10))

Weight?,Feature
+0.171,<BIAS>
+0.166,payment_type_1
+0.061,tpep_pickup_datetime_day_of_week_6
+0.040,tpep_dropoff_datetime_day_of_week_0
+0.031,tpep_dropoff_datetime_day_of_week_3
+0.029,tpep_pickup_datetime_day_of_week_5
+0.023,tpep_pickup_datetime_hourslot_22
+0.022,tpep_pickup_datetime_hourslot_21
+0.021,tpep_dropoff_datetime_hourslot_5
+0.020,tpep_dropoff_datetime_hourslot_4




In [19]:
# TODO: resolve the bug that lets previous target values bleed into the training set.
# Debug aid for inspecting the leaked column:
# print(X_train.columns.values)
# Workaround until the underlying bug is fixed: drop the `target` column from X_train.
# Reading the notebook top to bottom, `target` is still 'tip_fraction' here (it is only
# reassigned to 'tipped' in a later cell). Only X_train is patched; X_test is not touched.
X_train = X_train.drop(columns = target, axis=1)

### Training Logistic Regression with Pandas

In [20]:
target = 'tipped'

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
def train_logistic_regression(X_ddf, y_ddf, target, partitions_to_concat, save_to,
                              **estimator_kwargs):
    """
    In-memory training of a logistic regression classifier.

    The inputs are converted with ``dtp.dask_Xy_to_df``, a
    ``LogisticRegression`` estimator is fitted on the result, and the fitted
    estimator is persisted with ``joblib.dump`` before being returned.

    Parameters
    ----------
    X_ddf, y_ddf :
        Feature and label collections, forwarded unchanged to
        ``dtp.dask_Xy_to_df``.
    target :
        Target name, forwarded unchanged to ``dtp.dask_Xy_to_df``.
    partitions_to_concat :
        Forwarded unchanged to ``dtp.dask_Xy_to_df``.
    save_to :
        Destination handed to ``joblib.dump``. When it is a path (``str`` or
        ``pathlib`` object) its parent directory is created if missing.
    **estimator_kwargs :
        Optional keyword arguments forwarded to ``LogisticRegression``. With
        none given the estimator uses its defaults, exactly as before. The
        solver can stop at its iteration limit with default settings, so this
        is the place to pass e.g. ``max_iter=1000`` or a different ``solver``.

    Returns
    -------
    The fitted ``LogisticRegression`` estimator.
    """
    # Create the output directory before the fit so that a missing
    # directory cannot make the final dump fail after all the work is done.
    # Non-path destinations (e.g. open file objects) are left alone.
    if isinstance(save_to, (str, PurePath)):
        Path(save_to).parent.mkdir(parents=True, exist_ok=True)

    X, y = dtp.dask_Xy_to_df(X_ddf, y_ddf, target, partitions_to_concat)

    estimator = LogisticRegression(**estimator_kwargs)
    estimator.fit(X, y=y)

    joblib.dump(estimator, save_to)
    return estimator

In [23]:
logistic_reg_save = 'trained_models/logr_estimator_w_cat.sav'
logr_estimator = train_logistic_regression(X_train, y_train, target, partitions_to_concat, logistic_reg_save)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Test Logistic Regression

In [24]:
from sklearn.metrics import classification_report
def test_classification(X_ddf, y_ddf, target, estimator):
    """
    In memory testing of linear regression 
    """
    X, y = dtp.dask_Xy_to_df(X_ddf, y_ddf, target, partitions_to_concat)
    
    predictions = estimator.predict(X)
    print(classification_report(predictions, y))

# TODO Find bug that is adding the target to X_train without removing it after

In [25]:
# NOTE(review): this scores the classifier on X_train / y_train, the same data it
# was fitted on above, so the report below is in-sample. X_test / y_test are not used here.
test_classification(X_train, y_train, target, logr_estimator)

              precision    recall  f1-score   support

       False       0.90      1.00      0.95   1908729
        True       1.00      0.95      0.98   4285976

    accuracy                           0.97   6194705
   macro avg       0.95      0.98      0.96   6194705
weighted avg       0.97      0.97      0.97   6194705



In [30]:
eli5.explain_weights(logr_estimator, feature_names=input_columns, top=(10,10))

Weight?,Feature
+6.526,payment_type_1
+0.265,total_amount_wo_tip
+0.240,mta_tax
+0.140,tpep_pickup_datetime_hourslot_20
+0.129,tpep_pickup_datetime_hourslot_21
+0.127,extra
+0.105,tpep_pickup_datetime_hourslot_22
+0.087,tpep_dropoff_datetime_hourslot_21
+0.082,tpep_pickup_datetime_hourslot_8
+0.068,tpep_pickup_datetime_hourslot_9
