# Linear Regression in Pandas

Testing basic linear and logistic regression with OHE of some variables from the dataset.

### Setup

In [19]:
# Extension reloader to import a function again when re-running cell 
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Configuration

In [30]:
"""
Loads common configuration parameters
"""
from project.src.utils.configuration_manager import Config
from pathlib import Path, PurePath
parent_dir = Path().resolve().parent

config_path = PurePath(parent_dir, 'config.ini')
config = Config(config_path)

Loading configuration from: /home/justin/Code/ran_arcd/project/config.ini
raw_input: /home/justin/Data/Tigo_Colombia_2018111800-2018111823.csv
input_path: /home/justin/Code/ran_arcd/project/data/interim/tigo_parquet
figures_dir: /home/justin/Code/ran_arcd/project/figures
preprocessed_dir: /home/justin/Code/ran_arcd/project/data/preprocessed
train_data: /home/justin/Code/ran_arcd/project/data/preprocessed/X_train
train_target: /home/justin/Code/ran_arcd/project/data/preprocessed/y_train
test_data: /home/justin/Code/ran_arcd/project/data/preprocessed/X_test
test_target: /home/justin/Code/ran_arcd/project/data/preprocessed/y_test


### Start local Dask Client

In [31]:
from dask.distributed import Client, LocalCluster
try:
    if client:
        print('Restarting client')
        client.restart()
except:
    cluster = LocalCluster(dashboard_address=':20100', memory_limit='4G')
#     cluster = LocalCluster(dashboard_address=':20100')
    print('Setting new client')
    client = Client(cluster)
    print(client)
client

Restarting client


0,1
Client  Scheduler: tcp://127.0.0.1:45265  Dashboard: http://127.0.0.1:20100/status,Cluster  Workers: 5  Cores: 10  Memory: 20.00 GB


## Prepare / Load dataset

In [45]:
from project.src.preprocessing.dataset_manager import DatasetManager
dataset_manager = DatasetManager(config)

In [46]:
%time dataset_manager.write_dataset(test_size =0.5, overwrite=False)

Not overwriting existing training and test sets
CPU times: user 1.46 ms, sys: 73 µs, total: 1.54 ms
Wall time: 835 µs


In [47]:
X_train, y_train = dataset_manager.get_training_set()
X_test, y_test = dataset_manager.get_test_set()

### Additional Preprocessing Step

In [48]:
categorical_columns_to_transform = ['HourSlot', 'Service']
#                                     'EstablishmentCause',
#                                     'Service',
#                                     'EndSubRegion',
#                                     'HourSlot',
#                                     'StartCellName']
# datetime_columns_to_transform = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
datetime_columns_to_transform = []

In [49]:
from dask_ml.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)

In [50]:
import project.src.preprocessing.preprocessing_pipelines as preprocessing

def preprocess(ddf):
    return preprocessing.ohe_preprocessing_pipeline(encoder, 
                                     ddf, 
                                     categorical_columns_to_transform = categorical_columns_to_transform, 
                                     datetime_columns_to_transform = datetime_columns_to_transform)

X_train = preprocess(X_train)
X_test = preprocess(X_test)

In [51]:
input_columns = X_train.columns.values

In [52]:
print(len(input_columns))

46


In [53]:
X_train.head()

Unnamed: 0_level_0,Duration,StartPropagationDelay,UL_MaxDataBitrate,DL_MaxDataBitrate,UL_ReqMaxDataBitrate,DL_ReqMaxDataBitrate,UL_AssMaxDataBitrate,DL_AssMaxDataBitrate,HourSlot_2018111800,HourSlot_2018111801,...,Service_HSUPA,Service_R99,Service_Voice,Service_Voice + Data,Service_Attach / Detach,Service_LAU / RAU,Service_SMS,Service_Undefined,Service_Video,Service_Emergency
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,128789350.0,240.0,,,8640000.0,8640000.0,64000.0,8640000.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,110225980.0,480.0,,,8640000.0,8640000.0,5440000.0,8640000.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,172245410.0,480.0,,,1024000.0,4096000.0,64000.0,3600000.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,125018890.0,240.0,,,8640000.0,8640000.0,64000.0,8640000.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
13,144124890.0,0.0,,,1024000.0,4096000.0,64000.0,3600000.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Subset Assignment 

In [80]:
# Subset assignment in order to ensure this fits into memory given the size increase caused by OHE 
partitions_to_concat = 100
import project.src.preprocessing.dask_to_pandas as dtp
import joblib
import eli5



### Training Linear Regression with Pandas

In [81]:
target = 'some_continuous_target'

In [82]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso


import joblib

def train_linear_regression(X_ddf, y_ddf, target, partitions_to_concat, save_to):
    """
    In memory training of linear regression
    """
    X, y = dtp.dask_Xy_to_df(X_ddf, y_ddf, target, partitions_to_concat)
    
    # Using ridge to penalize large weights
    estimator = Ridge()
#     estimator = LinearRegression(normalize=True)
#     estimator = Lasso()
    estimator.fit(X, y=y)
    
    joblib.dump(estimator, save_to)
    return estimator

In [None]:
linear_reg_filename = 'linr_estimator_w_cat.sav'
linear_reg_save = Path(config.models_directory, linear_reg_filename)

lr_estimator = train_linear_regression(X_train, y_train, target, partitions_to_concat, linear_reg_save)

### Test Linear Regression

In [84]:
import numpy as np
import eli5

In [45]:
def test_regression(X_ddf, y_ddf, target, estimator):
    """
    In memory testing of regression model
    """
    X, y = dtp.dask_Xy_to_df(X_ddf, y_ddf, target, partitions_to_concat)
    
    predictions = estimator.predict(X)
    print('MAE: ' + str(abs(predictions-y).mean(axis=0)))
    print('RMSE: ' + str(np.sqrt(((predictions-y)**2).mean(axis=0))))
    

In [46]:
# Normal linear regression
test_regression(X_test, y_test, target, lr_estimator)

MAE: 0.046468316938262644
RMSE: 0.5313604117005752


In [47]:
eli5.explain_weights(lr_estimator, feature_names=input_columns, top=(10,10))

Weight?,Feature
+0.171,<BIAS>
+0.166,payment_type_1
+0.061,tpep_pickup_datetime_day_of_week_6
+0.040,tpep_dropoff_datetime_day_of_week_0
+0.031,tpep_dropoff_datetime_day_of_week_3
+0.029,tpep_pickup_datetime_day_of_week_5
+0.023,tpep_pickup_datetime_hourslot_22
+0.022,tpep_pickup_datetime_hourslot_21
+0.021,tpep_dropoff_datetime_hourslot_5
+0.020,tpep_dropoff_datetime_hourslot_4


In [48]:
# TODO resolve bug that has previous targets values bleed into training set
# print(X_train.columns.values)
# Patch the underlying bug
X_train = X_train.drop(columns = target, axis=1)

### Training Logistic Regression with Pandas

In [85]:
target = 'Status'

In [86]:
from sklearn.linear_model import LogisticRegression

In [92]:
def train_logistic_regression(X_ddf, y_ddf, target, partitions_to_concat, save_to):
    """
    In memory training of linear regression
    """
    X, y = dtp.dask_Xy_to_df(X_ddf, y_ddf, target, partitions_to_concat)
    
    X.fillna(0, inplace=True)
    
    estimator = LogisticRegression(class_weight="balanced", solver="liblinear", random_state=42)
    estimator.fit(X, y=y)
    
    joblib.dump(estimator, save_to)
    return estimator

In [93]:
logr_reg_filename = 'logr_estimator_w_cat.sav'
logistic_reg_save = Path(config.models_directory, logr_reg_filename)

logr_estimator = train_logistic_regression(X_train, y_train, target, partitions_to_concat, logistic_reg_save)

### Test Logistic Regression

In [94]:
from sklearn.metrics import classification_report
def test_classification(X_ddf, y_ddf, target, estimator):
    """
    In memory testing of linear regression 
    """
    X, y = dtp.dask_Xy_to_df(X_ddf, y_ddf, target, partitions_to_concat)
    X.fillna(0, inplace=True)
    
    predictions = estimator.predict(X)
    print(classification_report(predictions, y))

# TODO Find bug that is adding the target to X_train without removing it after

In [95]:
test_classification(X_train, y_train, target, logr_estimator)

                precision    recall  f1-score   support

       Blocked       0.40      0.99      0.57    197990
       Dropped       0.09      0.01      0.02   1054681
Non-progressed       0.00      0.00      0.00        14
        Normal       0.85      0.95      0.90   4511880

      accuracy                           0.78   5764565
     macro avg       0.34      0.49      0.37   5764565
  weighted avg       0.70      0.78      0.73   5764565



In [97]:
eli5.explain_weights(logr_estimator, feature_names=input_columns)#, top=(10,10))

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+0.000,DL_ReqMaxDataBitrate,,
+0.000,Service_Undefined,,
+0.000,<BIAS>,,
+0.000,Duration,,
+0.000,Service_HSPA+,,
+0.000,HourSlot_2018111814,,
+0.000,HourSlot_2018111818,,
+0.000,HourSlot_2018111813,,
+0.000,HourSlot_2018111817,,
+0.000,HourSlot_2018111816,,

Weight?,Feature
+0.000,DL_ReqMaxDataBitrate
+0.000,Service_Undefined
+0.000,<BIAS>
+0.000,Duration
+0.000,Service_HSPA+
+0.000,HourSlot_2018111814
+0.000,HourSlot_2018111818
+0.000,HourSlot_2018111813
+0.000,HourSlot_2018111817
+0.000,HourSlot_2018111816

Weight?,Feature
+0.000,StartPropagationDelay
+0.000,UL_ReqMaxDataBitrate
+0.000,Duration
+0.000,DL_AssMaxDataBitrate
+0.000,UL_AssMaxDataBitrate
… 3 more positive …,… 3 more positive …
… 18 more negative …,… 18 more negative …
-0.000,Service_R99
-0.000,Service_HSPA+
-0.000,HourSlot_2018111818

Weight?,Feature
+0.000,UL_ReqMaxDataBitrate
+0.000,Service_Voice + Data
+0.000,Service_Voice
+0.000,DL_AssMaxDataBitrate
+0.000,Duration
… 5 more positive …,… 5 more positive …
… 16 more negative …,… 16 more negative …
-0.000,Service_HSDPA
-0.000,Service_R99
-0.000,HourSlot_2018111817

Weight?,Feature
+0.000,DL_ReqMaxDataBitrate
+0.000,DL_AssMaxDataBitrate
+0.000,UL_AssMaxDataBitrate
+0.000,Service_LAU / RAU
+0.000,Service_HSPA
+0.000,Service_HSPA+
… 13 more positive …,… 13 more positive …
… 8 more negative …,… 8 more negative …
-0.000,HourSlot_2018111811
-0.000,HourSlot_2018111816
