In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import pathlib

from omegaconf import OmegaConf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from src.utils.transform import (HourExtractor, WeekendExtractor, 
    FeatureCrosser, FringeCategoryBucketer)

In [3]:
# Load the project configuration. Later cells read data paths, file names,
# the target column and the input column groups from this object.
# The path is relative, so it is resolved against the kernel's working directory.
cfg = OmegaConf.load('../config.yaml')

In [4]:
# Locate the processed dataset: the directory and file name both come from
# the config, and the directory is taken relative to the parent folder.
processed_data_dir = pathlib.Path('..') / cfg.paths.data.processed
processed_file = processed_data_dir.joinpath(cfg.files.processed_dataset)

# Read the processed dataset into memory with pandas' default CSV parsing.
df = pd.read_csv(processed_file)

In [5]:
# Summarise the loaded frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38182 entries, 0 to 38181
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   lat              33983 non-null  float64
 1   long             33983 non-null  float64
 2   download_mbit    38182 non-null  float64
 3   upload_mbit      38182 non-null  float64
 4   ping_ms          38180 non-null  float64
 5   lte_rsrp         23828 non-null  float64
 6   signal_strength  1335 non-null   float64
 7   platform         38182 non-null  object 
 8   network_name     24556 non-null  object 
 9   time_utc         38182 non-null  object 
 10  cat_technology   38182 non-null  object 
dtypes: float64(7), object(4)
memory usage: 3.2+ MB


## Train Test Split

In [6]:
# Split features from the target. Working on a copy keeps `df` intact;
# pop() removes the target column from X and hands it back as y.
X = df.copy()
y = X.pop(cfg.target)

# Hold out one third of the rows for testing; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=42,
)
# Optional further split of the hold-out set into validation/test (currently disabled):
#X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=1/3, random_state=42)

## Preprocessing pipeline

### Feature prep for date field

In [7]:
# Peek at the raw timestamp column of the training split as plain strings,
# before any datetime features are derived from it.
X_train['time_utc'].values.astype(str)

array(['2022-03-09 15:32:26', '2022-03-20 17:14:43',
       '2022-02-27 10:33:45', ..., '2022-03-27 08:23:57',
       '2022-02-24 23:33:53', '2022-03-10 11:56:44'], dtype='<U19')

In [9]:
# Try the project's HourExtractor on the raw training timestamps.
# The displayed result is a single-column array holding one hour value per row.
dt = HourExtractor()
hours = dt.fit_transform(X_train['time_utc'].values)
hours

array([[15],
       [17],
       [10],
       ...,
       [ 8],
       [23],
       [11]])

In [8]:
# Try the project's WeekendExtractor on the same raw training timestamps.
# The displayed result is a single-column array of 0.0/1.0 flags
# (presumably 1.0 marks a weekend timestamp -- confirm against the transformer).
# Note: `dt` is rebound here to a different transformer type than in the hour cell.
dt = WeekendExtractor()
weekend_yn = dt.fit_transform(X_train['time_utc'].values)
weekend_yn

array([[0.],
       [1.],
       [1.],
       ...,
       [1.],
       [0.],
       [0.]])

In [10]:
# Cross the weekend flag with the hour into one string feature per row,
# joined by '/'. This relies on `weekend_yn` and `hours` already being
# defined by the two extractor cells, so both must have been run first.
dt = FeatureCrosser(sep='/')
dt.fit_transform(np.column_stack([weekend_yn, hours]))

array([['0/15'],
       ['1/17'],
       ['1/10'],
       ...,
       ['1/8'],
       ['0/23'],
       ['0/11']], dtype='<U4')

### Feature prep for category with high cardinality

In [11]:
# Count how often each network name occurs, to gauge the column's
# cardinality and how many categories are rare.
df.network_name.value_counts()

3 AT             6156
A1               5037
Magenta-T-       2639
spusu            1093
T-Mobile          891
                 ... 
mobily              1
NOVA IS             1
NIGERTELECOMS       1
ntel                1
SYMA                1
Name: network_name, Length: 456, dtype: int64

In [12]:
# Show the first 20 raw network names as a reference for the bucketing step.
df.network_name.values[:20]

array(['INDOSATOOREDOO', 'I WIND', 'Red Bull MOBILE ', '3 AT',
       'Red Bull MOBILE ', '3 AT', 'UNEFON 4G', 'spusu', 'Djezzy',
       'Verizon ', 'A1', 'INDOSATOOREDOO', 'o2 - de+', 'o2 - de+', 'A1',
       'Globe Telecom-PH', 'HoT', '3 AT', 'A1', 'spusu'], dtype=object)

In [13]:
# Apply the project's FringeCategoryBucketer with its default settings and
# show the first 20 results next to the raw values printed earlier.
# In the displayed sample, some names are replaced by the label 'other'.
# NOTE(review): this demo fits on the full `df`, whereas the datetime demos
# use `X_train` -- fine for illustration, but confirm this is intended.
fcb = FringeCategoryBucketer()
fcb.fit_transform(df.network_name.values)[:20]

array(['other', 'other', 'other', '3 AT', 'other', '3 AT', 'other',
       'spusu', 'other', 'other', 'A1', 'other', 'other', 'other', 'A1',
       'other', 'HoT', '3 AT', 'A1', 'spusu'], dtype='<U10')

In [17]:
# Full transformer pipeline
# The column groups each branch operates on come from the `inputs` section of the config.
numeric_columns = list(cfg.inputs.numerics)
categorical_columns = list(cfg.inputs.categories)
datetime_columns = list(cfg.inputs.datetimes)

# Numeric branch: standardise, then fill missing values with the median.
# NOTE(review): scaling runs before imputation here -- confirm the order is intended.
numeric_transformer = make_pipeline(
    StandardScaler(),
    SimpleImputer(strategy='median'),
)

# Categorical branch: bucket categories with the project transformer, then
# one-hot encode; categories not seen during fit are ignored at transform time.
categorical_transformer = make_pipeline(
    FringeCategoryBucketer(),
    OneHotEncoder(handle_unknown='ignore'),
)

# Datetime branches: both first fill missing timestamps with the most
# frequent value, then each derives one feature from the same column(s).
weekend_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    WeekendExtractor(),
)
hour_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    HourExtractor(),
)

# The order of this list fixes the column order of the transformed output.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerics', numeric_transformer, numeric_columns),
        ('categories', categorical_transformer, categorical_columns),
        ('weekend', weekend_transformer, datetime_columns),
        ('hour', hour_transformer, datetime_columns),
    ]
)

# Fit on the training split only and display the transformed matrix as a frame.
X_train_prepared = preprocessor.fit_transform(X_train)
pd.DataFrame(X_train_prepared)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.303936,0.048569,1.441792,-0.773054,-0.527348,0.413262,-0.176575,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0
1,0.290371,0.029523,-0.700020,-0.830673,-0.040330,-0.101191,-0.176575,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,17.0
2,0.300700,0.048317,2.982593,0.273978,-0.530163,2.471073,-0.176575,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,10.0
3,0.303769,0.049294,-0.664913,-0.792403,-0.452747,-0.101191,-0.176575,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,20.0
4,-0.590368,-4.541050,-0.631824,-0.451348,3.647494,-0.615644,-0.176575,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25449,0.290371,0.029523,-0.068669,-0.891811,0.027233,-0.529902,-0.176575,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9.0
25450,0.287359,0.048444,0.062341,0.755801,-0.383776,0.841972,-0.176575,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,16.0
25451,0.198171,0.017963,-0.606810,-0.580087,-0.390814,-0.101191,-0.176575,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,8.0
25452,0.809236,-0.161661,-0.699030,-0.783038,-0.178271,-0.358418,-0.176575,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,23.0
