In [1]:
import json
from pathlib import Path

import pandas as pd
from loguru import logger

In [2]:
df_raw = pd.read_csv(filepath_or_buffer='../data/bronze/telecom_churn.csv')
logger.info(f"Data loaded with shape: {df_raw.shape}")
# The log line above reports the (rows, columns) shape of the bronze-layer
# extract; inspect df_raw.columns separately if the column names are needed.

[32m2024-10-10 23:16:50.367[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mData loaded with shape: (3333, 11)[0m


In [3]:
# Load the dataset metadata from the JSON file stored in the bronze folder.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default locale encoding.
with open('../data/bronze/telecom_churn_metadata.json', encoding='utf-8') as f:
    metadata = json.load(f)

# Field entries of the first record set in the metadata.
col_desc_list = metadata['recordSet'][0]['field']
# Map each field's name to its description.
col_desc_dict = {el['name']: el['description'] for el in col_desc_list}

In [4]:
# Look up a column's description by its name, e.g. for 'DayMins'.
col_desc_dict['DayMins']

'average daytime minutes per month'

In [5]:
# Move the label to the last position so that the other columns come first.
label_col = 'Churn'
columns = list(df_raw.columns)
columns.remove(label_col)
columns = columns + [label_col]
df_raw = df_raw[columns]

# Keep only the columns whose share of missing values is exactly zero.
# NOTE(review): the share is measured on the full frame, i.e. before the
# train/inference split performed further down -- confirm this is intended.
missing_share = df_raw.isnull().mean()
retained_cols = df_raw.columns[missing_share.eq(0)]


In [6]:
# Hold out 20% of the raw rows as the inference set; the remainder is used for
# training and testing further down.
from sklearn.model_selection import train_test_split

# Stratify through `label_col` instead of repeating the literal column name,
# so the label is defined in exactly one place.
df_train_raw, df_inference_raw = train_test_split(
    df_raw,
    test_size=0.2,
    random_state=99,
    stratify=df_raw[label_col],
)

In [7]:
# Export the held-out inference split (df_inference_raw) as CSV.
# to_csv does not create missing directories, so make sure the target folder
# exists; this keeps a fresh checkout runnable from top to bottom.
Path('../data/inference').mkdir(parents=True, exist_ok=True)
df_inference_raw.to_csv('../data/inference/df_inference_raw.csv', index=False)

## 2. Cleaning and Featurization

The following steps must be reproduced whenever a request arrives as raw data:
- Remove the columns that were dropped in the initial steps
- Scale the numerical values with the saved scaler object

In [8]:
# Restrict the training split to the columns that have no missing values.
df_train_cleaned = df_train_raw.loc[:, retained_cols]

# Feature columns are every retained column apart from the label.
feat_cols = list(df_train_cleaned.columns)
feat_cols.remove(label_col)

In [9]:
# Shape (rows, columns) of the training split after the column filter.
df_train_cleaned.shape

(2666, 11)

### Featurize the numerical cols

In [10]:
# Split first and preprocess afterwards: fitting a transformation before this
# point would leak information from the test rows into training.

from sklearn.model_selection import train_test_split

stratify_labels = df_train_cleaned[label_col]
df_train, df_test = train_test_split(
    df_train_cleaned,
    test_size=0.2,
    random_state=99,
    stratify=stratify_labels,
)

In [11]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training feature columns only; the label column is
# not part of feat_cols and is therefore left untouched.
scaler = StandardScaler()
scaler.fit(df_train[feat_cols])

# Work on a copy so that df_train keeps its unscaled values.
df_train_featurized = df_train.copy()
df_train_featurized[feat_cols] = scaler.transform(df_train[feat_cols])

In [16]:
# Reuse the already-fitted scaler: calling transform (never fit) on the test
# split keeps test statistics out of the scaling parameters.
scaled_test_values = scaler.transform(df_test[feat_cols])
df_test_featurized = df_test.copy()
df_test_featurized[feat_cols] = scaled_test_values

### Export it to Gold layer

In [18]:
# Write both featurized splits to the gold layer as parquet files.
# to_parquet does not create missing directories, so ensure the folder exists
# before writing.
Path('../data/gold').mkdir(parents=True, exist_ok=True)
df_train_featurized.to_parquet('../data/gold/telcom_churn_train_featurized.parquet', index=False)
df_test_featurized.to_parquet('../data/gold/telcom_churn_test_featurized.parquet', index=False)

## 3. Persisting the artifacts from data cleaning and featurization

They are important for the service later, which will apply the same logic to the raw input data (from `df_inference_raw`).

In [19]:
import joblib

# Neither open(..., 'w') nor joblib.dump creates parent directories, so make
# sure the artifact folder exists before writing into it.
Path('../services/models').mkdir(parents=True, exist_ok=True)

# Names of the columns that had no missing values (retained_cols).
with open('../services/models/retained_cols.json', 'w') as f:
    json.dump(retained_cols.tolist(), f)

# Names of the feature columns, i.e. the columns the scaler was fitted on.
with open('../services/models/feat_cols.json', 'w') as f:
    json.dump(feat_cols, f)

# Persist the fitted scaler. joblib.dump is the cell's last expression, so the
# list of written file names it returns is shown as the cell output.
joblib.dump(scaler, '../services/models/scaler.pkl')




['../services/models/scaler.pkl']