In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import warnings

In [2]:
df = pd.read_csv('../dataset/messy_data.csv')

In [3]:
df.head()

Unnamed: 0,id,event_date,age,income,gender,region,purchases,category,churn
0,1,2023-04-13,34.0,,Female,East,2.0,D,0
1,2,2023-12-15,40.0,54770.0,Male,East,2.0,,1
2,3,2023-09-28,,85399.0,Female,East,2.0,B,0
3,4,2023-04-17,44.0,52703.0,Male,,,B,1
4,5,2023-03-13,,53504.0,,North,7.0,B,0


## QUICK EDA

In [4]:
print(f"{df.info()} \n \n")
print(f"dataframe shape : {df.shape} \n \n")
print("percentage of missing entries per column")
print(df.isna().mean())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          1500 non-null   int64  
 1   event_date  1500 non-null   object 
 2   age         1405 non-null   float64
 3   income      1396 non-null   float64
 4   gender      1379 non-null   object 
 5   region      1414 non-null   object 
 6   purchases   1397 non-null   float64
 7   category    1396 non-null   object 
 8   churn       1500 non-null   int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 105.6+ KB
None 
 

dataframe shape : (1500, 9) 
 

percentage of missing entries per column
id            0.000000
event_date    0.000000
age           0.063333
income        0.069333
gender        0.080667
region        0.057333
purchases     0.068667
category      0.069333
churn         0.000000
dtype: float64


## Fix dates & basic schema

In [5]:
df['event_date'] = pd.to_datetime(df['event_date'], errors='coerce')

In [6]:
df['event_date'].head()

0   2023-04-13
1   2023-12-15
2   2023-09-28
3   2023-04-17
4   2023-03-13
Name: event_date, dtype: datetime64[ns]

## Feature engineering from dates 
#### create features : event_month , event_dayofweek and recency_days , dataset max_date 

In [7]:
max_date = df['event_date'].max()
df['event_month'] = df['event_date'].dt.month
df['event_dayOfWeek'] = df['event_date'].dt.dayofweek

In [8]:
df['recency_days'] = (max_date - df['event_date']).dt.days

In [9]:
df.head()

Unnamed: 0,id,event_date,age,income,gender,region,purchases,category,churn,event_month,event_dayOfWeek,recency_days
0,1,2023-04-13,34.0,,Female,East,2.0,D,0,4.0,3.0,297.0
1,2,2023-12-15,40.0,54770.0,Male,East,2.0,,1,12.0,4.0,51.0
2,3,2023-09-28,,85399.0,Female,East,2.0,B,0,9.0,3.0,129.0
3,4,2023-04-17,44.0,52703.0,Male,,,B,1,4.0,0.0,293.0
4,5,2023-03-13,,53504.0,,North,7.0,B,0,3.0,0.0,328.0


## Split features and target 
#### churn will be the target

In [10]:
target = 'churn'
feature_cols = [c for c in df.columns if c not in target]
X = df[features].copy()
y = df[target].copy()

NameError: name 'features' is not defined

In [None]:
numeric_features = [col for col in df.columns if (df[col].dtype == 'int64' or 
                   df[col].dtype == 'float64') and col not in target] 
numeric_features

In [None]:
date_col = [col for col in df.columns if df[col].dtype == 'datetime64[ns]'] 
date_col

In [None]:
categorical_features = [col for col in df.columns 
                        if col not in date_col 
                        and col not in numeric_features 
                        and col not in target] 
categorical_features

In [None]:
X.drop('event_date',axis=1,inplace=True)

In [None]:
X.drop('id',axis=1,inplace=True)

### Outliers exploration using z-score 

In [None]:
from scipy import stats

numX = X[numeric_features].copy()
z = np.abs(stats.zscore(numX , nan_policy='omit'))

outlier_mask = (z > 3).any(axis=1) 
outlier_rate = outlier_mask.mean()
print(f"{outlier_rate * 100} % of the record are outliers")

## Pipeline Preprocessing
    impute missing values (KNN for numeric and constant for categorical) 
    Scale numeric features
    one-hot encode categoricals

In [None]:
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline 

In [None]:
X_train , X_test , y_train, y_test = train_test_split(X,y,random_state=42, test_size=0.2)

numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=3)) , 
    ('scaler',StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')) , 
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
]) 
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer , numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

## Outlier handling by capping 

In [None]:
from sklearn.base import BaseEstimator , TransformerMixin 

class Winsorizer(BaseEstimator, TransformerMixin) : 
    def __init__(self, quantile_low=0.01 , quantile_high = 0.99) : 
        self.quantile_low = quantile_low,
        self.quantile_high = quantile_high

    def fit(self, X, y=None) : 
        import pandas as pd 
        X = pd.DataFrame(X) 
        self.lows_ = X.quantile(self.quantile_low) 
        self.highs_ = X.quantile(self.quantile_high) 
        return self

    def transform (self,X) : 
        import pandas as pd 
        X = pd.DataFrame(X).clip(lower=self.lows_ , upper=self.highs_ , axis= 1) 
        return X.values

## Train model with preprocessing pipeline

In [None]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score , roc_auc_score , classification_report

clf = Pipeline(steps=[
    ('preprocess',preprocessor),
    ('model',LogisticRegression(max_iter=200))
])

clf.fit(X_train,y_train)
prediction = clf.predict(y_test) 

#churn_probability = clf.predict_proba(X_test)[:,1]