In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import log_loss
from sklearn.pipeline import make_pipeline
from preprocess import reduceCategories
from preprocess import drop_ColumnsNan
from preprocess import classify_vars
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from tableone  import TableOne
import Models
from Models import GridSearchLogisticRegression

## Modeling
from sklearn import linear_model
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Assessment
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, confusion_matrix, precision_score, recall_score, roc_auc_score


In [2]:
#########################################
# Data to test the pipeline           ###
#########################################
def data(size=25, seed=0):
    """Build a small synthetic churn dataset used to exercise the pipeline.

    The frame mixes numeric columns (``income``, ``age``, ``time``) with
    categorical ones (``sex``, ``country``, ``program``, ``work``) and a binary
    ``churn`` label. Missing values are injected into ``age``, ``country`` and
    ``program`` so that the imputation steps have something to do.

    Parameters
    ----------
    size : int, default 25
        Number of rows to generate. Must be at least 1.
    seed : int, default 0
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        global legacy generator, exactly as the original hard-coded version did.

    Returns
    -------
    pandas.DataFrame
        ``size`` rows with columns ``income, age, sex, country, program, time,
        work, churn`` (in that order).

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be a positive integer")

    # The order of the random draws below is significant: changing it changes
    # every value generated after the reordered call for a given seed.
    np.random.seed(seed)
    income = np.random.normal(130, 22, size)
    age = np.random.normal(35, 10, size)
    # Positions are drawn with replacement, so *up to* 4 ages become missing.
    age[np.random.randint(0, size, 4)] = np.nan
    sex = np.random.choice(['F', 'M'], size=size, p=[0.5, 0.5])
    country = np.random.choice(
        ['Europe', 'Africa', 'Latin', 'USA', 'other'],
        size=size,
        p=[0.45, 0.3, 0.15, 0.05, 0.05],
    )
    # Fixed-width string arrays cannot hold NaN; switch to object dtype first.
    country = country.astype(object)
    country[np.random.randint(0, size, 8)] = np.nan
    program = np.random.choice(
        ['A', 'B', 'C', 'D', 'E', 'F'],
        size=size,
        p=[0.4, 0.3, 0.05, 0.05, 0.1, 0.1],
    )
    program = program.astype(object)
    program[np.random.randint(0, size, 2)] = np.nan
    work = np.random.choice(['yes', 'not'], size=size, p=[0.7, 0.3])
    time = np.random.exponential(22, size)
    churn = np.random.choice(['Churn', 'stay'], size=size, p=[0.20, 0.80])

    return pd.DataFrame({
        'income': income, 'age': age, 'sex': sex,
        'country': country, 'program': program, 'time': time,
        'work': work, 'churn': churn,
    })


df = data()

In [3]:
# EDA: descriptive "Table 1" of every variable, stratified by the churn label.
# classify_vars is a project helper; its result is unpacked into three
# collections of column names (categorical / non-normal / normal).
categorical, nonormal, normal = classify_vars(df)

# Take the target out of the categorical list: it is used as the grouping
# column below rather than summarised as a variable.
categorical.remove('churn')

mytable = TableOne(
    df,
    categorical=categorical,
    nonnormal=nonormal,
    groupby='churn',
    pval=True,
)
mytable

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by churn,Grouped by churn,Grouped by churn,Grouped by churn,Grouped by churn
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,Churn,stay,P-Value
n,,,25,5,20,
"income, mean (SD)",,0.0,140.5 (24.1),139.0 (19.9),140.8 (25.4),0.864
"age, mean (SD)",,4.0,34.7 (11.1),41.9 (6.1),33.1 (11.4),0.063
"sex, n (%)",F,0.0,15 (60.0),1 (20.0),14 (70.0),0.121
"sex, n (%)",M,,10 (40.0),4 (80.0),6 (30.0),
"country, n (%)",Europe,5.0,11 (55.0),4 (100.0),7 (43.8),0.129
"country, n (%)",Africa,,8 (40.0),,8 (50.0),
"country, n (%)",USA,,1 (5.0),,1 (6.2),
"program, n (%)",A,2.0,6 (26.1),2 (40.0),4 (22.2),0.166
"program, n (%)",B,,14 (60.9),2 (40.0),12 (66.7),


In [9]:
### Modelling: preprocessing pipelines + logistic regression
target = 'churn'
X = df.drop(columns=target)
# Encode the label as 1.0 for 'Churn', 0.0 for anything else, and NaN where the
# label is missing (the NaN branch is why the resulting array is float).
y = np.where(df[target].isnull(), np.nan, np.where(df[target] == 'Churn', 1, 0))


## Split training - test (stratified so both splits keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=666, stratify=y)

# Creating pipelines...
# NOTE(review): drop_ColumnsNan and reduceCategories are project helpers from
# `preprocess`; the meaning of `min_percent` is not visible here - confirm
# against that module.

# Numeric columns classified as normal: mean imputation, then standardisation.
# (Variable name spelling is kept as-is in case other cells reference it.)
Normal_trasnform = Pipeline([
    ('drop', drop_ColumnsNan(min_percent=0.8)),
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()), ])

# Numeric columns classified as non-normal: median imputation instead of mean.
NoNormal_transform = Pipeline([
    ('drop', drop_ColumnsNan(min_percent=0.8)),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), ])

# Categorical columns: project-specific category reduction, then one-hot
# encoding that ignores categories unseen during fit.
Categorical_transform = Pipeline([
    ('reduce', reduceCategories(min_percent=0.4)),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first')), ])

# Route each group of columns to its own preprocessing pipeline.
preprocessing = ColumnTransformer([
    ('Normal_pipe', Normal_trasnform, normal),
    ('NoNormal_pipe', NoNormal_transform, nonormal),
    ('Categorical_pipe', Categorical_transform, categorical)])

# Fit the preprocessing on the training split ONLY, then apply the fitted
# transformers to the test split. Calling fit_transform on X_test would
# re-estimate imputation values, scaling statistics and one-hot categories from
# test data (leakage) and could yield a column layout that does not match the
# training matrix.
X_train_transformed = preprocessing.fit_transform(X_train)
X_test_transformed = preprocessing.transform(X_test)

# Full pipeline: preprocessing followed by the project's grid-searched
# logistic regression. Pipeline.fit re-fits `preprocessing` on X_train.
pipeline = Pipeline([
    ('preprocessor', preprocessing),
    ('classifier', GridSearchLogisticRegression())
])

pipeline.fit(X_train, y_train)
# The fitted classifier step exposes its selected estimator as `.model`.
best_params = pipeline.named_steps['classifier'].model.get_params()
print("Best parameters:", best_params)
pipeline.predict(X_test)



Best parameters: {'C': 1.0, 'class_weight': {0: 0.05, 1: 0.95}, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 666, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}




array([0., 1., 0., 1., 1.])

Unnamed: 0,income,age,sex,country,program,time,work
16,162.86974,20.799821,F,,A,59.86966,yes
17,125.486518,17.937298,F,Europe,B,20.940254,yes
2,151.532236,33.128161,F,USA,B,16.194884,not
21,144.379609,,M,Europe,B,8.225503,not
7,126.670141,26.122143,F,Europe,B,5.577126,yes
