In [45]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, f1_score, classification_report, plot_confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC

from sklearn.compose import ColumnTransformer, make_column_transformer

# Pipeline

In [119]:
# Load the target and the feature table; both CSVs use their first column as the index.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0, so the
# single-column target frame is collapsed to a Series with `.squeeze("columns")` instead.
y = pd.read_csv('../../src/data/y_dataframe.csv', index_col=0).squeeze('columns')
X = pd.read_csv('../../src/data/X_dataframe.csv', index_col=0)

In [120]:
# Hold out a test set using train_test_split's default split size; the fixed seed keeps
# the partition reproducible across runs.
# NOTE(review): the classification reports below show an imbalanced target; consider
# `stratify=y` -- not applied here because it would change every recorded metric.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [121]:
# Partition the training columns by dtype: object-dtype columns are treated as
# categorical, every other column as numerical.
categorical_features = X_train.select_dtypes(include=['object']).columns
numerical_features = X_train.select_dtypes(exclude=['object']).columns

In [122]:
#One_hot = ColumnTransformer([('enc', OneHotEncoder(sparse = False, drop ='first'),
#                            X_train.select_dtypes(include='object'))], remainder ='passthrough') 

In [172]:
# Shared preprocessing step: one-hot encode the categorical columns (dense output,
# first level of each feature dropped) and pass all remaining columns through unchanged.
preprocess = make_column_transformer(
    (OneHotEncoder(drop='first', sparse=False), categorical_features),
    remainder='passthrough',
)

# Logistic regression model

In [173]:
# Logistic-regression pipeline: encode -> scale -> oversample with SMOTE -> classify.
# Steps are named with the lower-cased class names so the fitted pipeline's repr and any
# `step__param` keys match the auto-generated naming.
pipe_log = Pipeline(steps=[
    ('columntransformer', preprocess),
    ('standardscaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    # NOTE(review): C looks like the result of a hyperparameter search -- confirm its origin.
    ('logisticregression', LogisticRegression(C=233.57214690901213, random_state=42)),
])

In [176]:
# Fit every step of the logistic-regression pipeline on the training split; the fitted
# pipeline is the cell's displayed value.
pipe_log.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(drop='first',
                                                                sparse=False),
                                                  Index(['state', 'international plan', 'voice mail plan'], dtype='object'))])),
                ('standardscaler', StandardScaler()),
                ('smote', SMOTE(random_state=42)),
                ('logisticregression',
                 LogisticRegression(C=233.57214690901213, random_state=42))])

In [177]:
# Evaluate the logistic-regression pipeline on the held-out test split.
y_hat = pipe_log.predict(X_test)

# Per-fold scores from 3-fold cross-validation on the training split (default scorer).
print(cross_val_score(pipe_log, X_train, y_train, cv=3))
# Test-set score as reported by the pipeline's own `score` method.
print(pipe_log.score(X_test, y_test))
# Confusion matrix and per-class precision/recall/F1 for the test predictions.
print(confusion_matrix(y_test, y_hat))
print(classification_report(y_test, y_hat))

[0.76470588 0.76230492 0.78031212]
0.7829736211031175
[[556 153]
 [ 28  97]]
              precision    recall  f1-score   support

       False       0.95      0.78      0.86       709
        True       0.39      0.78      0.52       125

    accuracy                           0.78       834
   macro avg       0.67      0.78      0.69       834
weighted avg       0.87      0.78      0.81       834



# Random forest model

In [160]:
# Random-forest pipeline: same encode -> scale -> SMOTE front end as the logistic model,
# ending in a 500-tree forest that considers half of the features at each split.
# Steps are named with the lower-cased class names, matching the auto-generated naming.
pipe_forest = Pipeline(steps=[
    ('columntransformer', preprocess),
    ('standardscaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    (
        'randomforestclassifier',
        RandomForestClassifier(n_estimators=500, max_features=.5, random_state=42),
    ),
])

In [166]:
# Fit the random-forest pipeline on the training split; the trailing semicolon
# suppresses the pipeline repr in the cell output.
pipe_forest.fit(X_train, y_train);

In [165]:
# Evaluate the random-forest pipeline on the held-out test split.
y_hat_forest = pipe_forest.predict(X_test)

# Mean score over 3-fold cross-validation on the training split (default scorer).
print(cross_val_score(pipe_forest, X_train, y_train, cv=3).mean())
# Test-set score as reported by the pipeline's own `score` method.
print(pipe_forest.score(X_test, y_test))
# Confusion matrix and per-class precision/recall/F1 for the test predictions.
print(confusion_matrix(y_test, y_hat_forest))
print(classification_report(y_test, y_hat_forest))

0.9263705482192877
0.9448441247002398
[[687  22]
 [ 24 101]]
              precision    recall  f1-score   support

       False       0.97      0.97      0.97       709
        True       0.82      0.81      0.81       125

    accuracy                           0.94       834
   macro avg       0.89      0.89      0.89       834
weighted avg       0.94      0.94      0.94       834



In [None]:
def model_output(model):
    
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    