# SelectFromModel

Using our scaler data to quickly check the model that fits the best 

## Imports

#### Import libraries

In [62]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import seaborn as sns

#### Import data

In [7]:
churn_norm = pd.read_csv("Churn_Norm.csv")

In [8]:
churn_norm.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Gender_Int,Geography_Germany,Geography_Spain,Balance_Int
0,0.538,0.324324,0.2,0.0,0.0,1.0,1.0,0.506735,1.0,1.0,0.0,0.0,0.0
1,0.516,0.310811,0.1,0.334031,0.0,0.0,1.0,0.562709,0.0,1.0,0.0,1.0,1.0
2,0.304,0.324324,0.8,0.636357,0.666667,1.0,0.0,0.569654,1.0,1.0,0.0,0.0,1.0
3,0.698,0.283784,0.1,0.0,0.333333,0.0,0.0,0.46912,0.0,1.0,0.0,0.0,0.0
4,1.0,0.337838,0.2,0.500246,0.0,1.0,1.0,0.3954,0.0,1.0,0.0,1.0,1.0


In [10]:
# place target column a the end of the dataset
churn_norm= churn_norm[["CreditScore","Age","Tenure","Balance","NumOfProducts","HasCrCard","IsActiveMember","EstimatedSalary","Gender_Int","Geography_Germany","Geography_Spain","Balance_Int","Exited"]]


In [11]:
# check dataset
churn_norm.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Gender_Int,Geography_Germany,Geography_Spain,Balance_Int,Exited
0,0.538,0.324324,0.2,0.0,0.0,1.0,1.0,0.506735,1.0,0.0,0.0,0.0,1.0
1,0.516,0.310811,0.1,0.334031,0.0,0.0,1.0,0.562709,1.0,0.0,1.0,1.0,0.0
2,0.304,0.324324,0.8,0.636357,0.666667,1.0,0.0,0.569654,1.0,0.0,0.0,1.0,1.0
3,0.698,0.283784,0.1,0.0,0.333333,0.0,0.0,0.46912,1.0,0.0,0.0,0.0,0.0
4,1.0,0.337838,0.2,0.500246,0.0,1.0,1.0,0.3954,1.0,0.0,1.0,1.0,0.0


## Models

Function of all main Models to have a quick overlook

In [58]:
def run_multiple_models():
    
    X = churn_norm.iloc[:,:-1].values
    y = churn_norm['Exited']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)
    
    dfs = []
    models = [
        ('LR_model', LogisticRegression()),
        ('SVC_model', svm.SVC()),
        ('KNN_model', KNeighborsClassifier(n_neighbors=5)),
        ('DTC_model', DecisionTreeClassifier()),
        ('LDA_model', LinearDiscriminantAnalysis()),
        ('GNB_model', GaussianNB()),
        ('RFC_model', RandomForestClassifier()),
        ('MLPC_model', MLPClassifier(max_iter=1000))
        ]
    
    results = []
    
    names = []
    scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']
    target_names = ['Not Churn', 'Churn']

    for name, model in models:
        cv = cross_val_score(model, X_train, y_train, cv=StratifiedKFold(), scoring = "accuracy")
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        
        print(name)
        print(classification_report(y_test, y_pred, target_names=target_names))
        
        results.append(cv)
        names.append(name)
        
        final_df = pd.DataFrame(cv)
        final_df['model'] = name
        dfs.append(final_df)

        final = pd.concat(dfs, ignore_index=True)
    return final

In [59]:
run_multiple_models()

LR_model
              precision    recall  f1-score   support

   Not Churn       0.81      0.97      0.88       777
       Churn       0.65      0.19      0.30       223

    accuracy                           0.80      1000
   macro avg       0.73      0.58      0.59      1000
weighted avg       0.77      0.80      0.75      1000

SVC_model
              precision    recall  f1-score   support

   Not Churn       0.83      0.99      0.90       777
       Churn       0.89      0.28      0.42       223

    accuracy                           0.83      1000
   macro avg       0.86      0.63      0.66      1000
weighted avg       0.84      0.83      0.79      1000

KNN_model
              precision    recall  f1-score   support

   Not Churn       0.83      0.94      0.88       777
       Churn       0.62      0.32      0.42       223

    accuracy                           0.80      1000
   macro avg       0.72      0.63      0.65      1000
weighted avg       0.78      0.80      0.78  

Unnamed: 0,0,model
0,0.818889,LR_model
1,0.812778,LR_model
2,0.805,LR_model
3,0.821111,LR_model
4,0.810556,LR_model
5,0.843889,SVC_model
6,0.841111,SVC_model
7,0.842222,SVC_model
8,0.842778,SVC_model
9,0.840556,SVC_model


## Conclusions

We can see that RFC_model, SVC_model and MLPC_model gives us the best results with a higher recall and f1-score.
And, we have a score from cross validation around 0.82