# SVM

In [490]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn import metrics, svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from collections import Counter


### Prepering data

In [491]:
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [492]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"],errors="coerce")

In [493]:
df.dropna(inplace=True)

In [494]:
df_d = df.drop("customerID", axis = 1)

In [495]:
df_d['Churn'].replace(to_replace='Yes', value=1, inplace=True)
df_d['Churn'].replace(to_replace='No',  value=0, inplace=True)

#### Skalowanie

In [496]:
df_d.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [497]:
def apply_scalers(df, columns_to_exclude=None):
    if columns_to_exclude:
        exclude_filter = ~df.columns.isin(columns_to_exclude) 
    else:
        exclude_filter = ~df.columns.isin([]) 
    for column in df.iloc[:, exclude_filter].columns:
        df[column] = df[column].astype(float)
 
    df.loc[:, exclude_filter] = StandardScaler().fit_transform(df.loc[:, exclude_filter])
    return df
 
df_d = apply_scalers(df_d, columns_to_exclude=['gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn'])

#### Dumifikacja

In [498]:
dummies_df = pd.get_dummies(df_d)

In [499]:
dummies_df.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,-1.280248,-1.161694,-0.994194,0,1,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0
1,0,0.064303,-0.260878,-0.17374,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,1
2,0,-1.239504,-0.363923,-0.959649,1,0,1,1,0,1,...,0,1,0,0,0,1,0,0,0,1
3,0,0.512486,-0.74785,-0.195248,0,0,1,1,0,1,...,0,0,1,0,1,0,1,0,0,0
4,0,-1.239504,0.196178,-0.940457,1,1,0,1,0,1,...,0,1,0,0,0,1,0,0,1,0


### Train test split

In [500]:
y = dummies_df['Churn'].values
X = dummies_df.drop(columns = ['Churn'])

In [501]:
#Splity X,y into train, val, test
X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=44,train_size=0.6,stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, random_state=44,train_size=0.5,stratify=y_val)

### Make scorer "slytherin score"

In [502]:
basic_medians = [64.45, 79.65]
basic_discount = 1 - basic_medians[0] / basic_medians[1]

In [503]:
def function(y_true, y_pred, medians = basic_medians, discount = basic_discount, basic = True, scoring_only = True):
    
    """
    Jeśli interesuje Cie wartość dodana przyjmij scoring_only = False. Wówczas poza scorem otrzymasz także:
        base_case - najlepszy możliwy scenariusz dla firmy
        standard_case - aktualny scenariusz firmy (bez naszej ingerencji)
        model_vase - scenariusz oparty na naszym modelu

    Jeśli chcesz manipulować zmiennymi medians i discount, pamiętaj aby przyjąć basic = False
    """
    
    
    matrix = confusion_matrix(y_true, y_pred)
    TN = matrix[0][0]
    FP = matrix[0][1]
    FN = matrix[1][0]
    TP = matrix[1][1]
    
    if basic:
        basic_score = 1 - (medians[1]/medians[0]*FN + medians[0]/medians[1]*FP)/(TN + TP)
        
        if scoring_only:
            return basic_score
        else:
            best_case = round(medians[0] * Counter(y_true)[0] + medians[1] * Counter(y_true)[1], 2)
            standard_case = round(medians[0] * Counter(y_true)[0] - medians[1] * Counter(y_true)[1], 2)
            model_case = round(basic_score * medians[0]*(TN+TP), 2)
            return basic_score, best_case, standard_case, model_case
    
    else:
        score = 1 - (medians[1]*FN + (1 - discount)*medians[0]*FP)/(medians[0]*TN + (1- discount)*medians[1]*TP)
        
        if scoring_only:
            return score
        else:
            best_case = round(medians[0] * Counter(y_true)[0] + medians[1] * Counter(y_true)[1], 2)
            standard_case = round(medians[0] * Counter(y_true)[0] - medians[1] * Counter(y_true)[1], 2)
            model_case = round(score * (medians[0]*TN + (1- discount)*medians[1]*TP), 2)
            return score, best_case, standard_case, model_case

In [504]:
slytherin_score = make_scorer(function, greater_is_better=True)

### Clasification witg Grid Search CV for SVM model

In [505]:
parameters = {"kernel":("linear", "poly", "rbf", "sigmoid"),
                "C":(0.1,0.2,0.3,0.4,0.5,1,2,3,4,5),
                "degree":(1,2,3),
                "gamma": ["scale", "auto"],
                "class_weight": [None, "balanced"]
                }

In [506]:
svm_clsf = svm.SVC(random_state=44)
grid_clsf = GridSearchCV(estimator = svm_clsf, 
                            param_grid=parameters ,
                            cv=3,
                            n_jobs=-1, 
                            scoring=slytherin_score
                        )

In [507]:
grid_clsf.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=SVC(random_state=44), n_jobs=-1,
             param_grid={'C': (0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 3, 4, 5),
                         'class_weight': [None, 'balanced'],
                         'degree': (1, 2, 3), 'gamma': ['scale', 'auto'],
                         'kernel': ('linear', 'poly', 'rbf', 'sigmoid')},
             scoring=make_scorer(function))

In [508]:
grid_clsf.cv_results_

{'mean_fit_time': array([0.64909387, 0.57838964, 0.66646051, 0.72256851, 0.53104893,
        0.50770434, 0.5930415 , 0.68924435, 0.57638653, 0.51004362,
        0.6090281 , 0.65871414, 0.51242471, 0.53909222, 0.59688544,
        0.59802206, 0.52704787, 0.58287589, 0.70075846, 0.67506067,
        0.51614952, 0.52370421, 0.65508278, 0.63951985, 0.70494723,
        0.70187759, 0.86890078, 0.82439232, 0.69638475, 0.75939099,
        0.86022449, 0.87761585, 0.69060079, 0.72338637, 0.83048145,
        0.82484976, 0.69815564, 1.01157864, 1.10815652, 1.05518317,
        0.88352704, 0.897808  , 0.89445082, 0.8950847 , 0.75405629,
        1.24086674, 0.96936099, 1.00042502, 0.60773913, 0.52506018,
        0.6857272 , 0.67413529, 0.59397356, 0.55910444, 0.72648819,
        0.67296306, 0.58034356, 0.56370743, 0.7764318 , 0.68461728,
        0.58680622, 0.588919  , 0.63783844, 0.65776992, 0.61044669,
        0.5818065 , 0.66273046, 0.6268688 , 0.58973591, 0.56654716,
        0.67007558, 0.64635921,

### Clasification with random search for SVM model

In [509]:
svm_clsf = svm.SVC(random_state=44)
random_clsf = RandomizedSearchCV(estimator = svm_clsf, 
                                    param_distributions=parameters, 
                                    n_iter=20, 
                                    cv=3, 
                                    n_jobs=-1, 
                                    scoring=slytherin_score
                                )

In [510]:
random_clsf.fit(X_train, y_train)

RandomizedSearchCV(cv=3, estimator=SVC(random_state=44), n_iter=20, n_jobs=-1,
                   param_distributions={'C': (0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 3,
                                              4, 5),
                                        'class_weight': [None, 'balanced'],
                                        'degree': (1, 2, 3),
                                        'gamma': ['scale', 'auto'],
                                        'kernel': ('linear', 'poly', 'rbf',
                                                   'sigmoid')},
                   scoring=make_scorer(function))

In [511]:
random_clsf.cv_results_

{'mean_fit_time': array([0.73579113, 0.67727439, 0.8735652 , 0.58567293, 0.9206539 ,
        0.85910861, 0.95425105, 0.91749128, 1.08232435, 0.67173982,
        0.95149922, 0.84648371, 1.03722405, 1.13277976, 1.22709942,
        1.61779277, 0.8270599 , 0.8768394 , 0.8122797 , 0.84372958]),
 'std_fit_time': array([0.01713297, 0.05869923, 0.0545071 , 0.03449299, 0.01174389,
        0.09108635, 0.12782891, 0.01457881, 0.02933352, 0.06010243,
        0.00942707, 0.12935087, 0.03088599, 0.02398054, 0.07742195,
        0.18319001, 0.02160204, 0.02357907, 0.03319432, 0.02538369]),
 'mean_score_time': array([0.18868057, 0.25952109, 0.2353522 , 0.2069025 , 0.35235699,
        0.34070086, 0.29292289, 0.26591214, 0.79637384, 0.26554807,
        0.28435405, 0.29542685, 0.35708046, 0.85306207, 0.27435017,
        0.26935116, 0.2403508 , 0.29268678, 0.2723515 , 0.22901551]),
 'std_score_time': array([0.00543705, 0.01331798, 0.00555791, 0.0027514 , 0.03653849,
        0.08192196, 0.05235734, 0.036627

#### Wyniki Grid i Random Search

In [512]:
print(f" Best params for SVM model by GridSearchCV :        {grid_clsf.best_params_}")
print(f" Best score with those params :                     {grid_clsf.best_score_}")
print(f" Best params for SVM model by RandomizedSearchCV :  {random_clsf.best_params_}")
print(f" Best score with those params :                     {random_clsf.best_score_}")

 Best params for SVM model by GridSearchCV :        {'C': 1, 'class_weight': None, 'degree': 1, 'gamma': 'auto', 'kernel': 'sigmoid'}
 Best score with those params :                     0.7293805234654546
 Best params for SVM model by RandomizedSearchCV :  {'kernel': 'sigmoid', 'gamma': 'auto', 'degree': 3, 'class_weight': None, 'C': 2}
 Best score with those params :                     0.7280930952776851


### SVM model


In [513]:
print(f" Best params for SVM model by GridSearchCV :        {grid_clsf.best_params_}")
print(f" Best params for SVM model by RandomizedSearchCV :  {random_clsf.best_params_}")

 Best params for SVM model by GridSearchCV :        {'C': 1, 'class_weight': None, 'degree': 1, 'gamma': 'auto', 'kernel': 'sigmoid'}
 Best params for SVM model by RandomizedSearchCV :  {'kernel': 'sigmoid', 'gamma': 'auto', 'degree': 3, 'class_weight': None, 'C': 2}


In [517]:
scores = []

# basic model
svm_model = svm.SVC(random_state=44)
svm_model.fit(X_train, y_train)
# params grid
svm_model_grid = svm.SVC(C=1, degree=1,  gamma='auto', kernel='sigmoid', random_state=44 )
svm_model_grid.fit(X_train, y_train)
# params random
svm_model_random = svm.SVC(kernel= 'sigmoid', gamma= 'auto', degree= 3, C= 2, random_state=44)
svm_model_random.fit(X_train, y_train)

#prediction from models
pred_basic = svm_model.predict(X_val)
pred_grid = svm_model_grid.predict(X_val)
pred_random = svm_model_random.predict(X_val)


models = (svm_model, svm_model_grid, svm_model_random)
pred = (pred_basic,pred_grid,pred_random)


precision = metrics.precision_score(y_val, pred_basic)
recall = metrics.recall_score(y_val, pred_basic)
roc_auc = metrics.roc_auc_score(y_val, pred_basic)
f1 = metrics.f1_score(y_val, pred_basic)
SS = slytherin_score(svm_model, X_val, y_val)

scores.append({"model": "basic model",
                "precision": precision,
                "recall": recall,
                "ROC AUC": roc_auc,
                "F1": f1,
                "Slytherin score": SS
                    })

precision = metrics.precision_score(y_val, pred_grid)
recall = metrics.recall_score(y_val, pred_grid)
roc_auc = metrics.roc_auc_score(y_val, pred_grid)
f1 = metrics.f1_score(y_val, pred_grid)
SS = slytherin_score(svm_model_grid, X_val, y_val)

scores.append({"model": "grid search params",
                "precision": precision,
                "recall": recall,
                "ROC AUC": roc_auc,
                "F1": f1,
                "Slytherin score": SS
                    })                    


precision = metrics.precision_score(y_val, pred_random)
recall = metrics.recall_score(y_val, pred_random)
roc_auc = metrics.roc_auc_score(y_val, pred_random)
f1 = metrics.f1_score(y_val, pred_random)
SS = slytherin_score(svm_model_random, X_val, y_val)

scores.append({"model": "random search params",
                "precision": precision,
                "recall": recall,
                "ROC AUC": roc_auc,
                "F1": f1,
                "Slytherin score": SS
                    })       


scores_df = pd.DataFrame(scores,columns=["model","precision","recall","ROC AUC","F1","Slytherin score"])

scores_df

Unnamed: 0,model,precision,recall,ROC AUC,F1,Slytherin score
0,basic model,0.695341,0.518717,0.718176,0.594181,0.744759
1,grid search params,0.65528,0.564171,0.728306,0.606322,0.742704
2,random search params,0.632047,0.569519,0.724682,0.599156,0.733


In [515]:
pd.DataFrame(random_clsf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_gamma,param_degree,param_class_weight,param_C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.735791,0.017133,0.188681,0.005437,linear,auto,2,,0.3,"{'kernel': 'linear', 'gamma': 'auto', 'degree'...",0.725511,0.711957,0.722861,0.72011,0.005866,4
1,0.677274,0.058699,0.259521,0.013318,sigmoid,auto,3,,2.0,"{'kernel': 'sigmoid', 'gamma': 'auto', 'degree...",0.734534,0.715969,0.733777,0.728093,0.008579,1
2,0.873565,0.054507,0.235352,0.005558,sigmoid,scale,2,balanced,0.2,"{'kernel': 'sigmoid', 'gamma': 'scale', 'degre...",0.627706,0.563702,0.620948,0.604118,0.028712,20
3,0.585673,0.034493,0.206903,0.002751,poly,auto,1,,5.0,"{'kernel': 'poly', 'gamma': 'auto', 'degree': ...",0.722411,0.712944,0.725182,0.720179,0.005239,3
4,0.920654,0.011744,0.352357,0.036538,sigmoid,auto,1,balanced,0.1,"{'kernel': 'sigmoid', 'gamma': 'auto', 'degree...",0.651705,0.650095,0.690819,0.664206,0.018829,15
5,0.859109,0.091086,0.340701,0.081922,poly,auto,3,balanced,3.0,"{'kernel': 'poly', 'gamma': 'auto', 'degree': ...",0.68231,0.656807,0.691423,0.676847,0.01465,10
6,0.954251,0.127829,0.292923,0.052357,sigmoid,auto,2,,5.0,"{'kernel': 'sigmoid', 'gamma': 'auto', 'degree...",0.708555,0.705794,0.688495,0.700948,0.008878,7
7,0.917491,0.014579,0.265912,0.036627,poly,auto,2,,2.0,"{'kernel': 'poly', 'gamma': 'auto', 'degree': ...",0.710133,0.704111,0.723379,0.712541,0.008048,5
8,1.082324,0.029334,0.796374,0.012472,rbf,auto,2,balanced,0.4,"{'kernel': 'rbf', 'gamma': 'auto', 'degree': 2...",0.662613,0.62364,0.68641,0.657554,0.025874,17
9,0.67174,0.060102,0.265548,0.015925,poly,auto,3,,3.0,"{'kernel': 'poly', 'gamma': 'auto', 'degree': ...",0.705303,0.713424,0.71364,0.710789,0.00388,6
