# Support Vector Machine SVM

Son modelos de aprendizaje supervisado con algoritmos de aprendizaje asociados que analizan los datos historicos para el análisis de clasificación y regresión. Sin embargo, se utilizan principalmente en problemas de clasificación.

# Librerias

In [32]:
import pandas as pd 
import numpy as np
import pandas_profiling
import seaborn as sns
import sklearn as sk
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import pickle

# Carga de dataset

Utilizamos el dataset que hemos dejado al final de notebook anterior, despues de hacer el EDA y feature engineering

In [33]:
svm = pd.read_csv('data_modelos.csv',  usecols=lambda x: 'Unnamed' not in x,)

In [34]:
svm.head()

Unnamed: 0,loan_amnt,installment,grade,emp_length,home_ownership,annual_inc,verification_status,dti,delinq_2yrs,fico_range_high,...,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,pub_rec_bankruptcies,total_bc_limit,total_il_high_credit_limit,debt_settlement_flag,Target
0,11575,359.26,1,6,OWN,153000.0,Not Verified,16.99,0,724,...,0.0,0,0,2,95.7,1,28100,120572,N,1
1,7200,285.7,5,2,RENT,50000.0,Source Verified,6.07,0,689,...,0.0,0,0,0,80.0,0,3600,4000,N,1
2,7500,232.79,1,7,MORTGAGE,110000.0,Not Verified,13.12,0,714,...,0.0,0,0,3,100.0,0,83700,32239,N,1
3,10000,243.29,3,7,RENT,51979.0,Source Verified,10.11,0,694,...,0.0,0,0,3,100.0,2,18800,5500,N,1
4,14000,492.34,3,7,MORTGAGE,75000.0,Verified,10.86,1,689,...,0.0,0,0,3,90.0,0,3000,30321,N,1


# Downsampling

El algoritmo Support Vector Machine funciona muy bien con dataset peaqueños, pero no funciona tan bien cuando tenemos un gran conjunto de datos (como en nuestro caso) porque el tiempo de entrenamiento requerido es mayor. Por eso, vamos a coger una submuestra para agilizar los calculos

In [35]:
svm_no_default = svm[svm['Target'] == 1]
svm_default = svm[svm['Target'] == 0]

In [36]:
svm_no_default_downsampled = sk.utils.resample(svm_no_default,
                                               replace = False,
                                               n_samples=3000,
                                               random_state=123
)
len(svm_no_default_downsampled)

3000

In [37]:
svm_default_downsampled = sk.utils.resample(svm_default,
                                      replace = False,
                                      n_samples=1000,
                                      random_state=123
)
len(svm_default_downsampled)

1000

In [38]:
svm_downsampled = pd.concat([svm_default_downsampled, svm_no_default_downsampled])
len(svm_downsampled)

4000

# Clasificacion vs prediccion

Dividimos el dataset en 2 partes, lo que queremos predecir y lo que usamos para clasificar

In [39]:
X = svm_downsampled.drop('Target', axis = 1).copy()
X.head()

Unnamed: 0,loan_amnt,installment,grade,emp_length,home_ownership,annual_inc,verification_status,dti,delinq_2yrs,fico_range_high,...,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,pub_rec_bankruptcies,total_bc_limit,total_il_high_credit_limit,debt_settlement_flag
314479,5000,170.87,3,10,MORTGAGE,37000.0,Verified,26.69,0,669,...,11,0.0,0,0,6,100.0,1,4200,34234,N
271683,24000,774.3,2,0,OWN,95000.0,Not Verified,15.12,2,679,...,23,0.0,0,1,2,92.2,0,20900,264647,N
230009,35000,886.11,4,10,RENT,85000.0,Verified,17.39,0,699,...,14,0.0,0,0,5,100.0,0,42700,42293,N
317551,12000,283.28,3,8,MORTGAGE,65000.0,Source Verified,6.74,0,689,...,6,0.0,0,0,3,100.0,0,13200,15850,N
25074,22000,748.29,3,10,MORTGAGE,50000.0,Source Verified,35.83,1,704,...,17,0.0,0,0,0,97.3,0,71100,0,N


In [40]:
y = svm_downsampled['Target'].copy()
y.head()

314479    0
271683    0
230009    0
317551    0
25074     0
Name: Target, dtype: int64

# Encoding

one - hot - encoding

In [41]:
X.select_dtypes('object').apply(pd.Series.nunique, axis = 0).sort_values(ascending = False)

verification_status     3
home_ownership          3
debt_settlement_flag    2
initial_list_status     2
dtype: int64

In [42]:
X_encoded = pd.get_dummies(X, columns=['emp_length',
                            'grade',
                             'verification_status',
                             'home_ownership',
                             'debt_settlement_flag',
                             'initial_list_status'
                            ])
X_encoded.head()

Unnamed: 0,loan_amnt,installment,annual_inc,dti,delinq_2yrs,fico_range_high,pub_rec,revol_bal,total_acc,last_pymnt_amnt,...,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,debt_settlement_flag_N,debt_settlement_flag_Y,initial_list_status_f,initial_list_status_w
314479,5000,170.87,37000.0,26.69,0,669,1,5575,51,170.87,...,0,0,1,1,0,0,1,0,0,1
271683,24000,774.3,95000.0,15.12,2,679,0,10916,53,774.3,...,1,0,0,0,1,0,1,0,0,1
230009,35000,886.11,85000.0,17.39,0,699,0,25721,48,886.11,...,0,0,1,0,0,1,1,0,1,0
317551,12000,283.28,65000.0,6.74,0,689,0,6099,10,283.28,...,0,1,0,1,0,0,1,0,0,1
25074,22000,748.29,50000.0,35.83,1,704,0,46269,38,748.29,...,0,1,0,1,0,0,1,0,0,1


In [43]:
X_encoded.select_dtypes('object').apply(pd.Series.nunique, axis = 0).sort_values(ascending = False)

Series([], dtype: float64)

# Estandarizacion de la informacion

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state = 123)
X_train_scalated = preprocessing.scale(X_train)
X_test_scalated = preprocessing.scale(X_test)

In [45]:
X_train_scalated.shape, X_test_scalated.shape

((3000, 58), (1000, 58))

In [46]:
y_train.shape, y_test.shape

((3000,), (1000,))

# Modelo

In [47]:
clf_svm = SVC(random_state = 123)
clf_svm.fit(X_train_scalated, y_train)

SVC(random_state=123)

In [48]:
prediction_test_svm = clf_svm.predict(X_test_scalated)
print(prediction_test_svm)

[1 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 0 1
 1 1 0 1 1 1 1 0 1 0 1 0 1 1 1 1 0 0 1 1 1 1 0 0 0 1 1 1 1 1 1 0 1 1 1 0 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 0 0 0 1
 0 1 0 1 1 1 0 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 1 1 1 0 1 1 1
 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1
 0 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 1 1 0 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 0 1 1 1 0 1 1 1 0 1 0 1 1 1 1 1
 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 0 1 1 1 1 0 1 1
 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 0 1 0 1 0 1 1 1 0 1 0 0 0
 1 1 1 1 1 0 0 1 1 1 0 1 1 0 1 0 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 0 1 1 1 1
 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 0 0
 1 0 1 0 1 1 1 1 1 1 1 1 

In [49]:
cm_svm = confusion_matrix(y_test, prediction_test_svm)
print ("Accuracy : ", accuracy_score(y_test, prediction_test_svm))
cm_svm

Accuracy :  0.932


array([[206,  37],
       [ 31, 726]], dtype=int64)

Usamos cross validation para optimizar los parametros

In [50]:
param_grid = [
    {'C': [0.5,1,10,100],
     'gamma': ['scale',1,0.1,0.01,0.001,0.001],
     'kernel': ['rbf', 'linear']},
]

optimal_parametros = GridSearchCV(clf_svm,
                                      param_grid,
                                      cv=3,
                                      scoring='accuracy',
                                      verbose=0)
optimal_parametros.fit(X_train_scalated, y_train)
print(optimal_parametros.best_params_)

{'C': 100, 'gamma': 'scale', 'kernel': 'linear'}


In [51]:
clf_svm = SVC(random_state = 123, C=100, gamma= 'scale', kernel='linear')
clf_svm.fit(X_train_scalated, y_train)

SVC(C=100, kernel='linear', random_state=123)

In [52]:
prediction_test_svm = clf_svm.predict(X_test_scalated)
print(prediction_test_svm)

[1 1 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 0 1
 0 1 0 1 1 1 1 0 1 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 0 1 0 1 1 1 1 0 0 1 1 0 1
 1 1 1 0 1 1 1 1 1 1 0 1 1 0 1 1 1 0 1 0 1 1 1 1 1 0 1 1 0 1 0 1 1 0 1 1 0
 1 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 0 0 1
 0 1 0 0 1 1 0 1 0 1 0 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 1 1 1 0 1 1 1
 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 0 0 0 0 1 1 1 1 0 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1
 0 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 0 0 1 0 0 1 0 0 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1
 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1 0 1 1
 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 0 1 0 0 0 1 1 1 0 1 0 0 0
 1 1 1 1 1 0 0 1 1 1 0 1 0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 0 1 1 1 1
 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 0 0
 1 1 1 0 1 1 1 1 1 1 1 1 

In [53]:
cm_svm = confusion_matrix(y_test, prediction_test_svm)
print ("Accuracy : ", accuracy_score(y_test, prediction_test_svm))
cm_svm

Accuracy :  0.951


array([[229,  14],
       [ 35, 722]], dtype=int64)

In [54]:
pickle.dump(clf_svm, open("clf_svm", "wb"))
