# Model 7: Support Vector Machine (SVM)

## Import the libraries and dataset

In [12]:
from sklearn.svm import LinearSVC

from sklearn.svm import SVC

### Importing the libraries
import numpy as np
import pandas as pd
#from sklearnex import patch_sklearn

#patch_sklearn()

#plotting lib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


#Sklearn Lib metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, KFold

# Pipelines : 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config

#GradientBoostingClassifier and AdaboostClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

#Missing values : 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer, SimpleImputer
import missingno as msno

#Dummy
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn. preprocessing import StandardScaler


pd.set_option("display.max_columns",None)

In [13]:
train_df = pd.read_csv("Data/train.csv")
test_df = pd.read_csv("Data/test.csv")
#removing id for train_df
train_df.drop("id", axis = 1, inplace = True)

In [14]:
# Transforming object into categories 
# for train
for i in train_df.columns:
    if train_df[i].dtypes == "object":
        train_df[i] = train_df[i].astype("category")
# For test
for i in test_df.columns:
    if test_df[i].dtypes == "object":
        test_df[i] = test_df[i].astype("category")

In [15]:
X = train_df.drop("high_income", axis = 1)
y = train_df["high_income"]

In [16]:
# Grouping our features that are categories in one vector
# Same for numeric

categorical_features = [i for i in X.columns if X[i].dtype.name == "category"]
numerical_features = [i for i in X.columns if X[i].dtype.name != "category"]

## First GridSearch

In [None]:
# I have tried without polynomial Feature transformation and we get results similar to the logistic regression since it makes a linear seperation of our classes
# Since trees were performing A little bit better It looks like our decision boundary is not completely linear. We could model it by introducing a polynomial feature on enlarge the predictors space. 

In [7]:
from sklearn.preprocessing import PolynomialFeatures
numeric_transformer = Pipeline(
    steps=[("imputer_num", SimpleImputer(strategy="mean")),
           ("poly_feat", PolynomialFeatures(degree = 3)),
           ("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[("imputer_cat", SimpleImputer(strategy="constant", fill_value="Missing")),
           
           ("encoder" , OneHotEncoder(handle_unknown="ignore", sparse=False))]
)



preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LinearSVC(random_state=1, max_iter=9999))]
)


set_config(display="diagram")
clf

In [31]:
grid = {"classifier__penalty": ["l2"],"classifier__loss": ["hinge"], # --> only available for l2 penalization
                    "classifier__C": [0.7,0.8,0.9,1]}
linear_svm_cv = GridSearchCV(estimator = clf,
                           param_grid = grid,
                           scoring = "accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

linear_svm_cv.fit(X, y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




In [32]:
pd.DataFrame(linear_svm_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__C,param_classifier__loss,param_classifier__penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,327.795463,13.416187,0.565568,0.012274,0.7,hinge,l2,"{'classifier__C': 0.7, 'classifier__loss': 'hi...",0.858176,0.86019,0.859445,0.862375,0.860361,0.86011,0.00137,1
1,317.056281,6.916972,0.567361,0.023183,0.8,hinge,l2,"{'classifier__C': 0.8, 'classifier__loss': 'hi...",0.857535,0.859824,0.859537,0.862742,0.860269,0.859981,0.001668,2
2,334.071983,7.912637,0.551317,0.00905,0.9,hinge,l2,"{'classifier__C': 0.9, 'classifier__loss': 'hi...",0.857901,0.86019,0.85972,0.862558,0.859537,0.859981,0.001502,3
3,354.445299,8.209439,0.431711,0.134867,1.0,hinge,l2,"{'classifier__C': 1, 'classifier__loss': 'hing...",0.857077,0.859824,0.859354,0.862467,0.859537,0.859652,0.001714,4


## Second GridSearch

In [8]:
grid2 = {"classifier__penalty": ["l2"],"classifier__loss": ["hinge"], # --> only available for l2 penalization
                    "classifier__C": [0.4,0.5,0.6,0.7]}
linear_svm_cv = GridSearchCV(estimator = clf,
                           param_grid = grid2,
                           scoring = "accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

linear_svm_cv.fit(X, y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




In [9]:
pd.DataFrame(linear_svm_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__C,param_classifier__loss,param_classifier__penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,149.635861,3.656914,0.2132,0.014978,0.4,hinge,l2,"{'classifier__C': 0.4, 'classifier__loss': 'hi...",0.859,0.860007,0.859811,0.862284,0.859903,0.860201,0.001101,2
1,168.660602,3.562031,0.269053,0.087927,0.5,hinge,l2,"{'classifier__C': 0.5, 'classifier__loss': 'hi...",0.858359,0.860099,0.859628,0.86265,0.860269,0.860201,0.001395,1
2,185.641886,3.087711,0.209983,0.020431,0.6,hinge,l2,"{'classifier__C': 0.6, 'classifier__loss': 'hi...",0.857993,0.859733,0.859537,0.862284,0.860269,0.859963,0.001385,4
3,166.243947,19.108409,0.123256,0.031843,0.7,hinge,l2,"{'classifier__C': 0.7, 'classifier__loss': 'hi...",0.858176,0.86019,0.859445,0.862375,0.860361,0.86011,0.00137,3


## Third GridSearch

In [7]:
# It looks like we should decrease the C to increase the score perfromance 
from sklearn.preprocessing import PolynomialFeatures
numeric_transformer = Pipeline(
    steps=[("imputer_num", SimpleImputer(strategy="mean")),
           ("poly_feat", PolynomialFeatures(degree = 5)),
           ("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[("imputer_cat", SimpleImputer(strategy="constant", fill_value="Missing")),
           
           ("encoder" , OneHotEncoder(handle_unknown="ignore", sparse=False))]
)



preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LinearSVC(random_state=1))]
)


set_config(display="diagram")
clf
grid = {"classifier__penalty": ["l2"],"classifier__loss": ["hinge"], # --> only available for l2 penalization
                    "classifier__C": [0.01]}
linear_svm_cv = GridSearchCV(estimator = clf,
                           param_grid = grid,
                           scoring = "accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

linear_svm_cv.fit(X, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits




In [8]:
pd.DataFrame(linear_svm_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__C,param_classifier__loss,param_classifier__penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,3797.971171,9.557787,0.801513,0.150847,0.01,hinge,l2,"{'classifier__C': 0.01, 'classifier__loss': 'h...",0.858451,0.861014,0.855691,0.859445,0.858255,0.858571,0.00174,1


## Export the predictions to .csv

In [None]:
# Exporting the second GridSearch (best #1)

In [25]:
clf.set_params(classifier__C = 0.5, classifier__loss = 'hinge', classifier__penalty = 'l2')

In [26]:
clf.fit(X,y)



In [39]:
test_df = pd.read_csv("Data/test.csv")
test_id = test_df[["id"]]
test_df.drop("id",axis = 1, inplace = True)

y_test_pred = clf.predict(test_df)

In [40]:
y_test_pred[0:5]

array([0, 0, 0, 1, 0])

In [41]:
svm_l2_C05_hinge = test_id
svm_l2_C05_hinge

Unnamed: 0,id
0,1
1,2
2,3
3,4
4,5
...,...
6063,6064
6064,6065
6065,6066
6066,6067


In [42]:
svm_l2_C05_hinge["high_income"] = y_test_pred
svm_l2_C05_hinge

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  svm_l2_C05_hinge["high_income"] = y_test_pred


Unnamed: 0,id,high_income
0,1,0
1,2,0
2,3,0
3,4,1
4,5,0
...,...,...
6063,6064,0
6064,6065,1
6065,6066,0
6066,6067,0


In [43]:
svm_l2_C05_hinge.to_csv("Predictions/svm_l2_C05_hinge.csv",index = False, header=True)