In [3]:
%matplotlib inline

In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display

from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, make_scorer

from pathlib import Path

In [25]:
RANDOM_STATE = 42

# Title

## Get data

In [15]:
income = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", header=None)

In [33]:
def clean_income_dataset(df: pd.DataFrame):
    col_mapper = {
        0: 'age',
        1: 'workclass',
        2: 'final_weight',
        3: 'education',
        4: 'education_num',
        5: 'marital_status',
        6: 'occupation',
        7: 'relationship',
        8: 'race',
        9: 'sex',
        10: 'capital_gain',
        11: 'capital_loss',
        12: 'hours_per_week',
        13: 'native_country',
        14: 'income_class'
    }

    relevant_cols = ['age', 'workclass', 'final_weight', 'education',
            'marital_status', 'occupation', 'relationship', 'race',
            'sex', 'capital_gain', 'capital_loss', 'hours_per_week',
            'native_country', 'income_class']
    
    categorical_cols = ['workclass', 'education', 'marital_status', 
                    'occupation', 'relationship', 'race', 'sex', 
                    'native_country', 'income_class',]
    
    def tweak_categorical_column(col: pd.Series):
        return (col
                .str.strip()
                .str.lower()
                .astype('category')
                )
    
    df = df.rename(columns=col_mapper)

    return (df
            [relevant_cols]
            .assign(**{col: tweak_categorical_column(df[col]) for col in categorical_cols})
            .astype({'age': 'uint8', 'hours_per_week': 'uint8'})
            )

In [38]:
def split_income_dataset(df: pd.DataFrame):
    labels = (df
              .income_class
             )
    
    attributes = (df
                  .drop(columns="income_class")
                  .pipe(pd.get_dummies, drop_first=True)
                  .pipe(MinMaxScaler().fit_transform)
                  )

    return train_test_split(attributes, labels, train_size=0.8, stratify=df.income_class)

In [39]:
clean_income = clean_income_dataset(income)
clean_income.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             32561 non-null  uint8   
 1   workclass       32561 non-null  category
 2   final_weight    32561 non-null  int64   
 3   education       32561 non-null  category
 4   marital_status  32561 non-null  category
 5   occupation      32561 non-null  category
 6   relationship    32561 non-null  category
 7   race            32561 non-null  category
 8   sex             32561 non-null  category
 9   capital_gain    32561 non-null  int64   
 10  capital_loss    32561 non-null  int64   
 11  hours_per_week  32561 non-null  uint8   
 12  native_country  32561 non-null  category
 13  income_class    32561 non-null  category
dtypes: category(9), int64(3), uint8(2)
memory usage: 1.1 MB


In [41]:
(attributes_train, attributes_test, labels_train, labels_test) = split_income_dataset(clean_income)

for d in [attributes_train, attributes_test, labels_train, labels_test]:
    print(d.shape)

(26048, 99)
(6513, 99)
(26048,)
(6513,)


## Linear SVC

In [92]:
# Baseline model
linear_svc = LinearSVC(
    penalty='l2',
    loss='squared_hinge',
    C=1e6,
    max_iter=1000,
    tol=0.01,
    random_state=RANDOM_STATE,
)

linear_svc.fit(attributes_train, labels_train)



In [93]:
# Feature importances
dummy_cols = (clean_income
 .drop(columns="income_class")
 .pipe(pd.get_dummies, drop_first=True)
 .columns
)

data = np.abs(linear_svc.coef_[0])

coefs = pd.Series(data, dummy_cols)
coefs.nlargest(8)

capital_gain                                 10.998492
education_preschool                           3.915117
education_prof-school                         1.125746
marital_status_married-af-spouse              1.112141
hours_per_week                                1.073089
age                                           1.043121
native_country_trinadad&tobago                1.025673
native_country_outlying-us(guam-usvi-etc)     1.025061
dtype: float64

In [94]:
# F1
labels_predicted = linear_svc.predict(attributes_test)

f1 = f1_score(y_true=labels_test, y_pred=labels_predicted, pos_label='>50k')
f1

0.4729673957903425

In [99]:
# Look for the best hyperparams
grid = {
    'C': [0.01, 1, 100, 10_000],
    'loss': ['squred_hinge', 'hinge'],
}

# custom scorer to specify the positive label
scorer = make_scorer(f1_score, pos_label='>50k')

gs = GridSearchCV(
    estimator=LinearSVC(),
    param_grid=grid,
    scoring=scorer,
    cv=None, # default 5-fold cross validation
)

gs.fit(attributes_train, labels_train)

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\a1056968\Anaconda3\envs\machine_learning\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\a1056968\Anaconda3\envs\machine_learning\lib\site-packages\sklearn\svm\_classes.py", line 257, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "C:\Users\a1056968\Anaconda3\envs\machine_learning\lib\site-packages\sklearn\svm\_base.py", line 1204, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
  File

In [129]:
# Display result
display(gs.best_params_)
display(gs.best_score_.round(2))

# hyperparams ranking by f1 score of the cv test set
params = np.array(gs.cv_results_['params'])
ranks = np.array(gs.cv_results_['rank_test_score']) - 1
display(params[ranks])

{'C': 100, 'loss': 'hinge'}

0.66

array([{'C': 100, 'loss': 'squred_hinge'},
       {'C': 1, 'loss': 'squred_hinge'}, {'C': 100, 'loss': 'hinge'},
       {'C': 0.01, 'loss': 'hinge'}, {'C': 10000, 'loss': 'squred_hinge'},
       {'C': 0.01, 'loss': 'squred_hinge'}, {'C': 10000, 'loss': 'hinge'},
       {'C': 1, 'loss': 'hinge'}], dtype=object)

In [130]:
# F1
labels_predicted = gs.best_estimator_.predict(attributes_test)

f1 = f1_score(y_true=labels_test, y_pred=labels_predicted, pos_label='>50k')
f1

0.6253709198813057