In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from xgboost.sklearn import XGBClassifier

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [2]:
def load_data():
    """Download the UCI Spambase table and split it into features and labels.

    Returns:
        tuple: ``(X, y)`` as NumPy arrays. ``X`` holds every column except
        the last one; ``y`` holds the last column.
    """
    spambase_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data'
    # The file is read without a header row, so columns are numbered 0..n-1.
    frame = pd.read_csv(spambase_url, header=None)
    label_col = frame.shape[1] - 1
    features = frame.iloc[:, :label_col]
    labels = frame.iloc[:, label_col]
    return features.values, labels.values
	

XGBoost classifier with default settings

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X, y = load_data()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline XGBoost model: only the two keyword arguments below are set explicitly.
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
print(xgb_clf)

xgb_clf = xgb_clf.fit(X_train, y_train)
training_score, testing_score = (
    xgb_clf.score(X_train, y_train),
    xgb_clf.score(X_test, y_test),
)
print(
    "Training accuracy is %.2f and testing accuracy is %.2f"
    % (training_score * 100, testing_score * 100),
    '%',
)

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              eval_metric='mlogloss', gamma=None, gpu_id=None,
              importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              use_label_encoder=False, validate_parameters=None,
              verbosity=None)
Training accuracy is 99.78 and testing accuracy is 95.87 %


In [None]:
def tune_hyperparams(model, param_grid, n_iter=20, cv=10, n_jobs=10, random_state=0):
    """Run a randomized hyperparameter search for ``model`` on the Spambase data.

    The data is loaded with ``load_data`` and split 80/20 with seed 42. The
    search is fitted on the training part only. The recommended parameters
    and the train/test accuracy of the search object are printed.

    Args:
        model: Estimator to tune; passed straight to ``RandomizedSearchCV``.
        param_grid: Dict mapping parameter names to the candidate values the
            search samples from.
        n_iter: Number of parameter settings sampled by the search.
        cv: Cross-validation setting forwarded to ``RandomizedSearchCV``.
        n_jobs: Number of parallel jobs forwarded to ``RandomizedSearchCV``.
        random_state: Seed forwarded to ``RandomizedSearchCV``.

    Returns:
        The fitted ``RandomizedSearchCV`` object. Its ``best_params_`` and
        ``best_score_`` attributes hold the selected parameters and their
        cross-validated accuracy.
    """
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    rs_clf = RandomizedSearchCV(
        model,
        param_grid,
        n_iter=n_iter,
        n_jobs=n_jobs,
        cv=cv,
        scoring='accuracy',
        refit=True,  # lets the search object be scored directly below
        random_state=random_state,
        verbose=0,
    )
    rs_clf.fit(X_train, y_train)

    print(rs_clf.best_params_)
    print('Training Accuracy %.2f' % (rs_clf.score(X_train, y_train) * 100), '%')
    print('Testing Accuracy %.2f' % (rs_clf.score(X_test, y_test) * 100), '%')

    return rs_clf


In [None]:
# Candidate values for the two XGBoost parameters explored by tune_hyperparams.
param_grid = {
    'n_estimators': [10, 30, 50, 70, 90, 150, 200],
    'max_depth': [100, 200, 300, 400, 500, 600, 700, 800, 1000, 2000],
}
xgb_tuned = tune_hyperparams(
    xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    param_grid,
)

In [None]:
# xgb_tuned is the search object returned by tune_hyperparams; show its configuration.
print(xgb_tuned)

Linear SVM with default settings

In [6]:

# Baseline linear SVM: LinearSVC is constructed with no arguments.
svm = LinearSVC()
print(svm)
svm.fit(X_train, y_train)

# Accuracy on the training split first, then on the held-out split.
training_score_svm, testing_score_svm = (
    svm.score(X_train, y_train),
    svm.score(X_test, y_test),
)
print(
    "Training accuracy is %.2f and testing accuracy is %.2f"
    % (training_score_svm * 100, testing_score_svm * 100),
    '%',
)

LinearSVC()
Training accuracy is 88.59 and testing accuracy is 86.43 %


Gaussian SVM with default settings

In [7]:

# Baseline kernel SVM: only the RBF kernel is requested explicitly.
svm = SVC(kernel='rbf')
print(svm)
svm.fit(X_train, y_train)

# Accuracy on the training split first, then on the held-out split.
training_score_svm, testing_score_svm = (
    svm.score(X_train, y_train),
    svm.score(X_test, y_test),
)
print(
    "Training accuracy is %.2f and testing accuracy is %.2f"
    % (training_score_svm * 100, testing_score_svm * 100),
    '%',
)

SVC()
Training accuracy is 71.36 and testing accuracy is 66.23 %


In [12]:
# Grid search over every (C, max_iter) combination for LinearSVC.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'max_iter': [10, 1000, 10000, 100000],
}
svcGrid_linear = GridSearchCV(
    LinearSVC(),
    param_grid,
    refit=True,
    verbose=0,
)
svcGrid_linear.fit(X_train, y_train)

print(svcGrid_linear)
print('Best parameters are:', svcGrid_linear.best_estimator_)

GridSearchCV(estimator=LinearSVC(),
             param_grid={'C': [0.1, 1, 10, 100],
                         'max_iter': [10, 1000, 10000, 100000]})
Best parameters are: LinearSVC(C=0.1, max_iter=100000)


In [13]:
# Grid search over every (C, max_iter) combination for the RBF-kernel SVC.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'max_iter': [10, 1000, 10000, 100000],
}
svcGrid_gaussian = GridSearchCV(
    SVC(kernel='rbf'),
    param_grid,
    refit=True,
    verbose=0,
)
svcGrid_gaussian.fit(X_train, y_train)

print(svcGrid_gaussian)
print('Best parameters are:', svcGrid_gaussian.best_estimator_)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100],
                         'max_iter': [10, 1000, 10000, 100000]})
Best parameters are: SVC(C=100, max_iter=10000)


Linear SVM with tuned hyperparameters

In [14]:
# Refit LinearSVC with the C / max_iter pair reported by the grid search.
svm_linear_tuned = LinearSVC(C=0.1, max_iter=100000)
svm_linear_tuned.fit(X_train, y_train)

# Accuracy on the training split first, then on the held-out split.
training_score_svm, testing_score_svm = (
    svm_linear_tuned.score(X_train, y_train),
    svm_linear_tuned.score(X_test, y_test),
)
print(
    "Training accuracy is %.2f and testing accuracy is %.2f"
    % (training_score_svm * 100, testing_score_svm * 100),
    '%',
)

Training accuracy is 92.45 and testing accuracy is 91.75 %


Gaussian SVM with tuned hyperparameters

In [15]:
# Refit the RBF-kernel SVC with the C / max_iter pair reported by the grid search.
svm_gaussian_tuned = SVC(kernel='rbf', C=100, max_iter=10000)
svm_gaussian_tuned.fit(X_train, y_train)

# Accuracy on the training split first, then on the held-out split.
training_score_svm, testing_score_svm = (
    svm_gaussian_tuned.score(X_train, y_train),
    svm_gaussian_tuned.score(X_test, y_test),
)
print(
    "Training accuracy is %.2f and testing accuracy is %.2f"
    % (training_score_svm * 100, testing_score_svm * 100),
    '%',
)

Training accuracy is 84.13 and testing accuracy is 83.06 %


In [None]:
def _fit_and_report(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split, then print and return both accuracies."""
    model.fit(X_train, y_train)
    training_score = model.score(X_train, y_train)
    testing_score = model.score(X_test, y_test)
    print(training_score, testing_score)
    return training_score, testing_score


def main():
    """Compare default and tuned XGBoost, linear SVM and RBF SVM on Spambase.

    Loads the data, holds out 20% for testing (seed 42), then fits six models
    in turn. For each model one line is printed: training accuracy followed
    by testing accuracy, both as raw fractions.

    Every score is printed from values computed for that same model. The
    previous version printed a misspelled, never-assigned local for the
    default linear SVM, which raised ``NameError`` or picked up a stale
    global of the same name.
    """
    X, y = load_data()

    # Random 80/20 train/test split.
    split = train_test_split(X, y, test_size=0.2, random_state=42)

    # XGBoost: default settings, then tuned n_estimators and max_depth.
    _fit_and_report(
        xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
        *split,
    )
    _fit_and_report(
        xgb.XGBClassifier(
            n_estimators=30,
            max_depth=100,
            use_label_encoder=False,
            eval_metric='mlogloss',
        ),
        *split,
    )

    # Linear SVM: default settings, then tuned C and max_iter.
    _fit_and_report(LinearSVC(), *split)
    _fit_and_report(LinearSVC(C=0.1, max_iter=100000), *split)

    # Gaussian-kernel SVM: default settings, then tuned C and max_iter.
    _fit_and_report(SVC(kernel='rbf'), *split)
    _fit_and_report(SVC(kernel='rbf', C=100, max_iter=10000), *split)


# Entry point: run the full comparison when this file is the top-level script.
if __name__ == "__main__":
    main()
