# In [14]:  (Jupyter notebook cell marker left over from the export)
import numpy as np
from sklearn.metrics import f1_score
from sklearn.utils import shuffle as skshuffle
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import math
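'''
author: Han Zhang
date:   April 8th, 2019
Binary classification of the samples in spambase.data with a logistic
regression model: hyperparameters are tuned by a 5-fold grid search on the
training split, and the best model is scored by F1 on the held-out test split.
'''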

#0. define module-level configuration.
# Fraction of the shuffled samples that split_data() holds out as the test set.
SPLIT = 0.4
# Comma-separated data file read by load_data(); the last field of every line
# is parsed as the integer label, the preceding fields as float features.
INPUT_PATH = "./spambase.data"
# Text file to which main() writes the best hyperparameters and the F1 score.
OUTPUT_PATH = "./summary.txt"

#1.load data
def load_data(path):
    """Read a comma-separated data file into feature vectors and labels.

    Every non-empty line is split on commas; all fields except the last are
    parsed as floats and form one feature vector, and the last field is
    parsed as the integer class label.

    Blank or whitespace-only lines (e.g. a trailing empty line at the end of
    the file) are skipped instead of raising ``ValueError``.

    Args:
        path: path of the text file to read.

    Returns:
        A tuple ``(features, ylabel)`` where ``features`` is a list of 1-D
        numpy float arrays (one per sample) and ``ylabel`` is the list of the
        corresponding integer labels.

    Raises:
        ValueError: if a non-empty line contains a field that cannot be
            parsed as a number.
    """
    features = []
    ylabel = []
    with open(path, 'r') as infile:
        for raw_line in infile:
            # strip() removes the newline (and a possible '\r') plus any
            # surrounding whitespace, so empty lines can be detected reliably.
            record = raw_line.strip()
            if not record:
                continue
            fields = record.split(",")
            features.append(np.array([float(value) for value in fields[:-1]]))
            ylabel.append(int(fields[-1]))
    return features, ylabel

#2. split data into training and test set
def split_data(features,ylabel,random_state=None):
    """Shuffle the samples and split them into a test part and a training part.

    The samples are shuffled together with their labels; the first
    ``floor(len(features) * SPLIT)`` shuffled samples become the test set and
    the remaining ones the training set.

    Note: the shuffle is not stratified, so a split might end up with only one
    class in the training or testing data. If that happens, call again with a
    different ``random_state``.

    Args:
        features: sequence of per-sample feature vectors.
        ylabel: sequence of labels aligned with ``features``.
        random_state: optional seed forwarded to ``sklearn.utils.shuffle`` to
            make the split reproducible. The default ``None`` keeps the
            previous behaviour (a different shuffle on every call).

    Returns:
        A tuple ``(test_x, test_y, train_x, train_y)`` -- the test part comes
        first.
    """
    features, ylabel = skshuffle(features, ylabel, random_state=random_state)
    num_test = math.floor(len(features) * SPLIT)
    test_x, train_x = features[:num_test], features[num_test:]
    test_y, train_y = ylabel[:num_test], ylabel[num_test:]
    return test_x, test_y, train_x, train_y

#3. run a grid search to find the optimal hyperparameters
def fit_best_model(train_x,train_y):
    """Grid-search logistic regression hyperparameters on the training data.

    Four hyperparameters of scikit-learn's LogisticRegression are searched:
      1. solver        -- the optimizer used to maximize the likelihood
                          (liblinear, newton-cg, sag, lbfgs, saga);
      2. penalty       -- the regularization type; newton-cg, sag and lbfgs
                          are only paired with L2 here, so L1 gets its own
                          grid restricted to liblinear and saga;
      3. C             -- the regularization strength setting;
      4. fit_intercept -- whether an intercept term is fitted.
    Candidates are compared by GridSearchCV with cv=5 folds.

    Args:
        train_x: training feature vectors.
        train_y: training labels aligned with train_x.

    Returns:
        A tuple (best_estimator, best_params): the best estimator found by the
        search, followed by the dict of its hyperparameters.
    """
    # Grid for the L2 penalty: every solver listed above is tried.
    l2_grid = dict(
        solver=['liblinear', 'newton-cg', 'sag', 'lbfgs', 'saga'],
        penalty=['l2'],
        C=[1, 5, 10],
        fit_intercept=[True, False],
    )
    # Grid for the L1 penalty: only liblinear and saga are tried.
    l1_grid = dict(
        solver=['liblinear', 'saga'],
        penalty=['l1'],
        C=[1, 5, 10],
        fit_intercept=[True, False],
    )
    search = GridSearchCV(LogisticRegression(), [l2_grid, l1_grid], cv=5)
    search.fit(train_x, train_y)
    return search.best_estimator_, search.best_params_
    
    

#4. score performance on test data (F1 score); main() prints the score and writes it to file
def scoring(best_estimator,test_x,test_y):
    """Return the F1 score of the estimator's predictions on the test data.

    Args:
        best_estimator: fitted estimator exposing a predict() method.
        test_x: test feature vectors.
        test_y: true labels aligned with test_x.

    Returns:
        The value computed by sklearn.metrics.f1_score for the true labels
        against the predicted labels.
    """
    predictions = best_estimator.predict(test_x)
    return f1_score(y_true=test_y, y_pred=predictions)
    
def main():
    """Run the whole pipeline: load, split, tune, score and report.

    The best hyperparameters and the F1 score on the test split are printed
    and also written to OUTPUT_PATH.
    """
    samples = load_data(INPUT_PATH)
    test_x, test_y, train_x, train_y = split_data(*samples)

    best_est, best_params = fit_best_model(train_x, train_y)
    print(best_params)

    result = scoring(best_est, test_x, test_y)
    print("f1-score: {}".format(result))

    # Same two pieces of text as before, assembled once and written in one go.
    summary = str(best_params) + "\n" + "f1-score:" + str(result)
    with open(OUTPUT_PATH, 'w') as outfile:
        outfile.write(summary)


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()







# Output of a recorded run:
# {'C': 1, 'fit_intercept': True, 'penalty': 'l1', 'solver': 'liblinear'}
# f1-score: 0.9026548672566372


