# Task Instructions

Step 0. Import **ALL** packages you need in **ONE** cell   

Step 1. Load Data

Step 2. Model Comparison and Discussion 

Step 3. Conclusion

 

# Step 0. Import **ALL** packages you need in **ONE** cell  

In [15]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score,accuracy_score
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

import csv
import time

def generate_csv(ypredict, filename = 'result.csv'):
    with open(filename,'w',newline = '') as fd:
        writer = csv.writer(fd)
        writer.writerow(['index','default.payment.next.month'])

    for index,pred in enumerate(ypredict):
        with open(filename,'a',newline = '') as fd:
            writer = csv.writer(fd)
            writer.writerow([index,pred])
            
def show_result(groud_truth, prediction):
    print("Accuracy: ", accuracy_score(groud_truth, prediction))
    print("Precision: ",precision_score(groud_truth, prediction))
    print("Recall: ",recall_score(groud_truth, prediction))
    print("Confusion Matrix: ")
    print(confusion_matrix(groud_truth, prediction))
    return accuracy_score(groud_truth, prediction)

# Step 1. Load Data

In [2]:
train_x = pd.read_csv('data_train.csv')
train_x = train_x.iloc[:,1:-1]
train_y = pd.read_csv('answer_train.csv')
train_y = train_y.iloc[:,-1]
test_x = pd.read_csv('data_test.csv')
test_x = test_x.iloc[:,1:-1]
train_split_x, valid_x, train_split_y, valid_y = train_test_split(train_x, train_y, test_size = 0.2, random_state = 1)

# Step 2. Algorithms Comparison and Discussion 

**"In addition to the parameters listed, please provide an analytical discussion for each model as described below. You can also supplement any other parameters that were found to have an impact on the model during the process."**

* Linear Regression: L1/L2   - The weight difference of different features under L1 and L2
* Decision Tree: IG/Gini - The difference between the results of two different index pairs
* Support Vector Machine: Gamma/C - The effect of each of the two parameters on the model
* K-Nearest Neighbor: K  - Effect of different K values on the model


### Linear Regression ###

In [19]:
#No regularization
model=LinearRegression()
model.fit(train_split_x, train_split_y)

ypredict = model.predict(valid_x).round()
ypredict[ypredict>1] = 1
ypredict[ypredict<1] = 0
show_result(valid_y, ypredict)

model=LinearRegression()
model.fit(train_x, train_y)
ypredict=model.predict(test_x).round()
ypredict[ypredict>1] = 1
ypredict[ypredict<1] = 0
generate_csv(ypredict, 'LR_Result.csv')

Accuracy:  0.8085416666666667
Precision:  0.7241379310344828
Recall:  0.14554455445544554
Confusion Matrix: 
[[3734   56]
 [ 863  147]]


In [18]:
#L1
model=Lasso(alpha = 0.1)
model.fit(train_split_x, train_split_y)

ypredict = model.predict(valid_x).round()
ypredict[ypredict>1] = 1
ypredict[ypredict<1] = 0
show_result(valid_y, ypredict)

model=Lasso(alpha = 0.1)
model.fit(train_x, train_y)
ypredict=model.predict(test_x).round()
ypredict[ypredict>1] = 1
ypredict[ypredict<1] = 0
generate_csv(ypredict, 'L1_Result.csv')

Accuracy:  0.7895833333333333
Precision:  0.0
Recall:  0.0
Confusion Matrix: 
[[3790    0]
 [1010    0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
#L2
model=Ridge(alpha = 0.1)
model.fit(train_split_x, train_split_y)

ypredict = model.predict(valid_x).round()
ypredict[ypredict>1] = 1
ypredict[ypredict<1] = 0
show_result(valid_y, ypredict)

model=Ridge(alpha = 0.1)
model.fit(train_x, train_y)
ypredict=model.predict(test_x).round()
ypredict[ypredict>1] = 1
ypredict[ypredict<1] = 0
generate_csv(ypredict, 'L2_Result.csv')

Accuracy:  0.8085416666666667
Precision:  0.7241379310344828
Recall:  0.14554455445544554
Confusion Matrix: 
[[3734   56]
 [ 863  147]]


### Decision Tree ###

In [10]:
#information gain
start = time.time()
model=DecisionTreeClassifier(criterion = 'entropy', max_depth = 4, random_state=1)
model.fit(train_split_x, train_split_y)

ypredict = model.predict(valid_x)
show_result(valid_y, ypredict)

model=DecisionTreeClassifier(criterion = 'entropy', max_depth = 4, random_state=1)
model.fit(train_x, train_y)
ypredict=model.predict(test_x)
generate_csv(ypredict, 'DTIG_Result.csv')

Accuracy:  0.82875
Precision:  0.6721611721611722
Recall:  0.36336633663366336
Confusion Matrix: 
[[3611  179]
 [ 643  367]]


In [11]:
#gini impurity
start = time.time()
model=DecisionTreeClassifier(criterion = 'gini', max_depth = 4, random_state=1)
model.fit(train_split_x, train_split_y)

ypredict = model.predict(valid_x)
show_result(valid_y, ypredict)

model=DecisionTreeClassifier(criterion = 'gini', max_depth = 4, random_state=1)
model.fit(train_x, train_y)
ypredict=model.predict(test_x)
generate_csv(ypredict, 'DTgini_Result.csv')

Accuracy:  0.8277083333333334
Precision:  0.6678899082568808
Recall:  0.3603960396039604
Confusion Matrix: 
[[3609  181]
 [ 646  364]]


### Support Vector Machine ###

In [None]:
# Training the SVM model
start = time.time()
model = SVC(kernel='linear', random_state=1)
model.fit(train_split_x, train_split_y)
print("Training time: ", time.time() - start)
print()

# Evaluating the SVM model on the validation set
ypredict = model.predict(valid_x)
show_result(valid_y, ypredict)

# Generating CSV file for the SVM model's predictions on the test set
model = SVC(kernel='linear', random_state=1)
model.fit(train_x, train_y)
ypredict = model.predict(test_x)
generate_csv(ypredict, 'SVM_Result.csv')

### K-Nearest Neighbor ###

In [None]:
start = time.time()
neighbors = [5,10,20,50,100,200]
best_acc = 0
for i in neighbors:
    model=KNeighborsClassifier(n_neighbors = i)
    model.fit(train_split_x, train_split_y)
    print("N =", i)
    print("Training time: ", time.time() - start)

    ypredict = model.predict(valid_x)
    acc = show_result(valid_y, ypredict)
    print()
    if acc > best_acc:
        best_acc = acc
        best_n = i    

model=KNeighborsClassifier(n_neighbors = 50)
model.fit(train_x, train_y)
ypredict=model.predict(test_x)
generate_csv(ypredict, 'KNN_Result.csv')

N = 5
Training time:  0.009003639221191406
Accuracy:  0.7666666666666667
Precision:  0.38866396761133604
Recall:  0.1900990099009901
Confusion Matrix: 
[[3488  302]
 [ 818  192]]

N = 10
Training time:  2.1960880756378174
Accuracy:  0.7875
Precision:  0.4772727272727273
Recall:  0.10396039603960396
Confusion Matrix: 
[[3675  115]
 [ 905  105]]

N = 20
Training time:  4.345383405685425
Accuracy:  0.7885416666666667
Precision:  0.48366013071895425
Recall:  0.07326732673267326
Confusion Matrix: 
[[3711   79]
 [ 936   74]]

N = 50
Training time:  6.510468006134033
Accuracy:  0.7902083333333333
Precision:  0.5151515151515151
Recall:  0.0504950495049505
Confusion Matrix: 
[[3742   48]
 [ 959   51]]

N = 100
Training time:  8.71213674545288
Accuracy:  0.7910416666666666
Precision:  0.5573770491803278
Recall:  0.033663366336633666
Confusion Matrix: 
[[3763   27]
 [ 976   34]]

N = 200
Training time:  11.273615837097168
Accuracy:  0.7902083333333333
Precision:  0.6153846153846154
Recall:  0.007

# Step 3. Conclusion #

Conduct a comparison among the four algorithms, considering factors such as performance, efficiency, and any additional insights you would like to share regarding this assignment.
