# Homework 3


## Logistic Regression

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Load the wine data set, then report its dimensions and how many rows
# belong to each class.
wine_dat = pd.read_csv("./wine_original.csv")

n_rows_cols = wine_dat.shape
class_counts = wine_dat["class"].value_counts()
print(n_rows_cols)
print(class_counts)

# Last expression of the cell: preview of the first rows.
wine_dat.head()

Problem 1:

In [None]:
# Target is the 'class' column; every other column is a predictor.
Y = wine_dat['class']
X = wine_dat.drop(columns='class')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Logistic Regression: grid-search the penalty type and the inverse
# regularisation strength C, then report train and test accuracy of the
# refitted best model.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Lasso = L1, Ridge = L2
parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 0.5, 1, 2, 3, 4, 5, 10],
}

# The solver is set explicitly: the current default solver ('lbfgs') does not
# support the 'l1' penalty, so every 'l1' candidate in the grid would fail to
# fit. 'liblinear' supports both 'l1' and 'l2'.
# NOTE(review): if the installed scikit-learn warns about liblinear on a
# multiclass target, 'saga' (with scaled features) is the alternative - confirm.
logreg = LogisticRegression(solver='liblinear')
clf = GridSearchCV(logreg, parameters, verbose=True, n_jobs=-1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# accuracy_score takes (y_true, y_pred).
accuracy = accuracy_score(y_test, y_pred)
train_acc = accuracy_score(y_train, clf.predict(X_train))
print('Selected Parameters: ', clf.best_params_)
print('\nTrain Accuracy = ' + str(train_acc))
print('\nTest Accuracy = ' + str(accuracy))

## Perceptron and SVM

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Four newsgroup categories; headers, footers and quoted replies are removed
# from every post before it is returned.
features = ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.politics.mideast']
strip = ('headers', 'footers', 'quotes')

# Both subsets are fetched with the same category / removal settings.
shared_fetch_args = dict(categories=features, remove=strip)
train_init = fetch_20newsgroups(subset='train', **shared_fetch_args)
test_init = fetch_20newsgroups(subset='test', **shared_fetch_args)

# Raw documents and their integer labels.
X_train, y_train = train_init.data, train_init.target
X_test, y_test = test_init.data, test_init.target

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: accent stripping, lower-casing, English stop words and a
# vocabulary capped at 2000 terms.
tfidf_options = dict(
    strip_accents='unicode',
    lowercase=True,
    decode_error='ignore',
    max_features=2000,
    stop_words='english',
)
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary is fitted on the training documents only and then reused
# to transform the test documents.
X_train_fin = vectorizer.fit_transform(X_train)
X_test_fin = vectorizer.transform(X_test)

for label, matrix in (('Train data:', X_train_fin), ('Test data:', X_test_fin)):
    print(label, matrix.shape)

Problem 2:

In [None]:
from sklearn.linear_model import Perceptron

# Perceptron without a regularisation penalty, trained on the TF-IDF matrix.
clf = Perceptron(penalty=None).fit(X_train_fin, y_train)
y_pred = clf.predict(X_test_fin)

perceptron_test_accuracy = accuracy_score(y_pred, y_test)
print('Test accuracy = ' + str(perceptron_test_accuracy))

Problem 3:

In [None]:
# Sweep the TF-IDF vocabulary size and record the perceptron's test accuracy
# for each setting.
result = []
features = [100, 200, 500, 1000, 1500, 2000, 3000]
for i in features:
    vectorizer = TfidfVectorizer(strip_accents='unicode', lowercase=True, decode_error='ignore',
                                 max_features=i, stop_words='english')
    # Vocabulary is fitted on the training documents only.
    X_train_fin = vectorizer.fit_transform(X_train)
    X_test_fin = vectorizer.transform(X_test)
    clf = Perceptron(penalty=None)
    clf.fit(X_train_fin, y_train)
    y_pred = clf.predict(X_test_fin)
    accuracy = accuracy_score(y_test, y_pred)
    result.append(accuracy)
    print('Test accuracy with top ', i,' features is ', accuracy)

# NOTE: after the loop, X_train_fin / X_test_fin hold the matrices built with
# the last entry of `features`; any later cell that reads them gets that version.
plt.plot(features, result)
plt.title('Perceptron: Accuracy Increases with more Features');
plt.xlabel('Top (N) Features');
plt.ylabel('Accuracy');

As the plot title indicates, there is an overall trend: as we increase the number of top features fed to the model, the model performs better. Interestingly, however, some increases in n produce large jumps in accuracy while others do not.

Problem 4:

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM fitted on whatever TF-IDF matrices are currently bound to
# X_train_fin / X_test_fin.
clf = SVC(kernel='linear').fit(X_train_fin, y_train)
y_pred = clf.predict(X_test_fin)

svm_test_accuracy = accuracy_score(y_pred, y_test)
print('Test accuracy = ' + str(svm_test_accuracy))

Problem 5:

In [None]:
# Vocabulary-size sweep with a linear SVM: one fit per setting, with the test
# accuracy collected for plotting.
features = [100, 200, 500, 1000, 1500, 2000, 3000]
result = []

# Vectoriser settings that stay the same for every vocabulary size.
tfidf_fixed = dict(
    strip_accents='unicode',
    lowercase=True,
    decode_error='ignore',
    stop_words='english',
)

for i in features:
    vectorizer = TfidfVectorizer(max_features=i, **tfidf_fixed)
    X_train_fin = vectorizer.fit_transform(X_train)
    X_test_fin = vectorizer.transform(X_test)

    clf = SVC(kernel='linear').fit(X_train_fin, y_train)
    y_pred = clf.predict(X_test_fin)

    accuracy = accuracy_score(y_pred, y_test)
    result += [accuracy]
    print('Test accuracy with top ', i, ' features is ', accuracy)

# Accuracy as a function of vocabulary size.
plt.plot(features, result)
plt.ylabel('Accuracy');
plt.xlabel('Top (N) Features');
plt.title('SVM: Accuracy Increases with more Features');

As the plot title indicates, there is an overall trend: as we increase the number of top features fed to the model, the model performs better. Unlike the perceptron, the SVM seems to improve steadily as we increase n.

Problem 6:

In [None]:
# Carve a 20% validation set out of the training matrix for hyper-parameter
# tuning; the seed is fixed so the split is repeatable.
X_train_n, X_valid, y_train_n, y_valid = train_test_split(
    X_train_fin,
    y_train,
    test_size=0.2,
    random_state=10,
)

In [None]:
# Tune the linear SVM's cost parameter C on the validation split, plot the
# validation accuracy per candidate, then refit with the best C on the full
# training matrix and score on the test set.
best_acc = 0.0
results = []
cost = [0.01, 0.1, 1, 10, 100]
# Bound up front so the name exists even if no candidate scores above 0.0
# (in that case every candidate is tied and the first is as good as any).
best_cost = cost[0]
for i in cost:
    clf = SVC(kernel='linear', C=i)
    clf.fit(X_train_n, y_train_n)
    y_pred = clf.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    print('The validation accuracy for C = ', i, 'is ', accuracy)
    results.append(accuracy)
    # Strict '>' keeps the smallest C among tied candidates.
    if accuracy > best_acc:
        best_cost = i
        best_acc = accuracy
print('\nOptimal cost (c): ', best_cost, ', validation accuracy=', best_acc)

plt.plot(cost, results)
# The candidates span four orders of magnitude; a log axis spaces them evenly
# instead of squeezing the small values against the left edge.
plt.xscale('log')
plt.title('SVM: Diminishing Marginal Returns Tuning Cost')
plt.xlabel('Cost')
plt.ylabel('Accuracy')
plt.show()

# Final model: best C, trained on the full training matrix, scored on test.
clf = SVC(kernel='linear', C=best_cost)
clf.fit(X_train_fin, y_train)
y_pred = clf.predict(X_test_fin)
accuracy = accuracy_score(y_test, y_pred)
print('\n')
print('Test accuracy with optimal c =', best_cost, 'is ', accuracy)

Problem 7:

In [None]:
# Compare polynomial (degree 1-3), RBF and sigmoid kernels on the validation
# split and remember the best-scoring configuration.
best_acc = 0.0
results = []
deg = [1, 2, 3]

# Bound up front so both names always exist. best_deg stays an integer even
# when a non-polynomial kernel wins, so it can still be passed to
# SVC(degree=...), which only uses it for the 'poly' kernel.
best_kernel = 'poly'
best_deg = deg[0]

for i in deg:
    clf = SVC(C=10000, kernel='poly', degree=i)
    clf.fit(X_train_n, y_train_n)
    y_pred = clf.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    print('Poly validation accuracy with', i, 'degree = ', accuracy)
    results.append(accuracy)
    if accuracy > best_acc:
        best_deg = i
        best_kernel = 'poly'
        best_acc = accuracy

for i in ['rbf', 'sigmoid']:
    clf = SVC(C=10000, kernel=i)
    clf.fit(X_train_n, y_train_n)
    y_pred = clf.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    print(i, 'validation accuracy =', accuracy)
    results.append(accuracy)
    if accuracy > best_acc:
        best_kernel = i
        best_acc = accuracy

# Only report a degree when the polynomial kernel actually won.
if best_kernel == 'poly':
    print('\nBest kernel is ', best_kernel, 'best degree is', best_deg)
else:
    print('\nBest kernel is ', best_kernel, '(degree applies to the poly kernel only)')

In [None]:
# Refit the kernel selected on the validation split using the full training
# matrix, then score it on the test set.
clf = SVC(C=10000, kernel=best_kernel, degree=best_deg)
clf.fit(X_train_fin, y_train)
y_pred = clf.predict(X_test_fin)
accuracy = accuracy_score(y_test, y_pred)

# Build the label from the configuration that was actually selected rather
# than hard-coding one outcome.
if best_kernel == 'poly':
    kernel_label = 'poly kernel, degree ' + str(best_deg)
else:
    kernel_label = str(best_kernel) + ' kernel'
print('Test accuracy using SVM with ' + kernel_label + ' = ', accuracy)

## Custom Kernels

Problem 8:

In [None]:
from sklearn.metrics.pairwise import cosine_similarity, laplacian_kernel

# SVC is handed the kernel callables themselves; the dictionary keys are only
# the printable names used in the report line.
kernels = {
    'cosine_similarity': cosine_similarity,
    'laplacian_kernel': laplacian_kernel,
}

for kernel_name in kernels:
    kernel_fn = kernels[kernel_name]
    clf = SVC(kernel=kernel_fn).fit(X_train_fin, y_train)
    y_pred = clf.predict(X_test_fin)
    accuracy = accuracy_score(y_pred, y_test)
    print('Kernel = ', kernel_name, ', test accuracy = ', accuracy)

Problem 9:

In [None]:
# Mixed kernel K = alpha * cosine + (1 - alpha) * laplacian: choose alpha on
# the validation split, then refit on the full training matrix and score on
# the test set.

# Start the search from scratch. Without this reset, best_acc would still hold
# whatever an earlier cell left in it, and best_alpha might never be assigned.
best_acc = 0.0
best_alpha = 0.0

# The Gram matrices do not depend on alpha, so they are computed once and
# only the weighted sum is rebuilt inside the loop.
cos_train = cosine_similarity(X_train_n)
lap_train = laplacian_kernel(X_train_n)
cos_valid = cosine_similarity(X_valid, X_train_n)
lap_valid = laplacian_kernel(X_valid, X_train_n)

# 11 evenly spaced weights from 0 to 1 inclusive; linspace states the end
# point explicitly instead of relying on a padded float stop value.
for a in np.linspace(0, 1, 11):
    new_train = (a * cos_train) + ((1 - a) * lap_train)
    new_valid = (a * cos_valid) + ((1 - a) * lap_valid)

    clf = SVC(kernel='precomputed')
    clf.fit(new_train, y_train_n)
    y_pred = clf.predict(new_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    print('Validation accuracy with alpha = ', a, 'is ', accuracy)
    if accuracy > best_acc:
        best_alpha = a
        best_acc = accuracy

# Final model: kernels rebuilt on the full training matrix. The prediction
# kernel has one row per test document and one column per training document.
new_train = (best_alpha * cosine_similarity(X_train_fin)) + ((1 - best_alpha) * laplacian_kernel(X_train_fin))
new_test = (best_alpha * cosine_similarity(X_test_fin, X_train_fin)) + ((1 - best_alpha) * laplacian_kernel(X_test_fin, X_train_fin))
clf = SVC(kernel='precomputed')
clf.fit(new_train, y_train)
y_pred = clf.predict(new_test)
accuracy = accuracy_score(y_test, y_pred)
print('\n')
print('Test accuracy with alpha:', best_alpha, '= ', accuracy)

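A valid kernel must satisfy Mercer's condition: the resulting kernel (Gram) matrix must be symmetric positive semi-definite, i.e. all of its eigenvalues are non-negative. In this case the mixing weight is restricted to $0 \le \alpha \le 1$, so both coefficients $\alpha$ and $1-\alpha$ are non-negative, which implies that the integral in Mercer's condition remains non-negative. Since the cosine-similarity and Laplacian kernels are both valid kernels, each can be expressed as an inner product in some feature space. A non-negative weighted sum of them, $K = \alpha K_{\cos} + (1-\alpha) K_{\text{lap}}$, is then the inner product in the concatenated (rescaled) feature space: for any vector $v$, $v^\top K v = \alpha\, v^\top K_{\cos} v + (1-\alpha)\, v^\top K_{\text{lap}} v \ge 0$. The new kernel is therefore valid as well.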