In [34]:
# setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split as tts

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix as CM

In [5]:
# initialization
data = pd.read_csv('drive/MyDrive/DATA301_Data/mobile.csv')

In [6]:
x = data.drop('price_range', axis = 1).values
y = data['price_range'].values

## Question 1
Create one single train/test split with a test size of 25% of the total number of observations and a random_state=123. You will decide if you need to properly scale the data, fit each classifier on the train set, and measure the test's overall accuracy. For each classifier that requires method-specific hyper-parameters, you will determine the best choice from the following ranges:

In [7]:
scale = StandardScaler()
x_train, x_test, y_train, y_test = tts(x, y, test_size = .25, random_state = 123)

### Logistic Regression: no need to tune anything.

 Take solver=‘lbfgs’ and max_iter=10000.’


In [8]:
model_lr = LogisticRegression(solver = 'lbfgs', max_iter = 10000)
model_lr.fit(scale.fit_transform(x_train),y_train)
model_lr.score(scale.transform(x_test), y_test)

0.95

In [9]:
# confusion matrix
mat_lr = CM(y_test, model_lr.predict(scale.transform(x_test)))
mat_lr

array([[117,   2,   0,   0],
       [  8, 116,   4,   0],
       [  0,   4, 129,   4],
       [  0,   0,   3, 113]])

### Support Vector Machine - polynomial kernel.

Test degrees 1,2 and 3 and a range for C between 1 and 75 with increments of 0.5


In [10]:
degree_range = [1, 2, 3]
C_range = np.arange(1, 75, .5)

In [11]:
table = []
for degree in degree_range:
  for C in C_range:
    model = SVC(kernel = 'poly', degree = degree, C = C, probability = False)
    model.fit(scale.fit_transform(x_train), y_train)
    table.append([degree, C, model.score(scale.transform(x_test), y_test)])

In [12]:
tabpoly1 = pd.DataFrame(table, columns = ['degree', 'C', 'score'])
tabpoly1 = tabpoly1.sort_values('score', ascending = False)
tabpoly1

Unnamed: 0,degree,C,score
143,1,72.5,0.970
142,1,72.0,0.970
141,1,71.5,0.970
147,1,74.5,0.968
129,1,65.5,0.968
...,...,...,...
280,2,67.0,0.464
152,2,3.0,0.462
149,2,1.5,0.460
151,2,2.5,0.450


In [13]:
model_poly = SVC(kernel = 'poly', degree = 1, C = 72.5)
model_poly.fit(scale.fit_transform(x_train),y_train)
model_poly.score(scale.transform(x_test), y_test)

0.97

In [14]:
# confusion matrix
mat_poly = CM(y_test, model_poly.predict(scale.transform(x_test)))
mat_poly

array([[118,   1,   0,   0],
       [  1, 123,   4,   0],
       [  0,   2, 133,   2],
       [  0,   0,   5, 111]])

### Support Vector Machine - radially basis kernel

Test gamma between 0.0005 and 0.01 in 0.0001 increments and a range for C between 5 and 75 with increments of 0.5


In [16]:
gamma_range = np.arange(.0005, .01, .0001)
C_range = np.arange(5, 75, .5)

In [17]:
table = []
for gamma in gamma_range:
  for C in C_range:
    model = SVC(kernel = 'rbf', gamma = gamma, C = C, probability = False)
    model.fit(scale.fit_transform(x_train), y_train)
    table.append([gamma, C, model.score(scale.transform(x_test), y_test)])

In [18]:
tabrbf1 = pd.DataFrame(table, columns = ['gamma', 'C', 'score'])
tabrbf1 = tabrbf1.sort_values('score', ascending = False)
tabrbf1

Unnamed: 0,gamma,C,score
1250,0.0013,70.0,0.948
1388,0.0014,69.0,0.948
1251,0.0013,70.5,0.948
1252,0.0013,71.0,0.948
1512,0.0015,61.0,0.948
...,...,...,...
3,0.0005,6.5,0.892
2,0.0005,6.0,0.884
140,0.0006,5.0,0.884
1,0.0005,5.5,0.878


In [19]:
model_rbf = SVC(kernel = 'rbf', gamma = .00013, C = 70.0)
model_rbf.fit(scale.fit_transform(x_train),y_train)
model_rbf.score(scale.transform(x_test), y_test)

0.926

In [20]:
# confusion matrix
mat_rbf = CM(y_test, model_poly.predict(scale.transform(x_test)))
mat_rbf

array([[118,   1,   0,   0],
       [  1, 123,   4,   0],
       [  0,   2, 133,   2],
       [  0,   0,   5, 111]])

### K-Nearest neighbors

 test the performance for any number of neighbors between 3 and 125. The weights could be either ‘uniform’ or ‘distance.’


In [21]:
neigh_range = np.arange(3, 125)
weights = ['uniform', 'distance']

In [22]:
table = []
for neigh in neigh_range:
  for w in weights:
    model = KNeighborsClassifier(weights = w, n_neighbors = neigh)
    model.fit(scale.fit_transform(x_train), y_train)
    table.append([neigh, w, model.score(scale.transform(x_test), y_test)])

In [23]:
tabknn1 = pd.DataFrame(table, columns = ['neighbors', 'weights', 'score'])
tabknn1 = tabknn1.sort_values('score', ascending = False)
tabknn1

Unnamed: 0,neighbors,weights,score
233,119,distance,0.714
237,121,distance,0.714
225,115,distance,0.712
229,117,distance,0.710
235,120,distance,0.708
...,...,...,...
2,4,uniform,0.510
6,6,uniform,0.494
4,5,uniform,0.494
1,3,distance,0.486


In [24]:
model_knn = KNeighborsClassifier(n_neighbors = 119, weights = 'distance')
model_knn.fit(scale.fit_transform(x_train),y_train)
model_knn.score(scale.transform(x_test), y_test)

0.714

In [25]:
# confusion matrix
mat_knn = CM(y_test, model_knn.predict(scale.transform(x_test)))
mat_knn

array([[96, 23,  0,  0],
       [33, 75, 20,  0],
       [ 0, 25, 93, 19],
       [ 0,  1, 22, 93]])

# Question 2

In [33]:
scale = StandardScaler()
kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123)

###Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix as CM

In [28]:
def objective_LR():
  model = LogisticRegression(solver = 'lbfgs', max_iter = 10000)
  pipe = Pipeline([('scale', scale), ('model', model)])

  ext_val = []
  for idxtrain, idxtest in kf.split(x, y):
    pipe.fit(x[idxtrain], y[idxtrain])
    ext_val.append(1 - pipe.score(x[idxtest], y[idxtest]))
  return np.mean(ext_val)

In [29]:
objective_LR()

0.040999999999999995

### Support Vector Machine

In [30]:
C_range = np.arange(1, 75, 0.5)
degrees = [1, 2, 3]

In [31]:
def objective_poly(h):
  degrees = h[0]
  C = h[1]
  model = SVC(kernel = 'poly', C = C, degree = degrees)
  pipe = Pipeline([('scale', scale), ('model', model)])

  ext_val = []
  for idxtrain, idxtest in kf.split(x, y):
    pipe.fit(x[idxtrain], y[idxtrain])
    ext_val.append(1 - pipe.score(x[idxtest], y[idxtest]))
  return np.mean(ext_val)

In [36]:
table = []
for degree in degrees:
  for C in C_range:
    table.append([degree, C, objective_poly(h=[degree,C])])

In [37]:
tab_poly = pd.DataFrame(table, columns = ['Degree', 'C', 'error'])
tab_poly = tab_poly.sort_values('error')
tab_poly

Unnamed: 0,Degree,C,error
84,1,43.0,0.0355
83,1,42.5,0.0355
86,1,44.0,0.0360
62,1,32.0,0.0360
111,1,56.5,0.0360
...,...,...,...
152,2,3.0,0.5230
151,2,2.5,0.5265
150,2,2.0,0.5390
149,2,1.5,0.5405


### Support Vector Machine

In [41]:
gamma_range = np.arange(.0005, .01, .0001)
C_range = np.arange(5, 75, 0.5)

In [42]:
def objective_rad(h):
  gamma = h[0]
  C = h[1]
  model = SVC(kernel = 'rbf', C = C, gamma = gamma)
  pipe = Pipeline([('scale', scale), ('model', model)])

  ext_val = []
  for idxtrain, idxtest in kf.split(x, y):
    pipe.fit(x[idxtrain], y[idxtrain])
    ext_val.append(1 - pipe.score(x[idxtest], y[idxtest]))
  return np.mean(ext_val)

In [None]:
table = []
for gamma in gamma_range:
  for C in C_range:
    table.append([gamma, C, objective_rad(h=[degree,C])])

In [None]:
tab_rad = pd.DataFrame(table, columns = ['gamma', 'C', 'error'])
tab_rad = tab_rad.sort_values('error')
tab_rad

###K-Nearest neighbors

In [44]:
weights = ['distance', 'uniform']
neigh_range = np.arange(3, 125)

In [45]:
def objective_KNN(h):
  weight = h[0]
  neighbor = h[1]
  model = KNeighborsClassifier(weights = weight, n_neighbors = neighbor)
  pipe = Pipeline([('scale', scale), ('model', model)])

  ext_val = []
  for idxtrain, idxtest in kf.split(x, y):
    pipe.fit(x[idxtrain], y[idxtrain])
    ext_val.append(1 - pipe.score(x[idxtest], y[idxtest]))
  return np.mean(ext_val)

In [46]:
table = []
for weight in weights:
  for neighbor in neigh_range:
    table.append([neighbor, weight, objective_KNN(h=[weight, neighbor])])

In [47]:
tab_knn = pd.DataFrame(table, columns = ['n_neighbor', 'weight', 'error'])
tab_knn = tab_knn.sort_values('error')
tab_knn

Unnamed: 0,n_neighbor,weight,error
121,124,distance,0.3295
91,94,distance,0.3305
92,95,distance,0.3330
120,123,distance,0.3335
242,123,uniform,0.3340
...,...,...,...
125,6,uniform,0.4935
124,5,uniform,0.5015
123,4,uniform,0.5050
0,3,distance,0.5185


1) polynomial

2) rbf

3) logistic regression

4) n-nearest neighbor