# Powerful Classifiers 
Here we will try powerful classifiers, including support vector machines, random forests, and neural networks. Remember that 90% of Machine Learning is about classification. This lecture covers classifiers that are accurate but hard to interpret.


# Load file
Two libraries are commonly used to load CSV files:
- numpy functions `np.loadtxt` and `np.genfromtxt`
- pandas function `pd.read_csv`

Here we prefer using pandas

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

# Assemble the CSV location from its directory and file name, then read it
# into a DataFrame.
path = 'data/'
filename = f'{path}spamdata.csv'
spam = pd.read_csv(filename)

In [2]:
# Columns 0-56 are used as the features, column 57 as the target.
X, y = spam.values[:, :57], spam.values[:, 57]

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

from sklearn.metrics import confusion_matrix, accuracy_score

# Random Forests
Random forests are one of the most powerful classification tools. Computation for a moderate number of samples is rather fast.



In [3]:
from sklearn.tree import DecisionTreeClassifier
# Depth-10 decision tree; it is handed to BaggingClassifier below as the base learner.
classification_tree_spam = DecisionTreeClassifier(max_depth = 10)

In [7]:
from sklearn.ensemble import BaggingClassifier
# Bag 100 copies of the base tree, then report accuracy on the training split.
bag = BaggingClassifier(
    classification_tree_spam,
    n_estimators=100,
    random_state=1,
)
bag.fit(X_train, y_train)

y_bag_train = bag.predict(X_train)
accuracy_score(y_train, y_bag_train)

0.9698369565217392

In [11]:
# Accuracy of the bagged trees on the held-out test split.
y_bag_test = bag.predict(X_test)
accuracy_score(y_test, y_bag_test)

0.9446254071661238

In [13]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with the same tree count and depth limit as the bagged model.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=1,
)
rf.fit(X_train, y_train)

# Training-set accuracy of the forest.
y_rf_train = rf.predict(X_train)
accuracy_score(y_train, y_rf_train)

0.967391304347826

In [14]:
# Accuracy of the random forest on the held-out test split.
y_rf_test = rf.predict(X_test)
accuracy_score(y_test, y_rf_test)

0.9500542888165038

# Fine-tune the depth of the trees

In [None]:
# Sweep the maximum tree depth of a random forest and record the accuracy
# on both the test and the training split for every depth.
acc = []        # test-set accuracy, one entry per depth
acc_train = []  # training-set accuracy, one entry per depth
depth = np.arange(1, 50)
for i in depth:
    # Fit a random forest whose trees are limited to depth i
    dt = RandomForestClassifier(n_estimators=100, max_depth=i, random_state=1)
    dt.fit(X_train, y_train)
    # Predict on the test set
    y_pred = dt.predict(X_test)
    # Compute the accuracy on both splits
    score = accuracy_score(y_test, y_pred)
    acc.append(score)
    acc_train.append(accuracy_score(y_train, dt.predict(X_train)))
# Plot both curves; the legend distinguishes the test curve from the red
# training curve.
plt.plot(depth, acc, '-', label='test')
plt.plot(depth, acc_train, 'r', label='train')
plt.xlabel('Depth of the trees')
plt.ylabel('Accuracy')
plt.legend()
plt.title('spam');

In [None]:
# Look the winning depth up in `depth` instead of assuming it equals index + 1,
# so the report stays correct if the depth grid changes.
# NOTE: `acc` was measured on the test split, so this accuracy is optimistic.
print("The best depth of the trees is ", depth[np.argmax(acc)], " with accuracy of ", np.amax(acc))


# Support Vector Machines

SVMs are linear models, like linear or logistic regression, whose decision boundary is fitted in a kernel-induced feature space.

In [None]:
# try C=1, C=10, C=100
from sklearn.svm import SVC
# Fit an SVC with C=10 on the training split; no kernel is passed, so the
# library default is used.
sv = SVC(C=10)
sv.fit(X_train,y_train)

In [None]:
# Training accuracy of the SVC. Arguments follow accuracy_score's documented
# (y_true, y_pred) order, as in the tree-based cells.
y_svc_train = sv.predict(X_train)
accuracy_score(y_train, y_svc_train)

In [None]:
# Test accuracy of the SVC. Arguments follow accuracy_score's documented
# (y_true, y_pred) order, as in the tree-based cells.
y_svc_test = sv.predict(X_test)
accuracy_score(y_test, y_svc_test)

# SVM and Cross-Validation

In [None]:
# Only one iteration of KFold for single C
from sklearn.model_selection import KFold
# k-fold cross-validation of an SVC with a single, fixed C.
k = 5
acck = np.zeros(k)  # number of correctly classified samples in each fold
# Seeded like the other estimators in this notebook so the shuffled folds
# (and therefore the reported accuracy) are repeatable between runs.
kf = KFold(n_splits=k, shuffle=True, random_state=1)
# Split on X itself: it is the array that the fold indices are applied to.
for i, (train_i, test_i) in enumerate(kf.split(X)):
    sv = SVC(C=10)
    sv = sv.fit(X[train_i], y[train_i])
    # normalize=False yields the count of correct predictions, not a fraction
    acck[i] = accuracy_score(y[test_i], sv.predict(X[test_i]), normalize=False)
    


In [None]:
# Overall cross-validated accuracy: correct predictions summed over all folds,
# divided by the total number of samples.
acck.sum() / len(X)

In [None]:
# One pass of k-fold CV per candidate C; the candidates are nb evenly spaced
# values between 0.01 and 100.
nb = 5
penalty = np.linspace(0.01, 100, nb)
k = 5
acck = np.zeros(k)            # correct-prediction counts per fold (reused for every C)
acc = np.zeros(len(penalty))  # cross-validated accuracy, one entry per C
# With an integer seed every kf.split() call yields the same folds, so all C
# values are scored on identical splits and their accuracies are comparable.
kf = KFold(n_splits=k, shuffle=True, random_state=1)
for c, c_value in enumerate(penalty):
    for i, (train_i, test_i) in enumerate(kf.split(X)):
        sv = SVC(C=c_value)
        sv = sv.fit(X[train_i], y[train_i])
        # normalize=False yields the count of correct predictions, not a fraction
        acck[i] = accuracy_score(y[test_i], sv.predict(X[test_i]), normalize=False)
    # Total correct over all folds divided by the number of samples
    acc[c] = np.sum(acck)/X.shape[0]

In [None]:
# Show the cross-validated accuracy obtained for each value in `penalty`.
acc

## SVM on Synthetic Data

In [None]:
# We'll define a function to draw a nice plot of an SVM
def plot_svc(svc, X, y, h=0.02, pad=0.25, show_support_vectors=False):
    """Plot a fitted classifier's decision regions together with the data.

    Parameters
    ----------
    svc : fitted estimator
        Its ``predict`` method is called on every point of the plotting grid.
        ``support_vectors_`` and ``support_`` are read only when
        ``show_support_vectors`` is true.
    X : array
        Data points; only columns 0 and 1 are read (horizontal and vertical
        axis respectively).
    y : array
        Values used to colour the points in ``X``.
    h : float, optional
        Step of the grid on which the classifier is evaluated.
    pad : float, optional
        Margin added around the data range on both axes.
    show_support_vectors : bool, optional
        When true, mark the support vectors with black crosses and print how
        many there are. Defaults to False.
    """
    x_min, x_max = X[:, 0].min() - pad, X[:, 0].max() + pad
    y_min, y_max = X[:, 1].min() - pad, X[:, 1].max() + pad
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Classify every grid point, then fold the flat predictions back into the
    # grid shape so they can be drawn as filled contours.
    Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.2)

    # Same colormap as the regions so points and background colours agree.
    plt.scatter(X[:, 0], X[:, 1], s=70, c=y, cmap=plt.cm.Paired)
    if show_support_vectors:
        sv = svc.support_vectors_
        plt.scatter(sv[:, 0], sv[:, 1], c='k', marker='x', s=100, linewidths=1)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
    if show_support_vectors:
        print('Number of support vectors: ', svc.support_.size)

In [None]:
# Synthetic two-class problem: 30 standard-normal points with 2 features each;
# the first 15 are labelled 1 and the last 15 are labelled -1.
# NOTE: this rebinds the notebook-level names X and y.
np.random.seed(5)
X = np.random.randn(30, 2)
y = np.repeat([1, -1], 15)

# Shift the -1 class by +1 along both features so the two classes are offset.
X[y == -1] += 1

In [None]:
# Scatter the points, coloured by class label, to judge by eye whether the
# classes are linearly separable.
plt.scatter(X[:,0], X[:,1], s=70, c=y, cmap=mpl.cm.Paired)
plt.xlabel('X1')
plt.ylabel('X2')

In [None]:
# Next, we fit a support vector classifier (RBF kernel, C=10):
svc = SVC(C=10, kernel='rbf')
svc.fit(X, y)

We can now plot the support vector classifier by calling the `plot_svc()` function on the fitted `SVC()` object, together with the data used to fit it:

In [None]:
# Draw the decision regions of the RBF-kernel classifier fitted above.
plot_svc(svc, X, y)

In [None]:
# Same data, now with a linear kernel and C=1; fit() returns the estimator
# itself, so construction and fitting can be chained.
svc2 = SVC(kernel='linear', C=1).fit(X, y)
plot_svc(svc2, X, y)

# Select the optimal C

In [None]:
from sklearn.model_selection import GridSearchCV

# Cross-validated grid search over C for a linear-kernel SVC
# (10 folds, candidates ranked by accuracy).
tuned_parameters = [{'C': [0.001, 0.01, 0.1, 1, 5, 10, 100]}]
clf = GridSearchCV(
    estimator=SVC(kernel='linear'),
    param_grid=tuned_parameters,
    cv=10,
    scoring='accuracy',
)
clf.fit(X, y)

In [None]:
# Show the parameter setting selected by the grid search.
clf.best_params_

# Neural Networks
Shallow neural networks often produce results comparable to random forests, bagging and boosting.

In [None]:
# build two hidden layers, each layer with 10 neurons
from sklearn.neural_network import MLPClassifier

In [None]:
# Two hidden layers of 10 logistic units each. random_state is fixed, like the
# random_state=1 used by the other models in this notebook, so that repeated
# runs give the same fitted network.
nn = MLPClassifier(hidden_layer_sizes=(10, 10), activation='logistic', random_state=1)
nn.fit(X_train, y_train)

In [None]:
# Training accuracy of the network, with arguments in accuracy_score's
# documented (y_true, y_pred) order.
accuracy_score(y_train, nn.predict(X_train))

In [None]:
# Test accuracy of the network, with arguments in accuracy_score's
# documented (y_true, y_pred) order.
accuracy_score(y_test, nn.predict(X_test))