# A Machine Learning Toolkit
Jonathan Zia (zia@gatech.edu)

## Import Datasets for Regression and Classification

In [None]:
# Diabetes dataset (regression)
from sklearn.datasets import load_diabetes
# "Digits" dataset (classification)
from sklearn.datasets import load_digits

The "digits" dataset is composed of 1797 samples of 8x8 images. This results in a feature vector of length N = 64.

In [None]:
# Number of digit classes to request from the "digits" dataset
n_classes = 10
digits = load_digits(n_class=n_classes)

# Report the shape of the feature matrix, then of the label vector
print(digits.data.shape, digits.target.shape, sep="\n")

# Short aliases used throughout the rest of the notebook
x = digits.data
y = digits.target

Let's see an example digit from the dataset.

In [None]:
# Plot an example of the dataset
import matplotlib.pyplot as plt

# Use the gray colormap for image plots
plt.gray()

# matshow() opens its own figure when no fignum is given, so an explicit
# plt.figure() beforehand would only leave an empty extra figure behind.
plt.matshow(digits.images[0]) # <-- Integer number to view

## Dimensionality Reduction

We can perform linear dimensionality reduction with PCA using scikit-learn.

In [None]:
# PCA lives in scikit-learn's decomposition sub-package
from sklearn import decomposition

# Build the PCA mapping and learn it from the data.
# Here n_components equals the number of input features, so every component
# is kept; pass a value smaller than x.shape[1] to reduce dimensionality.
pca = decomposition.PCA(n_components=x.shape[1])
pca.fit(x)

# Project the data with the fitted mapping
x_pca = pca.transform(x)

Let's plot the first three PCs!

In [None]:
# Import the Matplotlib 3D utility (registers the '3d' projection)
from mpl_toolkits import mplot3d

# Import color map utilities
import matplotlib.colors as colors
import matplotlib.cm as cmx

# Get the first three dimensions of the new PCA data
pca_data = x_pca[:, 0:3]

# Initialize the figure
fig, ax = plt.figure(), plt.axes(projection='3d')

# Prepare the color map
colormap = plt.get_cmap('gist_rainbow') # Import color map
cNorm = colors.Normalize(vmin=0, vmax=n_classes) # Normalize color map values to the number of classes
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=colormap) # Create a scalar map of the color map

# Map every class label to its RGBA color in one call (one row per sample)
point_colors = scalarMap.to_rgba(y)

# Draw all samples with a single scatter call instead of one call per point;
# this creates one artist rather than one per sample and renders far faster.
ax.scatter3D(pca_data[:, 0], pca_data[:, 1], pca_data[:, 2], c=point_colors)

The most useful attributes are the principal component (PC) vectors and variance explained.

In [None]:
# The PCs are an attribute of the trained PCA object
PC = pca.components_

# The variance explained is also an attribute of the object.
# It is stored as a fraction of the total variance (values sum to at most 1).
var_exp = pca.explained_variance_ratio_

# Plot the variance explained as a bar graph
fig, ax = plt.figure(), plt.axes()
ax.set_xlabel("Principal Component (PC)")
ax.set_ylabel("Variance Explained (%)")
# Scale the fractions to percent so the bars agree with the axis label
ax.bar(range(0, var_exp.shape[0]), 100 * var_exp)
plt.title("Variance Explained")

We can visualize clustering in the data with t-SNE using scikit-learn.

In [None]:
from sklearn import manifold

# Again, we start by initializing our t-SNE object
# The attribute n_components sets the number of dimensions of the result
# The perplexity is the key hyperparameter for t-SNE -> akin to # of nearest neighbors
# random_state is fixed (as for the other stochastic models in this notebook)
# so that the embedding is reproducible from run to run
tsne = manifold.TSNE(n_components = 2, perplexity = 30, random_state = 42)

# Obtain the embedding of the data
tsne_embedding = tsne.fit_transform(x)

# Let's plot the embedding on a scatter plot
fig, ax = plt.figure(), plt.axes()

# Look up the RGBA color of every sample's class in one call, then draw all
# points with a single scatter call. Passing an (n_samples, 4) array through
# `c` is unambiguous, unlike a lone RGBA tuple, which matplotlib may mistake
# for four values to be color-mapped.
ax.scatter(tsne_embedding[:, 0], tsne_embedding[:, 1], c = scalarMap.to_rgba(y))

## Classification

We'll explore how to implement various different classifiers with scikit-learn. Let's first partition the data into training and testing sets.

### Training / Testing Set Splitting

In [None]:
# Since the samples are already randomized, we just split the set in half.
# The first half is used for training and everything from the midpoint to the
# end for testing, so no sample is left out of both sets.
n_train = x.shape[0] // 2
x_train, x_test = x[:n_train, :], x[n_train:, :]
y_train, y_test = y[:n_train], y[n_train:]

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# Initialize the logistic regression model
# NOTE(review): with default settings the solver may stop before converging on
# unscaled pixel values -- consider scaling the inputs or raising max_iter.
lr_model = LogisticRegression(random_state = 42)

# Fit the model to the training data
lr_model.fit(x_train, y_train)

# Generate predictions for testing data (class labels)
preds_lr = lr_model.predict(x_test)

# Let's visualize the performance with a confusion matrix!
# Just importing some useful visualization packages here
from sklearn.metrics import confusion_matrix
import seaborn as sb
# Generating a confusion matrix
cm = confusion_matrix(y_test, preds_lr)
# It looks great in a gray colormap
gray_map = plt.get_cmap('gist_yarg')
# Converting the matrix to a heatmap for visualization
heat_map = sb.heatmap(cm, cmap=gray_map)
plt.title("Logistic Regression Confusion Matrix")
plt.show()

# Print the accuracy (fraction of matching labels, as a percentage)
print(f"LR Accuracy: {round(100 * np.mean(preds_lr == y_test), 2)}%")

# Let's visualize how confident we were in our predictions
counter = np.zeros([1, n_classes]) # Total number of samples for each class
cumulative = np.zeros([1, n_classes]) # Cumulative confidence over all samples of a particular class
# Get the class probabilities for every test sample in a single batched call
# rather than calling predict_proba once per sample
probs = lr_model.predict_proba(x_test)
# Probability assigned to the correct class of each sample.
# Assumes column k of predict_proba corresponds to label k, i.e. every label
# 0..n_classes-1 occurs in the training set.
true_class_probs = probs[np.arange(y_test.shape[0]), y_test]
# Unbuffered accumulation: repeated labels each contribute to their class slot
np.add.at(counter[0], y_test, 1)
np.add.at(cumulative[0], y_test, true_class_probs)
# Compute the confidence per sample in each class
confidence = np.divide(cumulative, counter)
# Plot the confidence on a bar graph, showing confidence in each class
plt.bar(range(0, confidence.shape[1]), np.squeeze(confidence))
plt.title("Logistic Regression Confidence")
plt.show()

### Linear/Quadratic Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Initialize the LDA/QDA models
lda, qda = LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis()

# Generate fits using the training data
lda.fit(x_train, y_train)
qda.fit(x_train, y_train)

# Generate predictions using the testing data
preds_lda, preds_qda = lda.predict(x_test), qda.predict(x_test)

# Print the accuracy of each model
print(f"LDA Accuracy: {round(100 * np.mean(preds_lda == y_test), 2)}%")
print(f"QDA Accuracy: {round(100 * np.mean(preds_qda == y_test), 2)}%")

# Visualize the performance of each with a confusion matrix
cm_lda, cm_qda = confusion_matrix(y_test, preds_lda), confusion_matrix(y_test, preds_qda)
heat_map_lda = sb.heatmap(cm_lda, cmap=gray_map)
plt.title("LDA Confusion Matrix")
plt.show()
heat_map_qda = sb.heatmap(cm_qda, cmap=gray_map)
plt.title("QDA Confusion Matrix")
plt.show()

# Let's visualize how confident we were in our predictions
# Size the accumulators from n_classes (not a literal 10) so this cell follows
# the number of classes chosen when the dataset was loaded
counter = np.zeros([1, n_classes])
lda_cumulative, qda_cumulative = np.zeros([1, n_classes]), np.zeros([1, n_classes])
# Class probabilities for the whole test set, one batched call per model
probs_lda, probs_qda = lda.predict_proba(x_test), qda.predict_proba(x_test)
# Row index of every test sample, used to pick each sample's true-class column.
# Assumes column k of predict_proba corresponds to label k.
sample_idx = np.arange(y_test.shape[0])
# Unbuffered accumulation: repeated labels each contribute to their class slot
np.add.at(counter[0], y_test, 1)
np.add.at(lda_cumulative[0], y_test, probs_lda[sample_idx, y_test])
np.add.at(qda_cumulative[0], y_test, probs_qda[sample_idx, y_test])
# Mean probability assigned to the correct class, per class
lda_confidence, qda_confidence = np.divide(lda_cumulative, counter), np.divide(qda_cumulative, counter)
plt.bar(range(0, lda_confidence.shape[1]), np.squeeze(lda_confidence))
plt.title("LDA Confidence")
plt.show()
plt.bar(range(0, qda_confidence.shape[1]), np.squeeze(qda_confidence))
plt.title("QDA Confidence")
plt.show()

### k-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize k-NN model and specify number of nearest neighbors
neighbors = 5
knn = KNeighborsClassifier(n_neighbors = neighbors)

# Fit the model to the training data
knn.fit(x_train, y_train)

# Test the model on the testing data
preds_knn = knn.predict(x_test)

# Print the accuracy of the model
print(f"k-NN Accuracy: {round(100 * np.mean(preds_knn == y_test), 2)}%")

# Visualize the performance with a confusion matrix
cm_knn = confusion_matrix(y_test, preds_knn)
heat_map_knn = sb.heatmap(cm_knn, cmap=gray_map)
plt.title("k-NN Confusion Matrix")
plt.show()

# Let's visualize how confident we were in our predictions
# Size the accumulators from n_classes (not a literal 10) so this cell follows
# the number of classes chosen when the dataset was loaded
counter, cumulative = np.zeros([1, n_classes]), np.zeros([1, n_classes])
# Class probabilities for the whole test set in a single batched call
probs = knn.predict_proba(x_test)
# Probability assigned to the correct class of each sample.
# Assumes column k of predict_proba corresponds to label k.
true_class_probs = probs[np.arange(y_test.shape[0]), y_test]
# Unbuffered accumulation: repeated labels each contribute to their class slot
np.add.at(counter[0], y_test, 1)
np.add.at(cumulative[0], y_test, true_class_probs)
# Mean probability assigned to the correct class, per class
confidence = np.divide(cumulative, counter)
plt.bar(range(0, confidence.shape[1]), np.squeeze(confidence))
plt.title("k-NN Confidence")
plt.show()

### Classification Trees (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the random forest classifier
# n_estimators: number of trees in the forest
# max_depth: maximum depth of tree
rf = RandomForestClassifier(n_estimators = 100, max_depth = 2, random_state=42)

# Fit the model to the training data
rf.fit(x_train, y_train)

# Test the model on the testing data
preds_rf = rf.predict(x_test)

# Print the accuracy of the model
print("RF Accuracy: " + str(round(100*np.sum(preds_rf == y_test)/y_test.shape[0], 2)) + "%")

# Visualize the performance with a confusion matrix
cm_rf = confusion_matrix(y_test, preds_rf)
heat_map_rf = sb.heatmap(cm_rf, cmap=gray_map)
plt.title("Random Forest Confusion Matrix")
plt.show()

# Let's plot the feature importance!
fig, ax = plt.figure(), plt.axes()
ax.set_xlabel("Feature")
ax.set_ylabel("Importance")
plt.title("Random Forest Feature Importance")
# One bar per feature; take the count from the importances themselves rather
# than a literal 64 so the plot still works if the feature dimension changes
rf_importance = ax.bar(range(0, rf.feature_importances_.shape[0]), rf.feature_importances_)

## Regression

We'll test out linear regression models (ordinary least squares, ridge, and lasso) on the "diabetes" dataset. First we'll load the dataset.

In [None]:
# Load "diabetes" dataset
diabetes = load_diabetes()

# Print the dimensionality of the data
print(diabetes.data.shape)
print(diabetes.target.shape)

# Write the data and targets to easier variables
# (these rebind x and y, replacing the "digits" data used above)
x, y = diabetes.data, diabetes.target

# Split the dataset into training and testing.
# The first half is used for training and everything from the midpoint to the
# end for testing, so no sample is left out of both sets.
n_train = x.shape[0] // 2
x_train, x_test = x[:n_train, :], x[n_train:, :]
y_train, y_test = y[:n_train], y[n_train:]

### Ordinary Least Squares

In [None]:
from sklearn import linear_model

# Build an ordinary least squares model and train it on the training half
linear_reg = linear_model.LinearRegression()
linear_reg.fit(x_train, y_train)

# Predict the held-out targets with the trained model
preds_linear_reg = linear_reg.predict(x_test)

# Score the model on the held-out data (R^2)
perf_linear_reg = linear_reg.score(x_test, y_test)

# Scatter predicted (horizontal axis) against actual (vertical axis) values
plt.scatter(preds_linear_reg, y_test)
plt.title(f"Linear Regression: R^2 = {round(perf_linear_reg, 2)}")
plt.show()

### Ridge/Lasso Regression

In [None]:
# Both penalized models share one penalty factor, passed as alpha
penalty = 1
ridge = linear_model.Ridge(alpha=penalty)
lasso = linear_model.Lasso(alpha=penalty)

# Train each model on the training half
ridge.fit(x_train, y_train)
lasso.fit(x_train, y_train)

# Predict the held-out targets with each trained model
preds_ridge = ridge.predict(x_test)
preds_lasso = lasso.predict(x_test)

# Score each model on the held-out data (R^2)
perf_ridge = ridge.score(x_test, y_test)
perf_lasso = lasso.score(x_test, y_test)

# Scatter predicted (horizontal axis) against actual (vertical axis) values,
# one figure per model
plt.scatter(preds_ridge, y_test)
plt.title(f"Ridge Regression: R^2 = {round(perf_ridge, 2)}")
plt.show()
plt.scatter(preds_lasso, y_test)
plt.title(f"Lasso Regression: R^2 = {round(perf_lasso, 2)}")
plt.show()