## MARC 2022 Training Workshop on Machine Learning and NLP
## Part I: Machine Learning

### Jiangang Hao, ETS, contact: <jhao@ets.org>
----

### 1: Loading packages and data

In [None]:
# --- Data Transformation ---
import warnings; warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.datasets import load_iris

In [None]:
# Load the iris dataset bundle, then pull out the feature matrix and class labels.
data = load_iris()
X, y = data.data, data.target

In [None]:
# Dimensions of the feature matrix (rows = samples, columns = features)
X.shape

In [None]:
# Dimensions of the label vector
y.shape

### 2: Preprocessing data

In [None]:
# Standardize every feature column (centre on the mean, divide by the standard deviation).
# NOTE(review): the scaling statistics are computed on all samples, i.e. before the
# train/test split performed later in the notebook.
X = preprocessing.scale(X, axis=0, with_mean=True, with_std=True)

### 3: Supervised Learning Models

In [None]:
# --- Import Supervised Learning methods ---- 

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier # for Random Forest
from sklearn.ensemble import GradientBoostingClassifier # for Gradient Boosting Machine


In [None]:
# One estimator per model family, all with default hyper-parameters.
# The random forest, gradient boosting and neural network learners draw random numbers
# while fitting, so they get a fixed seed to make "Restart & Run All" reproducible.
model_SVM = SVC()
model_RF = RandomForestClassifier(random_state=42)
model_GBM = GradientBoostingClassifier(random_state=42)
model_ANN = MLPClassifier(random_state=42)
model_MaxEnt = LogisticRegression()

In [None]:
# Display the support vector classifier's repr
model_SVM

In [None]:
# Display the random forest's repr
model_RF

In [None]:
# Display the gradient boosting classifier's repr
model_GBM

In [None]:
# Display the multi-layer perceptron's repr
model_ANN

In [None]:
# Display the logistic regression (MaxEnt) model's repr
model_MaxEnt

### 4: Training and Validation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out half of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Train every model on the training half. fit() hands back the estimator itself;
# binding the results to throwaway names keeps the cell from printing a repr.
tmp1, tmp2, tmp3, tmp4, tmp5 = (
    model_SVM.fit(X_train, y_train),
    model_RF.fit(X_train, y_train),
    model_GBM.fit(X_train, y_train),
    model_ANN.fit(X_train, y_train),
    model_MaxEnt.fit(X_train, y_train),
)

In [None]:
# Predicted class labels on the held-out half, one array per fitted model.
y_pred_MaxEnt, y_pred_SVM, y_pred_RF, y_pred_ANN, y_pred_GBM = (
    model_MaxEnt.predict(X_test),
    model_SVM.predict(X_test),
    model_RF.predict(X_test),
    model_ANN.predict(X_test),
    model_GBM.predict(X_test),
)

### 5: Model Evaluation

In [None]:
# --- Model evaluation and selection ---
from sklearn.metrics import accuracy_score, cohen_kappa_score

In [None]:
accuracy_score(y_pred_MaxEnt, y_test)

In [None]:
accuracy_score(y_pred_SVM, y_test)

In [None]:
accuracy_score(y_pred_RF, y_test)

In [None]:
accuracy_score(y_pred_ANN, y_test)

In [None]:
accuracy_score(y_pred_GBM, y_test)

In [None]:
cohen_kappa_score(y_pred_MaxEnt, y_test)

In [None]:
cohen_kappa_score(y_pred_SVM, y_test)

In [None]:
cohen_kappa_score(y_pred_RF, y_test)

In [None]:
cohen_kappa_score(y_pred_ANN, y_test)

In [None]:
cohen_kappa_score(y_pred_GBM, y_test)

### 6: Cross-validation Approach

In [None]:
# --- Cross validation and hyper-parameter search ---
from sklearn.model_selection import cross_val_score

In [None]:
cross_val_score(model_MaxEnt, X, y,cv=3)

In [None]:
cross_val_score(model_SVM, X, y,cv=3)

In [None]:
cross_val_score(model_RF, X, y,cv=3)

In [None]:
cross_val_score(model_ANN, X, y,cv=3)

In [None]:
cross_val_score(model_GBM, X, y,cv=3)

### 7: Hyper-parameter Searching

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
hyper_parameters = {'kernel':('linear', 'rbf'), 'C':[1,5,10,100]}

In [None]:
model_grid = GridSearchCV(model_SVM, hyper_parameters,cv=3)

In [None]:
model_grid.fit(X,y)

In [None]:
model_grid.best_params_

In [None]:
model_grid.best_score_

# Unsupervised Learning

### 1. Hierarchical Clustering

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage,fcluster
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Agglomerative clustering; linkage options include single, complete and ward.
Z = linkage(X, method='complete')
plt.figure(figsize=(10, 10))
# Draw the merge tree, colouring branches that join below distance 5.
# The return value is bound to a name only to keep the cell output quiet.
t = dendrogram(Z, color_threshold=5)

In [None]:
# Assign each sample to a flat cluster by cutting the tree at distance 4.
cluster_label = fcluster(Z, t=4, criterion='distance')

In [None]:
# Count the distinct cluster ids that the distance cut produced.
np.unique(cluster_label).size

In [None]:
# True class codes from the iris bundle, shown for comparison with the cluster ids
data.target

In [None]:
# Compare the cluster assignment with the true labels (y_true first, predictions second).
# NOTE(review): abs(cluster_label - 3) turns cluster ids 1, 2, 3 into 2, 1, 0. This presumes
# the cut yields exactly three clusters whose ids line up with the class codes in that
# reversed order — confirm against the crosstab.
accuracy_score(data.target, abs(cluster_label - 3))

In [None]:
# Confusion matrix: remapped cluster ids as rows, true class codes as columns.
pd.crosstab(index=abs(cluster_label - 3), columns=data.target)

### 2. K-means clustering

In [None]:
from sklearn.cluster import KMeans
import seaborn as sbn
import numpy as np

In [None]:
kmeans_cluster = KMeans(n_clusters=4, random_state=0).fit(X)

In [None]:
cluster_labels = kmeans_cluster.labels_

In [None]:
kmeans_cluster.cluster_centers_

In [None]:
# Choose the number of clusters with an elbow plot of K-means inertia for k = 1..7.
inertia = [
    KMeans(n_clusters=k, random_state=0).fit(X).inertia_
    for k in range(1, 8)
]
plt.plot(np.arange(1, 8), inertia, 'bo-')
# Label the axes so the figure can be read on its own, like the silhouette plot.
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')

In [None]:
# number of clusters using silhouette score
from sklearn.metrics import silhouette_score

In [None]:
# Mean silhouette score of the K-means labelling for k = 2..7.
s_score = [
    silhouette_score(X, KMeans(n_clusters=k, random_state=0).fit(X).labels_)
    for k in range(2, 8)
]
plt.plot(np.arange(2, 8), s_score, 'go-')
plt.xlabel('Number of clusters')
plt.ylabel('Mean Silhouette Score')

In [None]:
# Number of clusters for hierarchical clustering via the Calinski-Harabasz (CH) index.
# The tree is cut at distance thresholds 2..5 (these are cut heights, not cluster counts);
# each cut is scored and plotted against the number of clusters it produces.

from sklearn.metrics import calinski_harabasz_score

# Build the complete-linkage tree once and label the samples once per threshold,
# instead of recomputing both inside every list comprehension.
linkage_complete = linkage(X, 'complete')
labels_per_cut = [fcluster(linkage_complete, t, criterion='distance') for t in range(2, 6)]

ch_score = [calinski_harabasz_score(X, labels) for labels in labels_per_cut]
n_cluster = [np.unique(labels).shape[0] for labels in labels_per_cut]
plt.plot(n_cluster, ch_score, 'ro-')
plt.xlabel('Number of Clusters')
plt.ylabel('CH index')

### 3. Dimensionality Reduction: t-SNE

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Example 1: iris data.
# Reload the bundle so X holds the measurements as loaded, replacing the scaled copy.
data = load_iris()
X, y = data.data, data.target

In [None]:
# Dimensions of the reloaded feature matrix
X.shape

In [None]:
# Dimensions of the reloaded label vector
y.shape

In [None]:
# Parameter settings of a default-constructed TSNE estimator
TSNE().get_params()

In [None]:
# t-SNE: embed the iris features into two dimensions.
# The embedding starts from a random initialisation, so a fixed random_state is
# supplied to make the result reproducible across runs.
X_embedded = TSNE(
    n_components=2,
    learning_rate='auto',
    init='random',
    perplexity=20,
    random_state=0,
).fit_transform(X)

In [None]:
X_embedded.shape

In [None]:
df = pd.DataFrame(X_embedded)
df.columns = ['TSNE1','TSNE2']
df['label'] = y.astype('str')

In [None]:
df.head()

In [None]:
# Scatter the two t-SNE components, coloured by class label.
# The frame is passed by keyword because seaborn releases before 0.12 reject a positional `data`.
sbn.scatterplot(data=df, x='TSNE1', y='TSNE2', hue='label')
plt.legend(loc='lower right')

In [None]:
# Example 2: scikit-learn's built-in handwritten digits (8x8 images, not the full MNIST set).

from sklearn.datasets import load_digits

digits = load_digits(n_class=10)
X = digits.data
y = digits.target
n_samples, n_features = X.shape
# NOTE(review): n_neighbors is not referenced by the cells that follow — confirm it is still needed.
n_neighbors = 30

In [None]:
# Show the first 100 digits in a 10x10 grid; each row of X is reshaped back into an 8x8 image.
fig, axs = plt.subplots(nrows=10, ncols=10, figsize=(6, 6))
for idx, ax in enumerate(axs.flat):
    image = X[idx].reshape((8, 8))
    ax.imshow(image, cmap=plt.cm.binary)
    ax.set_axis_off()
_ = fig.suptitle("digit images", fontsize=16)

In [None]:
# Embed the digit images into two dimensions with t-SNE.
# The embedding starts from a random initialisation, so a fixed random_state is
# supplied to make the result reproducible across runs.
X_embedded = TSNE(
    n_components=2,
    learning_rate='auto',
    init='random',
    perplexity=30,
    random_state=0,
).fit_transform(X)
# Collect the embedding in a frame; labels become strings so seaborn treats them as categories.
df = pd.DataFrame(X_embedded)
df.columns = ['TSNE1', 'TSNE2']
df['label'] = y.astype('str')

In [None]:
# Scatter the two t-SNE components, coloured by digit label.
# The frame is passed by keyword because seaborn releases before 0.12 reject a positional `data`.
sbn.scatterplot(data=df, x='TSNE1', y='TSNE2', hue='label')
plt.legend(loc='lower right')