## Mini Project

In [1]:
import torch
import time
import numpy as np

### Load Dataset - MNIST

In [None]:
import torchvision
import torchvision.transforms as transforms

In [None]:
# Mini-batch size shared by the training and test loaders.
batch_size = 100

# Convert each image to a tensor, then normalize with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5]),
])

# Training split: stored under ./data (downloaded if missing), reshuffled on every pass.
trainset = torchvision.datasets.MNIST(
    root='./data', train=True, download=True, transform=transform
)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=2
)

# Test split: same preprocessing, iterated without shuffling.
testset = torchvision.datasets.MNIST(
    root='./data', train=False, download=True, transform=transform
)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=2
)

In [None]:
# Number of images in the training and test splits.
len(trainset),len(testset)

In [None]:
import matplotlib.pyplot as plt

# Preview one training batch as a grid with 10 images per row.
dataiter = iter(trainloader)
# Use the built-in next(): DataLoader iterators in recent PyTorch releases
# no longer provide a .next() method.
images, labels = next(dataiter)
images = images[:100]
# Undo Normalize([0.5], [0.5]) so the pixel values are displayable.
images = images / 2 + 0.5
plt.figure(figsize=(10,10),dpi=100)
# make_grid returns a channels-first tensor; matplotlib wants channels last.
plt.imshow(np.transpose(torchvision.utils.make_grid(images,nrow=10).numpy(),(1,2,0)))
plt.axis('off')
plt.show()

## Feature Extraction - scattering net

### scattering net with known invariants

In [None]:
from kymatio import Scattering2D

# First-order scattering transform for 28x28 inputs with J=2, moved to the GPU.
scattering = Scattering2D(J=2, shape=(28, 28), max_order=1).cuda()
K = 17  # 1st order

# Second-order alternative (swap with the two lines above to use it):
# scattering = Scattering2D(J=2, shape=(28, 28), max_order=2).cuda()
# K = 81  # 2nd order

In [None]:
# Pre-allocate the feature matrix: one row per training image, holding K maps
# of 7x7 coefficients flattened. Deriving the width from K keeps the buffer in
# step with the chosen scattering order (833 columns for K = 17, 3969 for K = 81).
Sdata = np.zeros((len(trainset), K * 7 * 7))

# Labels are kept as a column vector with one row per training image.
Slabel = np.zeros((len(trainset),1))
Sdata.shape, Slabel.shape

In [None]:
start = time.time()

# Fill Sdata / Slabel batch by batch, in the order the loader yields them.
# Rows are addressed through a running offset and the actual batch length, so
# a final batch shorter than batch_size is stored correctly instead of
# failing in reshape.
row_offset = 0
for batch_idx, (data, target) in enumerate(trainloader):
    print(batch_idx,end=',')
    n_rows = data.shape[0]
    data = data.to('cuda')
    sdata = scattering(data)
    # Flatten every sample's coefficients into a single feature row.
    Sdata[row_offset:row_offset+n_rows,:] = sdata.to('cpu').numpy().reshape(n_rows,-1)
    Slabel[row_offset:row_offset+n_rows,:] = target.numpy().reshape(n_rows,-1)
    row_offset += n_rows

end = time.time()
# Elapsed wall-clock seconds for the whole extraction pass.
end - start

In [None]:
# Scattering coefficients of the first image in `data`, the last batch left
# on the GPU by the extraction loop.
test = scattering(data).cpu().numpy()[0]

In [None]:
# Shape of the sample's coefficient array; the plotting cell below indexes it
# with four indices, test[0, k, :, :].
test.shape

In [None]:
# Show every third coefficient map (indices 0, 3, ..., 15) of the sample
# side by side in a single row of six panels.
plt.figure(figsize=(16,2.5))
for panel, channel in enumerate(range(0, 16, 3), start=1):
    plt.subplot(1, 6, panel)
    plt.imshow(test[0,channel,:,:])
    plt.axis('off')
plt.show()

## Visualize

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.style as style 
import matplotlib.pyplot as plt
import matplotlib

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

### PCA of scattering net

In [None]:
start = time.time()

# Project the scattering features onto their first two principal components.
# random_state is pinned so that, if a randomized SVD solver is selected for
# a matrix of this size, the embedding is the same on every run.
Sdata_pca_embedded = PCA(n_components=2, random_state=0).fit_transform(Sdata)

end = time.time()
# Elapsed wall-clock seconds for the PCA fit.
end - start

In [None]:
# Pair each 2-D PCA point with its digit label. Slabel is a column vector and
# pandas requires 1-D index data, so it is flattened before use.
Sdata_pca_embedded_df = pd.DataFrame(Sdata_pca_embedded,index=Slabel.ravel())
Sdata_pca_embedded_df = Sdata_pca_embedded_df.reset_index()
Sdata_pca_embedded_df['index'] = Sdata_pca_embedded_df['index'].astype(int)
# Plot only the first 5000 points. Positional slicing tolerates shorter
# frames, and the copy keeps the column rename off the full frame.
Sdata_pca_embedded_df_plot = Sdata_pca_embedded_df.iloc[:5000].copy()
Sdata_pca_embedded_df_plot.columns = ['Label','PCA Dim1', 'PCA Dim2']
plt.figure(figsize=(10,10),dpi=120)
ax = sns.scatterplot(x='PCA Dim1', y='PCA Dim2',hue='Label',data=Sdata_pca_embedded_df_plot,
                     palette='Set1',alpha=0.6,s=20,legend='full') # tab10  Set1  husl 
# Anchor the legend at the right-hand edge of the axes.
legend = plt.legend(bbox_to_anchor=(1, 0.6))
plt.show()

### tSNE of scattering net
t-SNE (t-distributed stochastic neighbor embedding)

In [None]:
# Size of the feature matrix that is passed to t-SNE below.
Sdata.shape

In [None]:
start = time.time()

# Embed every scattering feature vector in 2-D with t-SNE. The optimisation
# is stochastic, so random_state is pinned to make the embedding (and the
# plot built from it) reproducible across runs.
# NOTE(review): the whole of Sdata is embedded although only the first 5000
# points are plotted afterwards; this cell is likely the slowest one.
Sdata_tsne_embedded = TSNE(n_components=2, random_state=0).fit_transform(Sdata)

end = time.time()
# Elapsed wall-clock seconds for the t-SNE fit.
end - start

In [None]:
# Pair each 2-D t-SNE point with its digit label. Slabel is a column vector
# and pandas requires 1-D index data, so it is flattened before use.
Sdata_tsne_embedded_df = pd.DataFrame(Sdata_tsne_embedded,index=Slabel.ravel())
Sdata_tsne_embedded_df = Sdata_tsne_embedded_df.reset_index()
Sdata_tsne_embedded_df['index'] = Sdata_tsne_embedded_df['index'].astype(int)
# Plot only the first 5000 points. Positional slicing tolerates shorter
# frames, and the copy keeps the column rename off the full frame.
Sdata_tsne_embedded_df_plot = Sdata_tsne_embedded_df.iloc[:5000].copy()
Sdata_tsne_embedded_df_plot.columns = ['Label','tSNE Dim1', 'tSNE Dim2']
plt.figure(figsize=(10,10),dpi=120)
ax = sns.scatterplot(x='tSNE Dim1', y='tSNE Dim2',hue='Label',data=Sdata_tsne_embedded_df_plot,
                     palette='tab10',alpha=0.6,s=20,legend='full')
# Legend just outside the right edge of the axes, drawn on a plain white box.
legend = plt.legend(bbox_to_anchor=(1.001, 0.7),facecolor='white',edgecolor='white')
plt.show()

### Classifications based on features extracted by scattering net

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

start = time.time()

lda = LinearDiscriminantAnalysis()
# Output of an earlier run, as (elapsed seconds, per-fold accuracy):
# (3.6621105670928955, array([0.97090582, 0.9729973 , 0.97089709]))

# 3-fold cross-validation on the first 10000 feature rows. Slabel is a column
# vector, but scikit-learn expects a 1-D target, so it is flattened here.
lda_results = cross_validate(lda, Sdata[:10000], Slabel[:10000].ravel(), cv=3, scoring='accuracy')
lda_results = lda_results['test_score']

end = time.time()
# Elapsed seconds and the per-fold accuracies.
end - start, lda_results

In [None]:
from sklearn.linear_model import LogisticRegression

start = time.time()

logistic_regression = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
# Output of an earlier run, as (elapsed seconds, per-fold accuracy):
# (3.1544511318206787, array([0.97810438, 0.97209721, 0.969997  ]))
# Elastic-net alternative and its recorded output:
# logistic_regression = LogisticRegression(penalty = 'elasticnet',l1_ratio = 0.15,random_state=0, solver='saga')
# (190.6933810710907, array([0.9790042 , 0.969997  , 0.96879688]))

# 3-fold cross-validation on the first 10000 feature rows. Slabel is a column
# vector, but scikit-learn expects a 1-D target, so it is flattened here.
logistic_regression_results = cross_validate(logistic_regression, Sdata[:10000], Slabel[:10000].ravel(), cv=3, scoring='accuracy')
logistic_regression_results = logistic_regression_results['test_score']

end = time.time()
# Elapsed seconds and the per-fold accuracies.
end - start, logistic_regression_results

In [None]:
from sklearn import svm

start = time.time()

linear_svc = svm.LinearSVC()
# Output of an earlier run, as (elapsed seconds, per-fold accuracy):
# (24.275047779083252, array([0.98530294, 0.98259826, 0.9789979 ]))
# Seeded, tighter-tolerance alternative and its recorded output:
# linear_svc = svm.LinearSVC(random_state=0, tol=1e-5)
# (25.60028052330017, array([0.98530294, 0.98259826, 0.9789979 ]))

# 3-fold cross-validation on the first 10000 feature rows. Slabel is a column
# vector, but scikit-learn expects a 1-D target, so it is flattened here.
svm_results = cross_validate(linear_svc, Sdata[:10000], Slabel[:10000].ravel(), cv=3, scoring='accuracy')
svm_results = svm_results['test_score']

end = time.time()
# Elapsed seconds and the per-fold accuracies.
end - start, svm_results

In [None]:
from sklearn.ensemble import RandomForestClassifier

start = time.time()

random_forest = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
# Output of an earlier run, as (elapsed seconds, per-fold accuracy):
# (3.6762218475341797, array([0.93041392, 0.92859286, 0.93249325]))
# Larger, depth-limited alternative and its recorded output:
# random_forest = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_split=2, random_state=2)
# (17.511837482452393, array([0.89472106, 0.88178818, 0.89888989]))

# 3-fold cross-validation on the first 10000 feature rows. Slabel is a column
# vector, but scikit-learn expects a 1-D target, so it is flattened here.
random_forest_results = cross_validate(random_forest, Sdata[:10000], Slabel[:10000].ravel(), cv=3, scoring='accuracy')
random_forest_results = random_forest_results['test_score']

end = time.time()
# Elapsed seconds and the per-fold accuracies.
end - start, random_forest_results

In [None]:
# Per-fold accuracies of the four classifiers, side by side.
lda_results,logistic_regression_results,svm_results,random_forest_results

In [None]:
# Mean cross-validated accuracy of each classifier.
lda_results.mean(),logistic_regression_results.mean(),svm_results.mean(),random_forest_results.mean()

In [None]:
# Spread (standard deviation) of the accuracy across the folds, per classifier.
lda_results.std(),logistic_regression_results.std(),svm_results.std(),random_forest_results.std()