In [8]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
%matplotlib notebook

import matplotlib.pylab as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier,SGDClassifier
from sklearn.model_selection import GridSearchCV
from scipy.io import loadmat
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier


filename="thyroidTrain.mat"
# Loading the dataset into X (inputs) and y (outputs)
mat = loadmat(filename)
Xtrain=mat["trainThyroidInput"]
Ytrain=mat["trainThyroidOutput"]
xtest=mat["testThyroidInput"]
Ytrain=np.argmax(Ytrain, axis=1)

In [10]:
print("Xtrain shape is {}".format(Xtrain.shape))
print("Ytrain shape is {}".format(Ytrain.shape))
print("xtest shape is {}".format(xtest.shape))

Xtrain shape is (5000, 21)
Ytrain shape is (5000,)
xtest shape is (2200, 21)


In [11]:
print("Xtrain",Xtrain[0:5,0:5])
print("Ytrain",Ytrain[0:5])
print("xtest",xtest[0:5,0:5])


Xtrain [[0.28 0.   1.   0.   0.  ]
 [0.45 0.   0.   0.   0.  ]
 [0.77 0.   0.   0.   0.  ]
 [0.72 0.   1.   0.   1.  ]
 [0.78 0.   0.   0.   0.  ]]
Ytrain [2 2 2 2 2]
xtest [[0.73 0.   0.   0.   0.  ]
 [0.6  0.   0.   0.   0.  ]
 [0.79 1.   0.   0.   0.  ]
 [0.26 0.   0.   0.   0.  ]
 [0.71 0.   0.   0.   0.  ]]


In [12]:
from sklearn import preprocessing
Xtrain = preprocessing.normalize(Xtrain)
print("Xtrain",Xtrain[0:5,0:5])

Xtrain [[0.1910307  0.         0.68225249 0.         0.        ]
 [0.89235067 0.         0.         0.         0.        ]
 [0.47498392 0.         0.         0.         0.        ]
 [0.4472743  0.         0.6212143  0.         0.6212143 ]
 [0.97192326 0.         0.         0.         0.        ]]


In [6]:
pca = PCA()
Lin=RidgeClassifier()
pipe = Pipeline(steps=[('pca', pca), ('Lin', Lin)])
param_grid = {'pca__n_components':range(1,10),"Lin__alpha":range(1,10)}
grid_clf = GridSearchCV(pipe, param_grid,cv=10,refit=True,n_jobs=-1)
grid_clf.fit(Xtrain, Ytrain)
grid_clf.best_params_

{'Lin__alpha': 1, 'pca__n_components': 1}

In [7]:
pca = PCA()
KNeighborsRegressor=KNeighborsClassifier()
k_range = list(range(1,31))
weight_options = ["uniform", "distance"]
param_grid = {'pca__n_components':range(1,10),'KNeighborsRegressor__n_neighbors': k_range, 'KNeighborsRegressor__weights' : weight_options}
pipe = Pipeline(steps=[('pca', pca), ('KNeighborsRegressor', KNeighborsRegressor)])
grid_clf = GridSearchCV(pipe, param_grid,cv=10,refit=True,n_jobs=-1)
grid_clf.fit(Xtrain, Ytrain)
grid_clf.best_params_

{'KNeighborsRegressor__n_neighbors': 13,
 'KNeighborsRegressor__weights': 'uniform',
 'pca__n_components': 5}

In [8]:
pca = PCA()
SVR=SVC()
param_grid={'pca__n_components':range(1,10),'SVR__C': [1, 10,100], 'SVR__kernel': ('rbf','poly','sigmoid','linear'),'SVR__gamma' : ('auto','scale')}
pipe = Pipeline(steps=[('pca', pca), ('SVR',SVR)])
grid_clf = GridSearchCV(pipe, param_grid,cv=10,refit=True,n_jobs=-1)
grid_clf.fit(Xtrain, Ytrain)
grid_clf.best_params_

{'SVR__C': 100,
 'SVR__gamma': 'scale',
 'SVR__kernel': 'rbf',
 'pca__n_components': 9}

In [9]:
pca = PCA()
SGD=SGDClassifier()
param_grid={'pca__n_components':range(1,10),'SGD__alpha':[.0001,.001,1],'SGD__loss': ['squared_loss', 'huber', 'epsilon_insensitive'],'SGD__eta0':[0.1,0.2,0.01],
    'SGD__penalty': ['l2', 'l1', 'elasticnet'],
    'SGD__learning_rate': ['constant', 'optimal', 'invscaling']}
pipe = Pipeline(steps=[('pca', pca), ('SGD',SGD)])
grid_clf = GridSearchCV(pipe, param_grid,cv=10,refit=True,n_jobs=-1)
grid_clf.fit(Xtrain, Ytrain)
grid_clf.best_params_

{'SGD__alpha': 0.0001,
 'SGD__eta0': 0.1,
 'SGD__learning_rate': 'constant',
 'SGD__loss': 'squared_loss',
 'SGD__penalty': 'l2',
 'pca__n_components': 1}

In [10]:
pca = PCA()
tree=DecisionTreeClassifier()
param_grid={'pca__n_components':range(1,10),'tree__min_samples_split':  [10, 20, 40],'tree__min_samples_leaf':[20, 40, 100],'tree__max_depth':[2, 6, 8,10,50],'tree__max_leaf_nodes': [5, 20, 100]}
pipe = Pipeline(steps=[('pca', pca), ('tree',tree)])
grid_clf = GridSearchCV(pipe, param_grid,cv=10,refit=True,n_jobs=-1)
grid_clf.fit(Xtrain, Ytrain)
grid_clf.best_params_

{'pca__n_components': 7,
 'tree__max_depth': 50,
 'tree__max_leaf_nodes': 20,
 'tree__min_samples_leaf': 20,
 'tree__min_samples_split': 10}

In [16]:
pca=PCA(n_components=1)
principlecomponents=pca.fit_transform(Xtrain)

scores=cross_val_score(RidgeClassifier(alpha=1), principlecomponents, Ytrain, cv=10,scoring="accuracy")
print("accuracy",scores.mean())
print("STD:",scores.std())
print("Variance",scores.var())

accuracy 0.9254010200040801
STD: 0.0008201756874977336
Variance 6.726881583623799e-07


In [17]:
pca=PCA(n_components=5)
principlecomponents=pca.fit_transform(Xtrain)

scores=cross_val_score(KNeighborsClassifier(n_neighbors=13,weights="uniform"), principlecomponents, Ytrain, cv=10,scoring="accuracy")
print("accuracy",scores.mean())
print("STD:",scores.std())
print("Variance",scores.var())

accuracy 0.9309998335993341
STD: 0.0030014103494368535
Variance 9.008464085706654e-06


In [18]:
pca=PCA(n_components=9)
principlecomponents=pca.fit_transform(Xtrain)

scores=cross_val_score(SVC(C=100,gamma="scale",kernel="rbf"), principlecomponents, Ytrain, cv=10,scoring="accuracy")
print("accuracy",scores.mean())
print("STD:",scores.std())
print("Variance",scores.var())

accuracy 0.9343970391881568
STD: 0.004499336641817846
Variance 2.0244030216404687e-05


In [19]:
pca=PCA(n_components=1)
principlecomponents=pca.fit_transform(Xtrain)

scores=cross_val_score(SGDClassifier(eta0=.1,alpha=0.0001,learning_rate="constant",loss="squared_loss",penalty="l2"), principlecomponents, Ytrain, cv=10,scoring="accuracy")
print("accuracy",scores.mean())
print("STD:",scores.std())
print("Variance",scores.var())


accuracy 0.9254010200040801
STD: 0.0008201756874977336
Variance 6.726881583623799e-07


In [20]:
pca=PCA(n_components=7)
principlecomponents=pca.fit_transform(Xtrain)

scores=cross_val_score(DecisionTreeClassifier(max_depth=50,max_leaf_nodes=20,min_samples_leaf=20,min_samples_split=10), principlecomponents, Ytrain, cv=10,scoring="accuracy")
print("accuracy",scores.mean())
print("STD:",scores.std())
print("Variance",scores.var())

accuracy 0.9257954199816801
STD: 0.005082768682953596
Variance 2.5834537484413835e-05


In [None]:
"hidden_layer_sizes": [(1,),(50,).(100,),(200,)], "activation": ["identity", "logistic", "tanh", "relu"], "solver": ["lbfgs", "sgd", "adam"], "alpha": [0.00005,0.0005,0.00001]

In [1]:
pca = PCA()
MLP=MLPClassifier()

pipe=Pipeline(steps=[("pca",pca),("MLP",MLP)])
param_grid = {'pca__n_components':range(1,15),"MLP__hidden_layer_sizes": [(500,),(1000,)], "MLP__solver": ["lbfgs", "adam"],"MLP__activation": ["identity", "logistic", "tanh", "relu"],"MLP__alpha": [0.00005]}

grid_clf = GridSearchCV(pipe, param_grid,cv=10,refit=True,n_jobs=-1)
grid_clf.fit(Xtrain, Ytrain)
grid_clf.best_params_

In [7]:
pca = PCA()
MLP=MLPClassifier()

pipe=Pipeline(steps=[("pca",pca),("MLP",MLP)])
param_grid = {'pca__n_components':range(1,15),"MLP__hidden_layer_sizes": [(50,),(100,)], "MLP__solver": ["lbfgs"],"MLP__activation": [ "relu"],"MLP__alpha": [0.00005]}

grid_clf = GridSearchCV(pipe, param_grid,cv=10,refit=True,n_jobs=-1)
grid_clf.fit(Xtrain, Ytrain)
grid_clf.best_params_

{'MLP__activation': 'relu',
 'MLP__alpha': 5e-05,
 'MLP__hidden_layer_sizes': (50,),
 'MLP__solver': 'lbfgs',
 'pca__n_components': 4}

In [None]:
#No normalisation

In [11]:
MLP=MLPClassifier(hidden_layer_sizes=(50,),activation="relu",solver="lbfgs",alpha=0.00005)
scores=cross_val_score(MLP,Xtrain, Ytrain, cv=10,scoring="accuracy")
print("accuracy",scores.mean())

accuracy 0.9698027288109152


In [12]:
pca=PCA(n_components=4)
pp=pca.fit_transform(Xtrain)
MLP=MLPClassifier(hidden_layer_sizes=(50,),activation="relu",solver="lbfgs",alpha=0.00005)
scores=cross_val_score(MLP,pp, Ytrain, cv=10,scoring="accuracy")
print("accuracy",scores.mean())

accuracy 0.9256014208056833


In [None]:
# pca=PCA(n_components=4)
# pp=pca.fit_transform(Xtrain)
MLP=MLPClassifier(hidden_layer_sizes=(50,),activation="relu",solver="lbfgs",alpha=0.00005)
scores=cross_val_score(grid_clf,Xtrain, Ytrain, cv=10,scoring="accuracy")
print("accuracy",scores.mean())

In [None]:
#With normalisation

In [13]:
MLP=MLPClassifier(hidden_layer_sizes=(50,),activation="relu",solver="lbfgs",alpha=0.00005)
scores=cross_val_score(MLP,Xtrain, Ytrain, cv=10,scoring="accuracy")
print("accuracy",scores.mean())

accuracy 0.9645991199964798
