# Import Dependencies

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Studio Activity 1

## Read the Dataframe

In [2]:
w1_df = pd.read_csv('w1.csv')
w2_df = pd.read_csv('w2.csv')
w3_df = pd.read_csv('w3.csv')
w4_df = pd.read_csv('w4.csv')

## Combine into 1 Pandas Dataframe

In [3]:
# Merge the DataFrames
merged_df = pd.concat([w1_df, w2_df, w3_df, w4_df], ignore_index=True)

In [4]:
# Shuffle the DataFrame
merged_df = merged_df.sample(frac=1).reset_index(drop=True)
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11629 entries, 0 to 11628
Columns: 157 entries, acc_mean_x_right to class
dtypes: float64(128), int64(29)
memory usage: 13.9 MB


# Studio Activity 2: Model Training

## Import Dependencies

In [5]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

## Seperate Train, Test Set

In [6]:
X_train, X_test, y_train, y_test = train_test_split(merged_df.drop(columns=['class']), merged_df['class'], test_size=0.3, random_state=1)
print(X_train.shape)
print(X_test.shape)

(8140, 156)
(3489, 156)


## Train the model using SVM without Cross Validation

In [7]:
clf = svm.SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.8882201203783319

## Train the model using SVM with Cross Validation

In [8]:
clf = svm.SVC()
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_train, y_train, cv=10)
print(scores)

y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)

[0.8992629  0.89189189 0.8955774  0.8992629  0.8968059  0.88820639
 0.88943489 0.9004914  0.87714988 0.89066339]


0.8882201203783319

# Studio Activity 3: Hyper parameter tuning

## Using sigmoid (rbf is default kernel)

In [9]:
clf = svm.SVC(kernel='sigmoid')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.8071080538836343

## Using GridSearchCV to identify optimal values of hyper parameters

In [10]:
from sklearn.model_selection import GridSearchCV

In [11]:
hyper_parameters = {'kernel':('sigmoid', 'rbf'), 'C':[1, 10, 20]}
svc_model = svm.SVC()
clf = GridSearchCV(svc_model, hyper_parameters)
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_train, y_train, cv=10)
print(scores)
y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)

[0.89434889 0.8980344  0.8992629  0.90786241 0.90663391 0.88452088
 0.89189189 0.9041769  0.88329238 0.8980344 ]


0.8945256520492978

# Studio Activity 4: Feature Selection

#### Importing Dependecies

In [12]:
from sklearn.feature_selection import SelectKBest, f_regression

#### Select 100 best feature from X_train

In [13]:
print(X_train.shape)

(8140, 156)


In [14]:
X_new = SelectKBest(f_regression, k=100).fit_transform(merged_df.drop(columns=['class']), merged_df['class'])

In [15]:
print(X_new.shape)

(11629, 100)


#### Split the Dataset

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_new, merged_df['class'], test_size=0.3, random_state=1)
print(X_train.shape)
print(X_test.shape)

(8140, 100)
(3489, 100)


#### Training Model with Parameters Tuning

In [17]:
hyper_parameters = {'kernel':('sigmoid', 'rbf'), 'C':[1, 10, 20]}
svc_model = svm.SVC()
clf = GridSearchCV(svc_model, hyper_parameters)
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_train, y_train, cv=10)
print(scores)
y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)

[0.90540541 0.88943489 0.89066339 0.8992629  0.8955774  0.89189189
 0.88206388 0.9004914  0.88083538 0.89189189]


0.886787044998567

# Studio Activity 5: Dimensionality reduction

#### Import Dependencies

In [18]:
from sklearn.decomposition import PCA

#### Reduce dimension on original dataset

In [19]:
pca = PCA(n_components=10).fit(merged_df.drop(columns=['class']))

In [20]:
X_train_pca = pca.fit_transform(merged_df.drop(columns=['class']))
print(X_train_pca.shape)

(11629, 10)


#### Split Dataset

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_train_pca, merged_df['class'], test_size=0.3, random_state=1)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(8140, 10)
(8140,)
(3489, 10)
(3489,)


#### Train the model

In [22]:
hyper_parameters = {'kernel':('sigmoid', 'rbf'), 'C':[1, 10, 20]}
svc_model = svm.SVC()
clf = GridSearchCV(svc_model, hyper_parameters)
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_train, y_train, cv=10)
print(scores)
y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)

[0.9017199  0.8955774  0.8968059  0.90786241 0.9041769  0.88820639
 0.89312039 0.90540541 0.88452088 0.89434889]


0.8933791917454859

# Studio Activity 6: Prepare a summary table
#### Reported in the Portfolio Week 3

# Studio Activity 7: Other classifiers

In [23]:
X_train, X_test, y_train, y_test = train_test_split(merged_df.drop(columns=['class']), merged_df['class'], test_size=0.3, random_state=1)

## SGD

In [24]:
from sklearn.linear_model import SGDClassifier

#### Training without Cross validation

In [25]:
clf = SGDClassifier(loss="hinge", max_iter=5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)



0.8661507595299512

#### Training with Cross validation

In [26]:
clf = svm.SVC()
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_train, y_train, cv=10)
print(scores)

y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)

[0.8992629  0.89189189 0.8955774  0.8992629  0.8968059  0.88820639
 0.88943489 0.9004914  0.87714988 0.89066339]


0.8882201203783319

## Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

#### Training without Cross validation

In [28]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.9229005445686443

#### Training with Cross validation

In [29]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_train, y_train, cv=10)
print(scores)

y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)

[0.92751843 0.91277641 0.91891892 0.92997543 0.92506143 0.91646192
 0.91400491 0.91891892 0.91891892 0.91523342]


0.9231871596445973

## Multi-Layer Perceptron (MLP)

In [30]:
from sklearn.neural_network import MLPClassifier

#### Training without Cross validation

In [31]:
clf = MLPClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.7449125824018343

#### Training with Cross validation

In [32]:
clf = MLPClassifier()
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_train, y_train, cv=10)
print(scores)

y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)

[0.87592138 0.87837838 0.88697789 0.82432432 0.84520885 0.86732187
 0.89066339 0.83169533 0.88329238 0.86240786]


0.8865004299226139