# Import Dependencies

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Studio Activity 1

## Read the Dataframe

In [2]:
w1_df = pd.read_csv('w1.csv')
w2_df = pd.read_csv('w2.csv')
w3_df = pd.read_csv('w3.csv')
w4_df = pd.read_csv('w4.csv')

## Combine into 1 Pandas Dataframe

In [58]:
# Merge the DataFrames
merged_df = pd.concat([w1_df, w2_df, w3_df, w4_df], ignore_index=True)

In [4]:
# Shuffle the DataFrame
merged_df = merged_df.sample(frac=1).reset_index(drop=True)
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11629 entries, 0 to 11628
Columns: 157 entries, acc_mean_x_right to class
dtypes: float64(128), int64(29)
memory usage: 13.9 MB


# Studio Activity 2: Model Training

## Import Dependencies

In [5]:
from sklearn import svm
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

## Seperate Train, Test Set

In [59]:
X_train, X_test, y_train, y_test = train_test_split(merged_df.drop(columns=['class']), merged_df['class'], test_size=0.3, random_state=1)
print(X_train.shape)
print(X_test.shape)

(8140, 156)
(3489, 156)


## Train the model using SVM without Cross Validation

In [7]:
clf = svm.SVC() 
clf.fit(X_train, y_train) 
y_pred = clf.predict(X_test) 
accuracy_score(y_test,y_pred)

0.8902264259100029

## Train the model using SVM with Cross Validation

In [60]:
clf = svm.SVC()
clf.fit(X_train, y_train)  
scores = cross_val_score(clf, X_train, y_train, cv=10) 
print(scores)

y_pred = clf.predict(X_test) 
accuracy_score(y_test,y_pred)

[0.88820639 0.88574939 0.88206388 0.89312039 0.89312039 0.89066339
 0.89434889 0.9004914  0.8955774  0.88574939]


0.8928059615935798

# Studio Activity 3: Hyper parameter tuning

## Using sigmoid (rbf is default kernel)

In [9]:
clf = svm.SVC(kernel='sigmoid')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.8317569504155918

## Using GridSearchCV to identify optimal values of hyper parameters

In [10]:
from sklearn.model_selection import GridSearchCV

In [11]:
hyper_parameters = {'kernel':('sigmoid', 'rbf'), 'C':[1, 10, 20]}
svc_model = svm.SVC()
clf = GridSearchCV(svc_model, hyper_parameters)
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_train, y_train, cv=10)
print(scores)

[0.8992629  0.8955774  0.8992629  0.8992629  0.89066339 0.88820639
 0.89066339 0.91891892 0.90540541 0.9004914 ]


# Studio Activity 4: Feature Selection

#### Importing Dependecies

In [17]:
from sklearn.feature_selection import SelectKBest, f_regression

#### Select 100 best feature from X_train

In [18]:
print(X_train.shape)

(8140, 156)


In [21]:
X_new = SelectKBest(f_regression, k=100).fit_transform(merged_df.drop(columns=['class']), merged_df['class'])

In [22]:
print(X_new.shape)

(11629, 100)


#### Split the Dataset

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_new, merged_df['class'], test_size=0.3, random_state=1)
print(X_train.shape)
print(X_test.shape)

(8140, 100)
(3489, 100)


#### Training Model with Parameters Tuning

In [27]:
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_train, y_train, cv=10)
print(scores)

[0.88820639 0.88820639 0.89434889 0.8955774  0.89066339 0.89066339
 0.88206388 0.9004914  0.90663391 0.89066339]


# Studio Activity 5: Dimensionality reduction

#### Import Dependencies

In [28]:
from sklearn.decomposition import PCA

#### Reduce dimension on original dataset

In [33]:
pca = PCA(n_components=10).fit(merged_df.drop(columns=['class']))

In [35]:
X_train_pca = pca.fit_transform(merged_df.drop(columns=['class']))
print(X_train_pca.shape)

(11629, 10)


#### Split Dataset

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X_train_pca, merged_df['class'], test_size=0.3, random_state=1)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(8140, 10)
(8140,)
(3489, 10)
(3489,)


#### Train the model

In [38]:
clf5 = svm.SVC() 
scores = cross_val_score(clf5, X_train, y_train, cv=10) 
print(scores)

[0.88820639 0.88329238 0.8955774  0.88943489 0.89434889 0.88943489
 0.89066339 0.9017199  0.9029484  0.8955774 ]


# Studio Activity 6: Prepare a summary table
#### Reported in the Portfolio Week 3

# Studio Activity 7: Other classifiers

In [40]:
X_train, X_test, y_train, y_test = train_test_split(merged_df.drop(columns=['class']), merged_df['class'], test_size=0.3, random_state=1)

## SGD

In [41]:
from sklearn.linear_model import SGDClassifier

#### Training without Cross validation

In [47]:
clf = SGDClassifier(loss="hinge", max_iter=5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test) 
accuracy_score(y_test,y_pred)



0.8667239896818573

#### Training with Cross validation

In [49]:
clf = svm.SVC() 
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_train, y_train, cv=10) 
print(scores)

y_pred = clf.predict(X_test) 
accuracy_score(y_test,y_pred)

[0.89189189 0.88697789 0.89189189 0.88943489 0.89312039 0.89066339
 0.88820639 0.8992629  0.9029484  0.8955774 ]


0.8902264259100029

## Random Forest

In [50]:
from sklearn.ensemble import RandomForestClassifier

#### Training without Cross validation

In [51]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test) 
accuracy_score(y_test,y_pred)

0.9211808541129264

#### Training with Cross validation

In [53]:
clf = RandomForestClassifier() 
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_train, y_train, cv=10) 
print(scores)

y_pred = clf.predict(X_test) 
accuracy_score(y_test,y_pred)

[0.92260442 0.92260442 0.92628993 0.92874693 0.91769042 0.92014742
 0.91154791 0.93857494 0.92751843 0.92260442]


0.9240470048724563

## Multi-Layer Perceptron (MLP)

In [55]:
from sklearn.neural_network import MLPClassifier

#### Training without Cross validation

In [56]:
clf = MLPClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test) 
accuracy_score(y_test,y_pred)

0.8744625967325881

#### Training with Cross validation

In [57]:
clf = MLPClassifier() 
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_train, y_train, cv=10) 
print(scores)

y_pred = clf.predict(X_test) 
accuracy_score(y_test,y_pred)

[0.76167076 0.86117936 0.87346437 0.85749386 0.87714988 0.84766585
 0.66953317 0.90540541 0.84398034 0.86363636]


0.8695901404413873