# Nueral Network

In [1]:
from sklearn.utils import resample

def resample_to_equal_class_sizes(X,y):
    df = pd.DataFrame(X)
    df['group'] = [int(i) for i in y]
    groups = []
    for v in set(df['group']):
        groups.append(df[df['group'] == v])
           
    max_length = max([len(group) for group in groups])
    print("Maximum class size is %s" %max_length)
    
    final_groups = []
    for group in groups:
        if len(group) < max_length:
            print("Class %s size is %s. Resampling with replacement to %s" %(max(group['group']),len(group), max_length))
            final_groups.append(resample(group, replace=True, n_samples=max_length))
        else:
            print("Class %s size has max class size (%s)." %(max(group['group']), max_length))
            final_groups.append(group)
    df = pd.concat(final_groups)
    return df.drop('group', axis=1).values, df['group'].values


### Read in the Training Data

In [2]:
# Read in the data
import pandas as pd

raw_data = pd.read_excel('data/training_data.xlsx')

# remove unneeded subject ID column
data = raw_data.drop('Subject', axis=1)

# split x and y data
y = data['GroupID']
X = data.drop('GroupID', axis=1)

### Read in the Validation Data

In [3]:
# Read in the data
import pandas as pd

raw_validation_data = pd.read_excel('data/Validation.xlsx')

# remove unneeded subject ID column
validation_data = raw_validation_data[data.columns]

# split x and y data
y_valid = validation_data['GroupID']
X_valid = validation_data.drop('GroupID', axis=1)

### Reading Validation Data 2

In [4]:
# Read in the data
import pandas as pd

raw_validation_data2 = pd.read_excel('data/Validation_2.0.xlsx')

# remove unneeded subject ID column
validation_data2 = raw_validation_data2[data.columns]

# split x and y data
y_valid2 = validation_data2['GroupID']
X_valid2 = validation_data2.drop('GroupID', axis=1)

### Standardize all the data to training set and resample the training data

In [5]:
# standardize the data by removing the mean (making it 0) and scaling to unit variance
from sklearn.preprocessing import StandardScaler

std_scale = StandardScaler().fit(X)

X_std = std_scale.transform(X)
X_valid_std = std_scale.transform(X_valid)
X_valid_std2 = std_scale.transform(X_valid2)

X_std_res, y_res = resample_to_equal_class_sizes(X_std, y)

Maximum class size is 399
Class 0 size is 240. Resampling with replacement to 399
Class 1 size has max class size (399).
Class 2 size is 52. Resampling with replacement to 399
Class 3 size is 55. Resampling with replacement to 399


### Train a neural network

In [10]:
# Train the network
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(solver='lbfgs', alpha=1e-1, hidden_layer_sizes=(100), random_state=1)

clf.fit(X_std_res,y_res)

MLPClassifier(activation='relu', alpha=0.1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=100, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [11]:
print("Validation Data 1 Score:", clf.score(X_valid_std, y_valid))
print("Validation Data 2 Score:", clf.score(X_valid_std2, y_valid2))

Validation Data 1 Score: 0.84
Validation Data 2 Score: 0.86


In [80]:
clf.predict(X_valid2)
y_valid2

0     0
1     0
2     0
3     0
4     0
5     0
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    1
24    1
25    1
26    1
27    1
28    1
29    1
30    1
31    1
32    1
33    1
34    1
35    1
36    1
37    1
38    2
39    2
40    2
41    2
42    3
43    3
44    3
45    3
46    3
47    3
48    3
49    3
Name: GroupID, dtype: int64