## Feature Engineering & Selection for UF6 Neural Network
This notebook will explore the value of feature engineering and selection for the UF6 GADRAS data set.
We will explore the predictor correlations, down-select predictors based on correlation, cross-validate to choose model hyperparameters and finally train models.

In [1]:
# import all of the libraries 

import pandas as pd
from sklearn import base
from sklearn.pipeline import Pipeline 
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

In [2]:
# copied from the http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

# This defines a function to build a confusion matrix, which we will use later.
import itertools
import numpy as np
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

   # print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [3]:
#perform some cross-validation to find best activation function
def forest_CV(columns, X_train, X_test, y_train, y_test):
    n_est = [500]
#    n_est = [500,900,1000,1100,1200]
#    max_feat = [31, 61, 91, 121, 151, 181, 230, 400]
    max_feat = [50]
#    max_feat = [40,45,50,55,60,65,70,75,80]
    min_samp_split = [2]
    min_samp_leaf = [1]
    names = []
    classifiers = []
    for i in n_est:
        for j in max_feat:
            for k in min_samp_split:
                for l in min_samp_leaf:
                    names.append('n_est='+str(i)+' max_feat=' +str(j) + ' min_samp_split=' +str(k) +' min_samp_leaf=' +str(l))
                    classifiers.append(RandomForestClassifier(n_estimators=i, max_features=j, min_samples_split=k,
                                                min_samples_leaf=l))
    # iterate over classifiers
    y = []
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train.values.ravel())
        score = clf.score(X_test, y_test.values.ravel())
        y.append(score)
        print(name, score)
    return(zip(names, y))

In [4]:
#perform some cross-validation to find best activation function
def forest_CVreg(columns, X_train, X_test, y_train, y_test):
    n_est = [500]
#    n_est = [500,900,1000,1100,1200]
#    max_feat = [31, 61, 91, 121, 151, 181, 230, 400]
    max_feat = [50]
#    max_feat = [40,45,50,55,60,65,70,75,80]
    min_samp_split = [2]
    min_samp_leaf = [1]
    names = []
    classifiers = []
    for i in n_est:
        for j in max_feat:
            for k in min_samp_split:
                for l in min_samp_leaf:
                    names.append('n_est='+str(i)+' max_feat=' +str(j) + ' min_samp_split=' +str(k) +' min_samp_leaf=' +str(l))
                    classifiers.append(RandomForestRegressor(n_estimators=i, max_features=j, min_samples_split=k,
                                                min_samples_leaf=l))
    # iterate over classifiers
    y = []
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train.values.ravel())
        score = clf.score(X_test, y_test.values.ravel())
        y.append(score)
        print(name, score)
    return(zip(names, y))

In [5]:
# this class will allow us to select different features, which are columns in the data set
# it is necessary to create a Class of this type for use in Pipelines

class ColumnSelectTransformer(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, col_names):
        self.col_names = col_names  # We will need these in transform()
    
    def fit(self, X, y=None):
        # This transformer doesn't need to learn anything about the data,
        # so it can just return self without any further processing
        return self
    
    def transform(self, X):
        # return a new array with just the columns you specify
        newarray = X.filter(self.col_names, axis=1)
        return newarray

In [7]:
# import data
#data_df = pd.read_csv('/Users/mooreet_la/projects/SDRD/competition/data/TNG_set.csv')
data_df = pd.read_csv('/Users/mooreet_la/projects/SDRD/competition/data/TNG_set128_v2.csv')
labels = data_df['SourceID'].copy()
locations = data_df['location'].copy()
channel_data = data_df.iloc[:, 0:128].copy()

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
# define the set of predictors you want to use to train the model

features = list(channel_data) # takes the engineered features
#features.remove('city')
cst = ColumnSelectTransformer(features)

print(features)
print(len(features))
print(type(features))

['Channel1', 'Channel2', 'Channel3', 'Channel4', 'Channel5', 'Channel6', 'Channel7', 'Channel8', 'Channel9', 'Channel10', 'Channel11', 'Channel12', 'Channel13', 'Channel14', 'Channel15', 'Channel16', 'Channel17', 'Channel18', 'Channel19', 'Channel20', 'Channel21', 'Channel22', 'Channel23', 'Channel24', 'Channel25', 'Channel26', 'Channel27', 'Channel28', 'Channel29', 'Channel30', 'Channel31', 'Channel32', 'Channel33', 'Channel34', 'Channel35', 'Channel36', 'Channel37', 'Channel38', 'Channel39', 'Channel40', 'Channel41', 'Channel42', 'Channel43', 'Channel44', 'Channel45', 'Channel46', 'Channel47', 'Channel48', 'Channel49', 'Channel50', 'Channel51', 'Channel52', 'Channel53', 'Channel54', 'Channel55', 'Channel56', 'Channel57', 'Channel58', 'Channel59', 'Channel60', 'Channel61', 'Channel62', 'Channel63', 'Channel64', 'Channel65', 'Channel66', 'Channel67', 'Channel68', 'Channel69', 'Channel70', 'Channel71', 'Channel72', 'Channel73', 'Channel74', 'Channel75', 'Channel76', 'Channel77', 'Channe

In [593]:
# print(locations.head())
# print(locations.shape)
# print(labels.head())
# print(channel_data.head())
# print(channel_data.shape)

In [595]:
# df1 = pd.DataFrame(np.random.randn(6,4),
#                index=list('abcdef'),                 columns=list('ABCD'))
# df1

In [596]:
# f = df1.loc['d':, 'A':'C']
# f
# #g = df1.iloc[3:,0:3]
# #g

In [9]:
junk = labels.copy()
hasattr(labels,'filter')
print(type(labels))
print(type(junk))
junk = pd.DataFrame(data=junk, index=None)
print(type(junk))
print(labels.head())
print(junk.head())

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
0    0
1    0
2    0
3    0
4    0
Name: SourceID, dtype: int64
   SourceID
0         0
1         0
2         0
3         0
4         0


In [598]:
# labelsDFtest = labels.copy()
# print(labelsDFtest.shape)
# print(labelsDFtest.head())
# labelsDFtest = labelsDFtest.replace(to_replace= 0, value='Bkg', inplace=False, limit=None, regex=False, method='pad', axis=None)
# labelsDFtest = labelsDFtest.replace(to_replace= 1, value='Bkg', inplace=False, limit=None, regex=False, method='pad', axis=None)
# labelsDFtest = labelsDFtest.replace(to_replace= 2, value='Bkg', inplace=False, limit=None, regex=False, method='pad', axis=None)
# labelsDFtest = labelsDFtest.replace(to_replace= 3, value='Bkg', inplace=False, limit=None, regex=False, method='pad', axis=None)
# labelsDFtest = labelsDFtest.replace(to_replace= 4, value='Bkg', inplace=False, limit=None, regex=False, method='pad', axis=None)
# labelsDFtest = labelsDFtest.replace(to_replace= 5, value='Bkg', inplace=False, limit=None, regex=False, method='pad', axis=None)
# labelsDFtest = labelsDFtest.replace(to_replace= 6, value='Bkg', inplace=False, limit=None, regex=False, method='pad', axis=None)
# print(labelsDFtest.shape)
# print(labelsDFtest.head())

In [599]:
#print(labelsDFtest)

In [10]:
#featkeys = channel_data['SourceIDs']
#featkeys = np.reshape(featkeys, (29000,1))

#featdata = featdata.drop('SourceIDs', axis=1)
# print(featdata.shape)
# print(featkeys.shape)

labels = labels.replace(to_replace= 0, value='Bkg', inplace=False, limit=None, regex=False, method='pad', axis=None)
labels = labels.replace(to_replace= 1, value='HEU', inplace=False, limit=None, regex=False, method='pad', axis=None)
labels = labels.replace(to_replace= 2, value='WGPu', inplace=False, limit=None, regex=False, method='pad', axis=None)
labels = labels.replace(to_replace= 3, value='I131', inplace=False, limit=None, regex=False, method='pad', axis=None)
labels = labels.replace(to_replace= 4, value='Co60', inplace=False, limit=None, regex=False, method='pad', axis=None)
labels = labels.replace(to_replace= 5, value='Tc99m', inplace=False, limit=None, regex=False, method='pad', axis=None)
labels = labels.replace(to_replace= 6, value='Tc+HEU', inplace=False, limit=None, regex=False, method='pad', axis=None)

# for i in range(0,labels.shape[0]):
#     if labels[i] == 0:
#         labels[i] = 'Bkg'
#     elif labels[i] == 1:
#         labels[i] = 'HEU'
#     elif labels[i] == 2:
#         labels[i] = 'WGPu'
#     elif labels[i] == 3:
#         labels[i] = 'I131'
#     elif labels[i] == 4:
#         labels[i] = 'Co60'
#     elif labels[i] == 5:
#         labels[i] = 'Tc99m'
#     else:
#         labels[i] = 'Tc+HEU'
        
print(labels.head())
print(labels.shape)
labels = labels.values.reshape([labels.shape[0],1])
print(type(labels))
print(labels.shape)

0    Bkg
1    Bkg
2    Bkg
3    Bkg
4    Bkg
Name: SourceID, dtype: object
(39005,)
<class 'numpy.ndarray'>
(39005, 1)


In [11]:
junk = labels.copy()
junk = pd.DataFrame(data=junk, index=None)
featkeys = junk.copy()
featdata = channel_data.copy()

print(featkeys.shape)
print(featdata.shape)
print(type(labels))
print(type(featkeys))
print(type(featdata))

(39005, 1)
(39005, 128)
<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


## Next we will perform cross validations to select hyperparameters for four different models: 
full channel, subset channel, full feature, subset feature

In [12]:
# preprocess dataset, split into training and test part
def data_split(data, keys, columns):
    X_train, X_test, y_train, y_test = train_test_split(data, keys, 
                                                            test_size=0.33, random_state=42)

    cst = ColumnSelectTransformer(columns)
    X_train1 = cst.transform(X_train)
    X_train_std = StandardScaler().fit_transform(X_train1)
    X_test1 = cst.transform(X_test)
    X_test_std = StandardScaler().fit_transform(X_test1)

    return(columns, X_train_std, X_test_std, y_train, y_test)

In [13]:
def data_transform(dataTRN, dataTEST, columns):
    # just transform the already split data    ### This is redundant if you already used the split command!!

    cst = ColumnSelectTransformer(columns)
    X_train1 = cst.transform(dataTRN)
    X_train_std = StandardScaler().fit_transform(X_train1)
    X_test1 = cst.transform(dataTEST)
    X_test_std = StandardScaler().fit_transform(X_test1)

    return(columns, X_train_std, X_test_std)

In [14]:
col_fullfeat, X_train, X_test, y_train, y_test = data_split(featdata, featkeys, features)
#col_subfeat, X_train_subfeat, X_test_subfeat, y_train_subfeat, y_test_subfeat = data_split(featdata, featkeys, subset_features)
#X_train, X_test, y_train, y_test = train_test_split(featdata, featkeys, test_size=0.33, random_state=42)

In [15]:
# col_fullfeat, X_train_fullfeat, X_test_fullfeat = data_transform(featdataTRN, featdataTES, features)
# y_train_fullfeat, y_test_fullfeat = featkeys, featkeysTES

In [17]:
#print(X_train_fullfeat.shape)
#print(X_test_fullfeat.shape)
#print(y_train_fullfeat.shape)
#print(y_test_fullfeat.shape)
featdataTRN = X_train.copy()
featdataTES = X_test.copy()
featkeysTRN = y_train.copy()
featkeysTES = y_test.copy()
print(featdataTRN.shape)
print(featdataTES.shape)
print(featdata.shape)
print(featkeysTRN.shape)
print(featkeysTES.shape)
print(type(X_train))
print(type(featdata))

(26133, 128)
(12872, 128)
(39005, 128)
(26133, 1)
(12872, 1)
<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>


In [18]:
print(featkeysTES.shape)
print(type(featkeysTES))
print(type(featdataTRN))

(12872, 1)
<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>


In [19]:
# #  only if not splitting ????
# col_fullfeat, X_train_fullfeat, X_test_fullfeat = data_transform(featdataTRN, featdataTES, features)
# y_train_fullfeat, y_test_fullfeat = featkeys, featkeysTES
# #col_subfeat, X_train_subfeat, X_test_subfeat = data_transform(featdataTRN, featdataTES, subset_features)
# #y_train_subfeat, y_test_subfeat = featkeysTRN, featkeysTES

In [20]:
#featkeysTES = 0
print(featdata.shape)
print(featkeys.shape)
print(featdataTRN.shape)
print(featkeysTRN.shape)
print(featdataTES.shape)
print(featkeysTES.shape)
print(len(features))

(39005, 128)
(39005, 1)
(26133, 128)
(26133, 1)
(12872, 128)
(12872, 1)
128


In [21]:
# take a quick look, these should have the same number of rows
print(featdata.shape)
print(featkeys.shape)
#print(featkeys1.shape)
print(featkeysTRN.shape)
print(featkeysTES.shape)
print(featdataTRN.shape)
print(featdataTES.shape)

(39005, 128)
(39005, 1)
(26133, 1)
(12872, 1)
(26133, 128)
(12872, 128)


In [22]:
# use the split data with the PCR & PLS in the dataframe

#featdata = featdataTRN
#featkeys = featkeysTRN


In [23]:
featdata.head()

Unnamed: 0,Channel1,Channel2,Channel3,Channel4,Channel5,Channel6,Channel7,Channel8,Channel9,Channel10,...,Channel119,Channel120,Channel121,Channel122,Channel123,Channel124,Channel125,Channel126,Channel127,Channel128
0,5,28,90,150,134,141,89,71,66,59,...,0,0,1,1,0,0,0,1,3,2
1,4,30,98,171,139,102,107,99,65,62,...,0,1,0,0,0,1,0,0,1,0
2,7,31,106,165,134,131,115,84,70,57,...,0,0,0,0,1,0,0,1,2,0
3,3,34,108,179,142,129,85,85,68,65,...,0,0,0,0,1,0,0,1,3,1
4,10,31,96,162,170,98,118,93,63,53,...,0,1,0,0,1,1,0,0,0,1


In [24]:
featkeys.shape

(39005, 1)

## Next we will perform cross validations to select hyperparameters for different models: 
full 128 channel

In [25]:
#perform cross-validation to find best value of alpha

def alpha_CV(columns, X_train, X_test, y_train, y_test):
    alphas = np.logspace(-5, 3, 9)
    names = []
    for i in alphas:
        names.append('alpha ' + str(i))

    classifiers = []
    for i in alphas:
        classifiers.append(MLPClassifier(solver='adam', activation='tanh', alpha=i, random_state=1))

    x = alphas
    y = []

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        y.append(score)
        #print(clf, score)
    
    #plt.plot(x, y)
    #plt.show()
    return(zip(alphas, y))

In [26]:
#perform cross-validation to find best value of alpha

def alpha_CVreg(columns, X_train, X_test, y_train, y_test):
    alphas = np.logspace(-5, 3, 9)
    names = []
    for i in alphas:
        names.append('alpha ' + str(i))

    classifiers = []
    for i in alphas:
        classifiers.append(MLPRegressor(solver='adam', activation='tanh', alpha=i, random_state=1))

    x = alphas
    y = []

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        y.append(score)
        #print(clf, score)
    
    #plt.plot(x, y)
    #plt.show()
    return(zip(alphas, y))

In [27]:
#perform cross-validation to find best solver

def solver_CV(columns, X_train, X_test, y_train, y_test):
    solvers = ['lbfgs','sgd','adam']
    names = []
    for i in solvers:
        names.append('solver= ' + str(i))

    classifiers = []
    for i in solvers:
        classifiers.append(MLPClassifier(solver=i, activation='tanh', learning_rate='adaptive', alpha=0.1, random_state=1))
        
    x = [1,2,3]
    y = []

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        y.append(score)
        #print(clf, score)

    #plt.plot(x, y)
    #plt.show()
    return(zip(solvers, y))

In [28]:
#perform cross-validation to find best solver

def solver_CVreg(columns, X_train, X_test, y_train, y_test):
    solvers = ['lbfgs','sgd','adam']
    names = []
    for i in solvers:
        names.append('solver= ' + str(i))

    classifiers = []
    for i in solvers:
        classifiers.append(MLPRegressor(solver=i, activation='tanh', learning_rate='adaptive', alpha=0.1, random_state=1))
        
    x = [1,2,3]
    y = []

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        y.append(score)
        #print(clf, score)

    #plt.plot(x, y)
    #plt.show()
    return(zip(solvers, y))

In [29]:
#perform some cross-validation to find best activation function
def activation_CV(columns, X_train, X_test, y_train, y_test):
    activators = ['identity', 'logistic', 'tanh', 'relu']
    names = []
    for i in activators:
        names.append('activator= ' + str(i))

    classifiers = []
    for i in activators:
        classifiers.append(MLPClassifier(solver='adam', activation=i, alpha=0.1, random_state=1))

    x = [1,2,3,4]
    y = []

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        y.append(score)
        #print(clf, score)

    #plt.plot(x, y)
    #plt.show()
    return(zip(activators, y))

In [30]:
#perform some cross-validation to find best activation function
def activation_CVreg(columns, X_train, X_test, y_train, y_test):
    activators = ['identity', 'logistic', 'tanh', 'relu']
    names = []
    for i in activators:
        names.append('activator= ' + str(i))

    classifiers = []
    for i in activators:
        classifiers.append(MLPRegressor(solver='adam', activation=i, alpha=0.1, random_state=1))

    x = [1,2,3,4]
    y = []

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        y.append(score)
        #print(clf, score)

    #plt.plot(x, y)
    #plt.show()
    return(zip(activators, y))

In [31]:
# preprocess dataset, split into training and test part
def data_split(data, keys, columns):
    X_train, X_test, y_train, y_test = train_test_split(data, keys, 
                                                            test_size=0.33, random_state=42)

    cst = ColumnSelectTransformer(columns)
    X_train1 = cst.transform(X_train)
    X_train_std = StandardScaler().fit_transform(X_train1)
    X_test1 = cst.transform(X_test)
    X_test_std = StandardScaler().fit_transform(X_test1)

    return(columns, X_train_std, X_test_std, y_train, y_test)
    


In [32]:
def data_transform(dataTRN, dataTEST, columns):
    # just transform the already split data

    cst = ColumnSelectTransformer(columns)
    X_train1 = cst.transform(dataTRN)
    X_train_std = StandardScaler().fit_transform(X_train1)
    X_test1 = cst.transform(dataTEST)
    X_test_std = StandardScaler().fit_transform(X_test1)

    return(columns, X_train_std, X_test_std)

In [620]:
#col_fullfeat, X_train_fullfeat, X_test_fullfeat, y_train_fullfeat, y_test_fullfeat = data_split(featdata, featkeys, features)
#col_subfeat, X_train_subfeat, X_test_subfeat, y_train_subfeat, y_test_subfeat = data_split(featdata, featkeys, subset_features)

In [33]:
#featkeysTES = 0
print(featdata.shape)
print(featkeys.shape)
print(featdataTRN.shape)
print(featkeysTRN.shape)
print(featdataTES.shape)
print(featkeysTES.shape)
print(len(features))

(39005, 128)
(39005, 1)
(26133, 128)
(26133, 1)
(12872, 128)
(12872, 1)
128


In [622]:
#col_fullfeat, X_train_fullfeat, X_test_fullfeat = data_transform(featdataTRN, featdataTES, features)
#y_train_fullfeat, y_test_fullfeat = featkeys, featkeysTES
#col_subfeat, X_train_subfeat, X_test_subfeat = data_transform(featdataTRN, featdataTES, subset_features)
#y_train_subfeat, y_test_subfeat = featkeysTRN, featkeysTES

In [34]:
alpha_full_feat = alpha_CV(features, X_train, X_test, y_train, y_test)
print(list(alpha_full_feat))

solver_full_feat = solver_CV(features, X_train, X_test, y_train, y_test)
print(list(solver_full_feat))

activation_full_feat = activation_CV(features, X_train, X_test, y_train, y_test)
print(list(activation_full_feat))

  y = column_or_1d(y, warn=True)


[(1.0000000000000001e-05, 0.5682877563704164), (0.0001, 0.5682877563704164), (0.001, 0.56968614045991295), (0.01, 0.57636730888750776), (0.10000000000000001, 0.61839651957737729), (1.0, 0.71177750155376007), (10.0, 0.65273461777501551), (100.0, 0.41019266625233064), (1000.0, 0.38906152889993784)]
[('lbfgs', 0.55445929148539463), ('sgd', 0.70455251709136113), ('adam', 0.61839651957737729)]
[('identity', 0.70400870105655688), ('logistic', 0.71022374145431943), ('tanh', 0.61839651957737729), ('relu', 0.63408949658172775)]


In [35]:
forest_full_feat = forest_CV(features, X_train, X_test, y_train, y_test)
#print(list(forest_full_feat))

n_est=500 max_feat=50 min_samp_split=2 min_samp_leaf=1 0.703309509012


### Finallly we will train four different models on the full data set

In [443]:
# just read in data from files and over-write
# featkeysTRN = pd.read_csv('/home/jack/projects/dengue/data/sj_train100per_labels.csv')

# featdataTRN = pd.read_csv('/home/jack/projects/dengue/data/sj_train100per.csv')
# featdataTES = pd.read_csv('/home/jack/projects/dengue/data/sj_testTrue.csv')

In [444]:
print(featdata.shape)
print(featkeys.shape)
print(featdataTRN.shape)
print(featkeysTRN.shape)
print(featdataTES.shape)

print(len(features))

(39005, 128)
(39005, 1)
(26133, 128)
(26133, 1)
(12872, 128)
128


In [445]:
print(type(featdata))
print(type(featkeys))
print(type(featdataTRN))
print(type(featkeysTRN))
print(type(featdataTES))

print(len(features))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>
128


In [452]:
# Create a Pipeline which you can use to train and predict
# Step 1: take the data (training or testing) and select only the columns of interest
# Step 2: transform all of the features to Standard Variables
# Step 3: feed the data into a Multi-Layer Perceptron 

NN_full_feat = Pipeline([
    ('cst', ColumnSelectTransformer(features)),
    ('sdt', StandardScaler()),
    ('mlp', MLPRegressor(solver='sgd', activation='logistic', alpha=1.0)) #, alpha=0.1))
    ])

RF_full_feat = Pipeline([
    ('cst', ColumnSelectTransformer(features)),
    ('sdt', StandardScaler()),
    ('rfr', RandomForestRegressor(n_estimators=500))
    ])

In [453]:
junkDTRN = featdataTRN.copy()
junkDTRN = pd.DataFrame(data=junkDTRN, index=None)  ### WHAT THE FUNKKKKKKKKKKKKKK!!!!!!!!!!!!!!!!
featdataTRN = junkDTRN.copy()
type(featdataTRN)

pandas.core.frame.DataFrame

In [454]:
print(featdataTRN.shape)
print(featdataTRN.head())
print(featkeysTRN.shape)
print(featkeysTES.shape)
print(type(featdataTRN))
featkeysTRN.head()

(26133, 128)
        0         1         2         3         4         5         6    \
0 -0.889134 -0.759561 -0.179408 -0.323511 -0.337397 -0.165041 -0.350181   
1 -0.302847 -0.259215 -0.039553 -0.071390  0.354448  0.034934 -0.074294   
2 -0.595990  1.141752 -0.179408 -0.281491 -0.181732 -0.040056 -0.212238   
3  0.283441 -0.359285 -0.431148 -0.113410 -0.389286 -0.490001 -0.192531   
4  1.456015  2.742858  0.743636  0.895075  0.613890  1.209788  0.516891   

        7         8         9      ...          118      119       120  \
0 -0.468994 -0.194901 -0.282010    ...    -0.352879 -0.34047 -0.354456   
1  0.256482  0.118815  0.766883    ...    -0.352879 -0.34047 -0.354456   
2 -0.087165 -0.473760 -0.362694    ...    -0.352879 -0.34047 -0.354456   
3 -0.354446 -0.857191 -0.604746    ...     2.280410 -0.34047 -0.354456   
4  0.905592  0.955391  0.887909    ...    -0.352879 -0.34047 -0.354456   

        121       122       123       124       125       126       127  
0 -0.386464 -0.43

Unnamed: 0,0
24135,WGPu
16088,I131
25285,I131
38496,Bkg
4303,Tc99m


In [584]:
params =  {'rfr__n_estimators':500, 'rfr__max_features': 50}
RF_full_feat.set_params(**params)

RF_full_feat.fit(featdataTRN,featkeysTRN)
y_pred5 = RF_full_feat.predict(featdataTES)

AttributeError: 'numpy.ndarray' object has no attribute 'filter'

In [None]:
NN_subset_feat.fit(featdataTRN,featkeysTRN)
y_pred4 = NN_subset_feat.predict(featdataTES)

In [None]:
print(len(y_pred5))
len(y_pred4)

In [41]:
# split the data into training and testing
X_train_feat, X_test_feat, y_train_feat, y_test_feat = train_test_split(featdata, featkeys, test_size=0.33, random_state=42)

In [42]:
print(len(featkeysTES))
len(subset_features)

NameError: name 'featkeysTES' is not defined

In [None]:
#NN_full_feat.set_params(alpha = 100)
NN_full_feat.fit(featdataTRN,featkeysTRN)
y_pred3 = NN_full_feat.predict(featdataTES)


y_test_feat = featkeysTES
# Save the trained model
#joblib.dump(NN_full_feat, '/Users/turk_la/Documents/SSAM/Data/UF6data/NN_full_features.pkl')

y_test_feat['pred'] = y_pred3
y_test_feat['dif'] = abs(y_test_feat['x'] - y_test_feat['pred'])
y_test_feat['dif'].sum()/len(featkeysTES)

In [None]:
NN_subset_feat.fit(featdataTRN,featkeysTRN)
y_pred4 = NN_subset_feat.predict(featdataTES)

y_test_feat = featkeysTES

y_test_feat['pred4'] = y_pred4
y_test_feat['dif_sub'] = abs(y_test_feat['x'] - y_test_feat['pred4'])
y_test_feat['dif_sub'].sum()/len(featkeysTES)

# Save the trained model
#joblib.dump(NN_subset_feat, '/Users/turk_la/Documents/SSAM/Data/UF6data/NN_sub_features.pkl')


## Try the Random Forest to do the classification

In [78]:
params =  {'rfr__n_estimators':800, 'rfr__max_features': 65}
RF_subset_feat.set_params(**params)

RF_subset_feat.fit(featdataTRN,featkeysTRN)
y_pred5 = RF_subset_feat.predict(featdataTES)

y_test_feat = featkeysTES

y_test_feat['pred5'] = y_pred5
y_test_feat['dif_sub_rf'] = abs(y_test_feat['x'] - y_test_feat['pred5'])
y_test_feat['dif_sub_rf'].sum()/len(featkeysTES)

NameError: name 'RF_subset_feat' is not defined

In [79]:
RF_full_feat.get_params()

{'cst': ColumnSelectTransformer(col_names=['Channel1', 'Channel2', 'Channel3', 'Channel4', 'Channel5', 'Channel6', 'Channel7', 'Channel8', 'Channel9', 'Channel10', 'Channel11', 'Channel12', 'Channel13', 'Channel14', 'Channel15', 'Channel16', 'Channel17', 'Channel18', 'Channel19', 'Channel20', 'Channel21', 'Channel22', 'Channel23',...', 'Channel122', 'Channel123', 'Channel124', 'Channel125', 'Channel126', 'Channel127', 'Channel128']),
 'cst__col_names': ['Channel1',
  'Channel2',
  'Channel3',
  'Channel4',
  'Channel5',
  'Channel6',
  'Channel7',
  'Channel8',
  'Channel9',
  'Channel10',
  'Channel11',
  'Channel12',
  'Channel13',
  'Channel14',
  'Channel15',
  'Channel16',
  'Channel17',
  'Channel18',
  'Channel19',
  'Channel20',
  'Channel21',
  'Channel22',
  'Channel23',
  'Channel24',
  'Channel25',
  'Channel26',
  'Channel27',
  'Channel28',
  'Channel29',
  'Channel30',
  'Channel31',
  'Channel32',
  'Channel33',
  'Channel34',
  'Channel35',
  'Channel36',
  'Channel37'

In [80]:
#params = ['n_estimators': 800, 'max_features': 165
params =  {'rfr__n_estimators':1200, 'rfr__max_features': 365}
RF_full_feat.set_params(**params)
RF_full_feat.fit(featdataTRN,featkeysTRN)
y_pred6 = RF_full_feat.predict(featdataTES)

y_test_feat = featkeysTES

y_test_feat['pred6'] = y_pred6
y_test_feat['dif_sub_rf'] = abs(y_test_feat['x'] - y_test_feat['pred6'])
y_test_feat['dif_sub_rf'].sum()/len(featkeysTES)

AttributeError: 'numpy.ndarray' object has no attribute 'filter'

In [None]:
y_pred3
print(y_pred4.shape)
w = sum(y_pred4)
print(w/260)

In [None]:
## making  submission
#submission = pd.read_csv('/home/jack/projects/dengue/data/submission_format.csv')

# submission 13 for Iq
submission = pd.read_csv('/home/jack/projects/dengue/results/predictions13_EM.csv')

count = 0
for i in y_pred4:
    if i < 0.0:
        print(i)
        submission.loc[count,'total_cases'] = 0.0
    else:
        submission.loc[count,'total_cases'] = i
    count += 1

submission.to_csv('/home/jack/projects/dengue/results/predictions29_EM.csv',index='FALSE')
    
#submission.total_cases = np.concatenate([y_pred6])
    
#submission.head()
#submission.tail()
#submission.shape
#submission.loc[411, 'weekofyear']

In [None]:
y_pred5

In [None]:
subset_features

In [None]:
y_pred4[:10]

In [None]:
forest_fullfeat = forest_CV(col_fullfeat, X_train_fullfeat, X_test_fullfeat, y_train_fullfeat, y_test_fullfeat['x'])

In [None]:
## Experimenting with implementing a time series aspect

X_tr, X_ts, y_tr, y_ts = featdata[:800], featdata[800:], featkeys[:800], featkeys[800:]

RF_full_feat.fit(X_tr,y_tr)

preds = []

new_val= y_tr['x'][799]

for i in range(936-800):

    #print(new_val)

    X_ts.loc[i+800,'prior_val'] = float(new_val)

    #print(X_ts.loc[i+800,'prior_val'])

    datum = X_ts[i:i+1]

    #print(datum)    

    new_val = RF_full_feat.predict(datum)

    preds.append(float(new_val))

    

y_ts['pred'] = preds

y_ts['dif'] = abs(y_ts['x'] - y_ts['pred'])

y_ts['dif'].sum()/len(y_ts)