In [1]:
import numpy as np
import pandas as pd
from citrine import load_data,output_string_to_float,data_split,choose_model,setup_param_grid
from citrine import fit_ml_model,metrics_ml_model,avg_metric,overall_avg
from citrine import fit_mlp_nn,metrics_mlp_model,wrapper_nn

Using TensorFlow backend.


### Data exploration and preprocessing
* **Data is split into features and multi-label target** 
* **Data is converted into the format fit for machine learning**

In [2]:
X,y = load_data('challenge_data/training_data.csv',['formulaA','formulaB'],'stabilityVec') #loading data, checking for NaN and returning X,Y

Y = output_string_to_float(y) #changing labels to usable format

Removing 0 nan rows
Original dataset shape: (2572, 99)
Features shape after dropping formulaA,formulaB and StabilityVector: (2572, 96)
Target shape: (2572, 9)


### Model training and validation
* **Data is divided into train and test for each label** 
* **Features are oversampled and standaridized and each label is fit to the model**
* **Hyperparameter tuning is done on the fly**
* **The weighted precision, recall and f1-score is collected for each label's test set**
* **The model average is also recorded**

### Random Forest Classifier and respective metrics

In [3]:
v,w = choose_model('RandomForestClassifier') #takes RandomForestClassifier as a model and gives the  
#hyperparameters which can be tuned along with the datatype
parameters = setup_param_grid(v,[[10,20],['gini','entropy'],['auto','sqrt'],[5,10]]) #setting up the dictionary for 
#hyperparameter optimization
rf_metrics_df = metrics_ml_model(X,Y,'RandomForestClassifier',parameters,None) #stores metrics 
rf_metrics_df.to_csv('rf.csv',sep=',')  #passing dataframe to a csv file 

There are 4 hyper parameters ['n_estimators', 'criterion', 'max_features', 'max_depth'] of type ['int', 'str', 'int', 'int'] respectively
Label: 0
25% Data split into test set
X_train shape: (1929, 96)
y_train shape: (1929,)
X_test shape: (643, 96)
y_test shape: (643,)
Resampled X_train shape: (3782, 96)
Resampled y_train shape: (3782,)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=10, verbose=0, warm_start=False)
Label: 1
25% Data split into test set
X_train shape: (1929, 96)
y_train shape: (1929,)
X_test shape: (643, 96)
y_test shape: (643,)
Resampled X_train shape: (3718, 96)
Resampled y_train shape: (3718,)
RandomForestClassifier(bootstrap=True, class_weight=

In [4]:
precision = avg_metric(rf_metrics_df,'precision') #precision values weighted by the support for all 9 labels
print('The weighted precision for each label: '+str(precision))
overall_avg(precision,'precision') #average precision
recall = avg_metric(rf_metrics_df,'recall') #recall values weighted by the support for all 9 labels
print('The weighted recall for each label: '+str(recall))
overall_avg(recall,'recall') #average recall
f1 = avg_metric(rf_metrics_df,'f1-score')#f-1 scores weighted by the support for all 9 labels
print('The weighted f1-score for each label: '+str(f1))
overall_avg(f1,'f1-score')#average f1-score

The weighted precision for each label: [0.97, 0.97, 0.88, 0.94, 0.87, 0.93, 0.89, 0.87, 0.97]
The average precision is: 0.92
The weighted recall for each label: [0.96, 0.91, 0.84, 0.92, 0.83, 0.9, 0.83, 0.85, 0.96]
The average recall is: 0.89
The weighted f1-score for each label: [0.97, 0.93, 0.85, 0.92, 0.84, 0.91, 0.85, 0.86, 0.97]
The average f1-score is: 0.90


0.9

### Logistic Regression and respective metrics

In [5]:
v,w = choose_model('LogisticRegression') #takes Logistic regression as model and gives the hyper parameters which can be tuned
parameters = setup_param_grid(v,[['l1','l2'],[0.1,1],[True,False]]) #setting up the dictionary for hyper parameter optimization
lr_metrics_df = metrics_ml_model(X,Y,'LogisticRegression',parameters,None) #storing all the metrics in a dataframe
lr_metrics_df.to_csv('lr.csv',sep=',')  #passing dataframe to a csv file 

There are 3 hyper parameters ['penalty', 'C', 'fit_intercept'] of type ['str', 'float', 'bool'] respectively
Label: 0
25% Data split into test set
X_train shape: (1929, 96)
y_train shape: (1929,)
X_test shape: (643, 96)
y_test shape: (643,)
Resampled X_train shape: (3782, 96)
Resampled y_train shape: (3782,)
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=10, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Label: 1
25% Data split into test set
X_train shape: (1929, 96)
y_train shape: (1929,)
X_test shape: (643, 96)
y_test shape: (643,)
Resampled X_train shape: (3718, 96)
Resampled y_train shape: (3718,)
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=10, solver='liblinear', tol=0.0001,
      

In [6]:
precision = avg_metric(lr_metrics_df,'precision') #precision values weighted by the support for all 9 labels
print('The weighted precision for each label: '+str(precision))
overall_avg(precision,'precision')  #average precision
recall = avg_metric(lr_metrics_df,'recall') #recall values weighted by the support for all 9 labels
print('The weighted recall for each label: '+str(recall))
overall_avg(recall,'recall') #average recall
f1 = avg_metric(lr_metrics_df,'f1-score')#f-1 scores weighted by the support for all 9 labels
print('The weighted f1-score for each label: '+str(f1))
overall_avg(f1,'f1-score')#average f1-score

The weighted precision for each label: [0.97, 0.97, 0.81, 0.93, 0.8, 0.91, 0.88, 0.86, 0.97]
The average precision is: 0.90
The weighted recall for each label: [0.94, 0.86, 0.73, 0.82, 0.72, 0.76, 0.69, 0.77, 0.94]
The average recall is: 0.80
The weighted f1-score for each label: [0.95, 0.91, 0.75, 0.86, 0.74, 0.82, 0.73, 0.79, 0.95]
The average f1-score is: 0.83


0.83

### Support Vector Machine and respective metrics

In [7]:
v,w = choose_model('SVC')  #takes SupportVectorMachine as model and gives the hyper parameters which can be tuned
parameters = setup_param_grid(v,[['rbf','linear'],[1,10],[None,'balanced']]) #setting up the dictionary for hyper parameter optimization
svm_metrics_df = metrics_ml_model(X,Y,'SVC',parameters,None) #storing all the metrics in a dataframe
svm_metrics_df.to_csv('svm.csv',sep=',') #passing dataframe to a csv file

There are 3 hyper parameters ['kernel', 'C', 'class_weight'] of type ['str', 'float', 'dict'] respectively
Label: 0
25% Data split into test set
X_train shape: (1929, 96)
y_train shape: (1929,)
X_test shape: (643, 96)
y_test shape: (643,)
Resampled X_train shape: (3782, 96)
Resampled y_train shape: (3782,)
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=10, shrinking=True,
  tol=0.001, verbose=False)
Label: 1
25% Data split into test set
X_train shape: (1929, 96)
y_train shape: (1929,)
X_test shape: (643, 96)
y_test shape: (643,)
Resampled X_train shape: (3718, 96)
Resampled y_train shape: (3718,)
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=10, shrinking=True,
  tol=0.001, verbose=False)
Label: 2
25% Data split into test set
X_train s

In [8]:
precisions = avg_metric(svm_metrics_df,'precision') #precision values weighted by the support for all 9 labels
overall_avg(precisions,'precision')  #average precision
recall = avg_metric(svm_metrics_df,'recall') #recall values weighted by the support for all 9 labels
overall_avg(recall,'recall') #average recall
f1 = avg_metric(svm_metrics_df,'f1-score') #f-1 scores weighted by the support for all 9 labels
overall_avg(f1,'f1-score') #average f1-score

The average precision is: 0.92
The average recall is: 0.91
The average f1-score is: 0.92


0.92

### Multi-layer Perceptron (MLP) and respective metrics

In [9]:
v,w=choose_model('MLPClassifier')
parameters =setup_param_grid(v,[[256,512],['adam','RMSprop'],[0.5,0.3]])
df3 = wrapper_nn(parameters,X,Y)

There are 3 hyper parameters ['mlp_units', 'optimizer', 'dropout'] of type ['int', 'str', 'float'] respectively
For params: ['adam', 0.5, 256]
Label: 0
25% Data split into test set
X_train shape: (1929, 96)
y_train shape: (1929,)
X_test shape: (643, 96)
y_test shape: (643,)
Resampled X_train shape: (3782, 96)
Resampled y_train shape: (3782,)
Epoch 00107: early stopping
Label: 1
25% Data split into test set
X_train shape: (1929, 96)
y_train shape: (1929,)
X_test shape: (643, 96)
y_test shape: (643,)
Resampled X_train shape: (3718, 96)
Resampled y_train shape: (3718,)
Epoch 00130: early stopping
Label: 2
25% Data split into test set
X_train shape: (1929, 96)
y_train shape: (1929,)
X_test shape: (643, 96)
y_test shape: (643,)
Resampled X_train shape: (2936, 96)
Resampled y_train shape: (2936,)
Epoch 00243: early stopping
Label: 3
25% Data split into test set
X_train shape: (1929, 96)
y_train shape: (1929,)
X_test shape: (643, 96)
y_test shape: (643,)
Resampled X_train shape: (3590, 96)
Re

In [25]:
X1 = load_data('challenge_data/test_data.csv',['formulaA','formulaB'],None)#loading the held-out test set

Removing 0 nan rows
Original dataset shape: (749, 99)
Features shape after dropping formulaA and formulaB: (749, 97)


In [15]:
x,y,z = df3[['f1-score','recall','precision']].idxmax() 
if x==y==z:
    print('The optimum parameters as chosen from f1-score,recall and precision are '+str(df3['optimizer'][x])+','+str(df3['dropout'][x])+' dropout,'+str(df3['neurons'][x])+' neurons')
    mlp_metrics_df,predicted_array = metrics_mlp_model(parameters,X,Y,df3['optimizer'][x],df3['dropout'][x],df3['neurons'][x],X1)#storing all metrics in a dataframe
elif x==y or x==z:
    print('The optimum parameters as chosen from f1-score and either one metric are '+str(df3['optimizer'][x])+','+str(df3['dropout'][x])+' dropout,'+str(df3['neurons'][x])+' neurons')
    mlp_metrics_df,predicted_array = metrics_mlp_model(parameters,X,Y,df3['optimizer'][x],df3['dropout'][x],df3['neurons'][x],X1)#storing all metrics in a dataframe
else:
    print('The optimum parameters as chosen from f1-score are '+str(df3['optimizer'][x])+','+str(df3['dropout'][x])+' dropout,'+str(df3['neurons'][x])+' neurons')
    mlp_metrics_df,predicted_array = metrics_mlp_model(parameters,X,Y,df3['optimizer'][x],df3['dropout'][x],df3['neurons'][x],X1)#storing all metrics in a dataframe

The optimum parameters as chosen from f1-score and either one metric are adam,0.5 dropout,256 neurons
Label: 0
25% Data split into test set
X_train shape: (1929, 96)
y_train shape: (1929,)
X_test shape: (643, 96)
y_test shape: (643,)
Resampled X_train shape: (3782, 96)
Resampled y_train shape: (3782,)
Epoch 00123: early stopping
Label: 1
25% Data split into test set
X_train shape: (1929, 96)
y_train shape: (1929,)
X_test shape: (643, 96)
y_test shape: (643,)
Resampled X_train shape: (3718, 96)
Resampled y_train shape: (3718,)
Epoch 00151: early stopping
Label: 2
25% Data split into test set
X_train shape: (1929, 96)
y_train shape: (1929,)
X_test shape: (643, 96)
y_test shape: (643,)
Resampled X_train shape: (2936, 96)
Resampled y_train shape: (2936,)
Epoch 00217: early stopping
Label: 3
25% Data split into test set
X_train shape: (1929, 96)
y_train shape: (1929,)
X_test shape: (643, 96)
y_test shape: (643,)
Resampled X_train shape: (3590, 96)
Resampled y_train shape: (3590,)
Epoch 0026

In [16]:
mlp_metrics_df.to_csv('mlp.csv',sep=',') #passing dataframe to a csv file
precision = avg_metric(mlp_metrics_df,'precision') #precision values weighted by the support for all 9 labels
overall_avg(precision,'precision')  #average precision
recall = avg_metric(mlp_metrics_df,'recall') #recall values weighted by the support for all 9 labels
overall_avg(recall,'recall') #average recall
f1 = avg_metric(mlp_metrics_df,'f1-score') #f-1 scores weighted by the support for all 9 labels
overall_avg(f1,'f1-score') #average f1-score

The average precision is: 0.92
The average recall is: 0.92
The average f1-score is: 0.92


0.92

### Predicting on the held out test_data.csv
* **Since the MLP has the highest precision,recall and f1-score, it has been used as the model to predict on the test_data.csv**
* **The predicted values can be found in the test_data_with_vec.csv file attached in the zip folder**

In [26]:
predicted_arr = np.array(predicted_array).reshape(X1.shape[0],Y.shape[1])
np.savetxt("y_predicted_nn.csv", predicted_arr, delimiter=",")
pd.read_csv('y_predicted_nn.csv',sep=',',names=['label1','label2','label3','label4','label5','label6','label7','label8','label9'])