In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

%matplotlib inline 

# define project path
projectpath = 'projectpath/'
train_data_csv = projectpath + 'ready_train.csv'
test_data_csv = projectpath + 'ready_test.csv'

df_train = pd.read_csv(train_data_csv,header=None)

# save target values with index into new dataframe 'labels'
df_labels = pd.read_csv(train_data_csv, names=['target'], header=None, usecols=[1])

# save as array
df_labels = df_labels['target'].values

# save data into new dataframe 'training_data'
df_training_data_raw = df_train.drop(columns=[0,1])

# split dataset into train (80%) and test data (20%); random_state=1 makes it reproducible (could be any number); stratify ensures that the proportion stays in test and training data sets
data_train, data_test, labels_train, labels_test = train_test_split(df_training_data_raw, df_labels, test_size=0.2, random_state=1, shuffle=True, stratify=df_labels)

In [5]:
labels_test[:5]

array(['G', 'F', 'G', 'J', 'G'], dtype=object)

# Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
clf = RandomForestClassifier(bootstrap=False, criterion="gini", max_depth=None, max_features=9, min_samples_split=4)
clf.fit(data_train,labels_train)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=9, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## use gridsearchcv

In [7]:
# create a RF model and use it with GridSearchCV
rf_model = RandomForestClassifier(random_state=42)

# create a dictionary of all values we want to test for parameter n_estimators (= number of trees)
param_grid = {'n_estimators': np.arange(1, 150), 'max_depth': np.arange(2, 5)}

# use gridsearch to test all values for n_estimators in range defined above
optimal_model = GridSearchCV(rf_model, param_grid, cv=5)

## fit model on training data

In [None]:
# fit model to data
optimal_model.fit(data_train, labels_train)

## see how model performs on test data

In [10]:
# predict the labels for the test slice
labels_pred = clf.predict(data_test)
print(labels_pred[:100])

['A' 'F' 'E' 'J' 'B' 'J' 'B' 'D' 'G' 'J' 'J' 'C' 'G' 'C' 'G' 'C' 'H' 'E'
 'D' 'G' 'I' 'H' 'C' 'G' 'A' 'B' 'H' 'B' 'E' 'H' 'H' 'I' 'A' 'D' 'I' 'F'
 'J' 'B' 'A' 'A' 'E' 'H' 'I' 'I' 'I' 'D' 'A' 'H' 'E' 'G' 'J' 'B' 'A' 'J'
 'B' 'D' 'B' 'D' 'B' 'E' 'H' 'B' 'A' 'A' 'B' 'C' 'E' 'I' 'B' 'J' 'H' 'H'
 'E' 'I' 'H' 'H' 'F' 'J' 'D' 'H' 'G' 'B' 'G' 'G' 'I' 'C' 'F' 'B' 'A' 'B'
 'A' 'E' 'F' 'G' 'A' 'H' 'J' 'I' 'J' 'D']


In [11]:
data_test.head()

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,776,777,778,779,780,781,782,783,784,785
23765,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20788,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24975,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
34248,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23778,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# check the accuracy for the test data
print("Accuracy:", metrics.accuracy_score(labels_test, labels_pred))
# check the precision for the test data
print("Precision:",metrics.precision_score(labels_test, labels_pred, average=None))
# check the recall for the test data
print("Recall:",metrics.recall_score(labels_test, labels_pred, average=None))

Accuracy: 0.8871527777777778
Precision: [0.87239583 0.85809906 0.91081081 0.88082902 0.8719346  0.89164491
 0.8902439  0.88266667 0.91643454 0.89801325]
Recall: [0.89452603 0.85580774 0.90106952 0.90787717 0.85447263 0.91188251
 0.87716956 0.88384513 0.87850467 0.90641711]


In [13]:
df_test=pd.read_csv(test_data_csv, sep=',', header=None)
test_data_new = df_test.drop(columns=[0])
# predict values for new data
predicted = clf.predict(test_data_new)
print(predicted[0:100])

['A' 'B' 'A' 'A' 'B' 'G' 'F' 'G' 'A' 'H' 'A' 'A' 'A' 'D' 'A' 'A' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'F' 'A' 'A' 'A' 'A' 'A' 'A' 'H' 'A' 'A' 'F' 'A' 'A'
 'H' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'J' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A'
 'A' 'A' 'B' 'A' 'A' 'A' 'A' 'A' 'A' 'D' 'C' 'A' 'A' 'A' 'A' 'A' 'A' 'D'
 'A' 'A' 'G' 'A' 'A' 'A' 'A' 'A' 'A' 'A']


In [14]:
df_submission = pd.DataFrame(columns=['id','target'])
predicted = pd.Series(predicted)
df_submission.head()

Unnamed: 0,id,target


In [15]:
# add calculated target values to csv and format for submission
df_submission['target'] = predicted
df_submission['id'] = df_submission.index
submission_file = projectpath + 'submission_rf.csv'
df_submission.head()

Unnamed: 0,id,target
0,0,A
1,1,B
2,2,A
3,3,A
4,4,B


In [16]:
df_submission.to_csv(submission_file, index=False, columns=['id','target'])