In [None]:
%matplotlib inline
!pip install dask_ml

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Read the train and test CSV files from the working directory.
training_data = pd.read_csv("train.csv")
testing_data = pd.read_csv("test.csv")

# Separate the 'Target' label column from the remaining columns of the training frame.
costa_rica_target = training_data['Target']
costa_rica_data = training_data.drop(columns=['Target'])

In [5]:
# Keep only the numeric columns and replace missing values with 0.

# Peek at the non-numeric columns that are about to be dropped. This expression is not
# the last statement of the cell, so the notebook does not render its result.
costa_rica_data.select_dtypes(exclude=[np.number]).head()

# include=[np.number] never selects object-dtype columns, so no explicit exclude is
# needed. The former exclude=[np.object] used an alias that NumPy deprecated in 1.20
# and removed in 1.24, where merely evaluating it raises AttributeError.
costa_rica_data = costa_rica_data.select_dtypes(include=[np.number]).fillna(0)

In [6]:
# Map every column name of the cleaned frame to its positional index.
cols = costa_rica_data.columns.values
dicts = {name: position for position, name in enumerate(cols)}

# Positional indices of the columns used as model features.
# NOTE(review): how these ten positions were chosen is not shown here — confirm.
selected_positions = [98, 135, 134, 131, 118, 133, 109, 132, 94, 2]

# Collect the matching names in the mapping's iteration order (i.e. column order),
# not in the order the positions are listed above.
features = [name for name, position in dicts.items() if position in selected_positions]

In [7]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
feature_matrix = costa_rica_data[features].values
target_vector = costa_rica_target.values
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    target_vector,
    test_size=0.2,
    random_state=42,
)

In [10]:
from dask.distributed import Client
# Use the standalone joblib package: the vendored sklearn.externals.joblib was
# deprecated in scikit-learn 0.21 and removed in 0.23, so the old import fails there.
import joblib

client = Client()  # start a local Dask client

# Run joblib-parallelised work issued inside this block on the 'dask' backend.
with joblib.parallel_backend('dask'):
    # Random forest with fixed hyper-parameters and seed so the fit is repeatable.
    clf = RandomForestClassifier(n_estimators=350, criterion='entropy', max_depth=15, random_state=42, max_features=7)
    model = clf.fit(X_train, y_train)

    # Mean accuracy on the training rows and on the held-out rows.
    train_score = model.score(X_train, y_train)
    val_score = model.score(X_test, y_test)

    # Report both accuracies and their gap as percentages rounded to two decimals.
    print('Random Forest Train Accuracy: '+str(round(train_score*100,2))+'%')
    print('Random Forest Validation Accuracy: '+str(round(val_score*100,2))+'%')
    print('Train/Test Delta: '+str(round((train_score - val_score)*100,2))+'%')

Random Forest Train Accuracy: 96.47%
Random Forest Validation Accuracy: 93.36%
Train/Test Delta: 3.11%
