In [1]:
!pip install pydot
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
#from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.tree import export_graphviz
import pydot

[33mYou are using pip version 9.0.3, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
#Load and split the data
training_data = pd.read_csv("train.csv")
testing_data = pd.read_csv("test.csv")
costa_rica_data = training_data.drop(['Target'], axis=1)
costa_rica_target = training_data['Target']

#Clean the data: keep only the numeric columns and replace missing values with 0.
#include=[np.number] already leaves out object (string) columns, so no explicit
#exclude is needed. The former exclude=[np.object] relied on an alias that was
#deprecated in NumPy 1.20 and removed in 1.24 (AttributeError on current NumPy).
#To inspect what gets dropped, run this as the last line of its own cell:
#    training_data.select_dtypes(exclude=[np.number]).head()
costa_rica_data = costa_rica_data.select_dtypes(include=[np.number]).fillna(0)

#Split data into 80% train, 20% validation split
#(random_state is fixed so the split is reproducible across runs)
X_train, X_test, y_train, y_test = train_test_split(costa_rica_data.values, costa_rica_target.values, test_size= 0.2, random_state=42)

In [3]:
#Extract feature names
#Map every column name to its positional index in the cleaned frame.
cols = costa_rica_data.columns.values
dicts = {name: position for position, name in enumerate(cols)}

#Positional indices of the columns to keep (hard-coded selection).
selected_positions = {2, 94, 98, 109, 118, 131, 132, 133, 134, 135}

#Collect the matching names, preserving the frame's column order.
features = [name for name, position in dicts.items() if position in selected_positions]

In [4]:
#Fit Random Forest with these most important features
def _as_percent(fraction):
    """Format a fraction (e.g. 0.9647) as a percentage string rounded to 2 decimals."""
    return str(round(fraction*100,2))+'%'


#Re-split using only the selected feature columns (same 80/20 split and seed).
X_train, X_test, y_train, y_test = train_test_split(
    costa_rica_data[features].values,
    costa_rica_target.values,
    test_size=0.2,
    random_state=42,
)

clf = RandomForestClassifier(
    n_estimators=350,
    criterion='entropy',
    max_depth=15,
    random_state=42,
    max_features=7,
)
#fit() returns the estimator itself, so `model` and `clf` are the same object.
model = clf.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

#Accuracy on the training split and on the held-out validation split.
train_score = model.score(X_train, y_train)
val_score = model.score(X_test, y_test)

print('Random Forest Train Accuracy: '+_as_percent(train_score))
print('Random Forest Validation Accuracy: '+_as_percent(val_score))
print('Train/Test Delta: '+_as_percent(train_score - val_score))

Random Forest Train Accuracy: 96.47%
Random Forest Validation Accuracy: 93.36%
Train/Test Delta: 3.11%


In [5]:
#Visualize the Random Forest
#graphviz is needed by the export cell below; `tree` is re-imported as before.
from sklearn import tree
import graphviz
#No re-fit here: `clf` (also bound to `model`) was already fitted on
#X_train/y_train when it was scored. Fitting it again on the same data with the
#same random_state would only repeat the 350-tree training and replace the
#scored estimators with freshly built ones.

In [6]:
#Export as dot file
#Pick one tree of the fitted forest to draw (index 5, an arbitrary choice).
estimator = model.estimators_[5]

#With out_file=None export_graphviz returns the DOT source as a string; when a
#file name is passed it writes the file and returns None, which would leave
#`data` (and the graphviz.Source built from it) empty.
data = export_graphviz(estimator, out_file=None,
                feature_names=costa_rica_data[features].columns.values,
                filled=True,
                rounded=True)

#Still write tree.dot to disk: the next cell loads it with pydot.
with open('tree.dot', 'w') as dot_file:
    dot_file.write(data)

graph = graphviz.Source(data, format="png")

In [7]:
#Load the exported DOT file with pydot.
#graph_from_dot_file returns a list of Dot graphs (see the output below),
#so `graph` is a list here, not a single graph.
dot_path = 'tree.dot'
graph = pydot.graph_from_dot_file(dot_path)
graph

[<pydot.Dot at 0x1227bf490>]

In [8]:
#Online DOT-to-PNG converter (can be used to turn tree.dot into an image): https://onlineconvertfree.com/convert-format/dot-to-png/