In [101]:
import pandas as pd 
import numpy as np 
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.tree import export_graphviz 
from sklearn.model_selection import KFold
from pydot import pydot
from pydot import dot_parser
from sklearn.model_selection import RandomizedSearchCV

In [89]:
food = pd.read_csv("food_cleaned.csv")
company = pd.read_csv("avg_week_donation.csv")

In [148]:
company.columns

Index(['company', 'avg_donation', 'max_donation', 'med_donation',
       'min_donation', 'num_donation', 'num_weeks', 'one_donation',
       'popular_month_frac', 'popular_weekday_frac', 'start_month',
       'tot_donation', 'week_donation', 'top_contributor',
       'max_weekday_Thursday', 'max_weekday_Tuesday', 'max_weekday_Friday',
       'max_weekday_Wednesday', 'pop_month_4', 'pop_month_1', 'pop_month_5',
       'pop_month_11', 'pop_month_10', 'pop_month_7', 'pop_month_2',
       'pop_month_3', 'pop_month_12', 'pop_month_8', 'pop_month_6',
       'pop_weekday_Wednesday', 'pop_weekday_Thursday', 'pop_weekday_Friday',
       'pop_weekday_Tuesday', 'size_50.0', 'size_5000.0', 'size_200.0',
       'size_10001.0', 'size_1000.0', 'size_10.0', 'size_10000.0',
       'state_California', 'state_underdeveloped', 'type_comm', 'type_bus',
       'type_fin', 'type_applied', 'type_sale', 'type_food', 'type_edu-health',
       'type_web', 'type_soft'],
      dtype='object')

In [149]:
#features = food.drop(["Unnamed: 0", "Id", "date", "number", "street", "city", "company", "zip", "lat", "lon", "pickup", "planned", "day", "year"], axis = 1)
#features.loc[(features["state"] != "California") & (features["state"] != "New York"), "state"] = "underdeveloped"
features = company.drop(["company", "top_contributor", "one_donation", "num_donation"], axis = 1) 

In [150]:
features

Unnamed: 0,avg_donation,max_donation,med_donation,min_donation,num_weeks,popular_month_frac,popular_weekday_frac,start_month,tot_donation,week_donation,...,state_underdeveloped,type_comm,type_bus,type_fin,type_applied,type_sale,type_food,type_edu-health,type_web,type_soft
0,27.000000,27.00,27.000,27.00,1,1.000000,1.000000,9,27.00,27.000000,...,0,0,0,0,0,0,0,0,0,0
1,14.947368,70.00,8.000,3.00,11,0.473684,0.526316,3,284.00,25.818182,...,0,1,0,0,0,0,0,0,0,0
2,26.250000,50.00,22.500,10.00,15,0.500000,0.750000,2,105.00,7.000000,...,0,0,1,0,0,0,0,0,0,0
3,72.625000,200.00,57.500,8.00,18,0.375000,0.250000,1,581.00,32.277778,...,0,0,0,1,0,0,0,0,0,0
4,95.875000,200.00,80.000,12.00,15,0.375000,1.000000,2,767.00,51.133333,...,0,0,0,1,0,0,0,0,0,0
5,11.886364,50.00,10.000,4.00,49,0.113636,0.977273,1,523.00,10.673469,...,0,0,0,0,1,0,0,0,0,0
6,62.400000,90.00,70.000,24.00,14,0.400000,0.600000,1,312.00,22.285714,...,0,0,0,0,0,1,0,0,0,0
7,127.918919,350.00,100.000,14.00,49,0.135135,1.000000,1,4733.00,96.591837,...,0,0,0,0,0,0,1,0,0,0
8,47.857143,75.00,40.000,30.00,26,0.428571,0.428571,2,335.00,12.884615,...,0,0,0,0,0,0,0,1,0,0
9,27.076923,70.00,20.000,12.00,12,0.384615,0.769231,8,352.00,29.333333,...,0,0,0,0,0,1,0,0,0,0


In [151]:
y = company["top_contributor"]
#features = pd.get_dummies(features)
feature_list = list(features.columns)
features = np.array(features)

In [152]:
train_features, test_features, train_labels, test_labels = train_test_split(features, y, test_size = 0.25, random_state = 23156)

In [154]:
rf = RandomForestClassifier(n_estimators = 1000, random_state = 2756)
rf.fit(train_features, train_labels);

In [155]:
predictions = rf.predict(test_features)
predictions

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0], dtype=int64)

In [156]:
sum(test_labels == predictions)/len(test_labels)

0.9583333333333334

In [157]:
importances = list(rf.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: tot_donation         Importance: 0.17
Variable: popular_month_frac   Importance: 0.15
Variable: num_weeks            Importance: 0.13
Variable: popular_weekday_frac Importance: 0.1
Variable: min_donation         Importance: 0.09
Variable: week_donation        Importance: 0.07
Variable: max_donation         Importance: 0.05
Variable: start_month          Importance: 0.05
Variable: avg_donation         Importance: 0.03
Variable: med_donation         Importance: 0.03
Variable: max_weekday_Friday   Importance: 0.02
Variable: max_weekday_Wednesday Importance: 0.01
Variable: pop_month_4          Importance: 0.01
Variable: pop_month_3          Importance: 0.01
Variable: pop_weekday_Friday   Importance: 0.01
Variable: size_50.0            Importance: 0.01
Variable: size_1000.0          Importance: 0.01
Variable: max_weekday_Thursday Importance: 0.0
Variable: max_weekday_Tuesday  Importance: 0.0
Variable: pop_month_1          Importance: 0.0
Variable: pop_month_5          Importance: 

In [158]:
tree = rf.estimators_[5]
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)
(graph, ) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png')

NameError: name 'dot_parser' is not defined

In [159]:
dot_parser.parse_dot_data("tree.dot")

TypeError: must be str, not ParseException

In [160]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ["auto", "sqrt"]
max_depth = [int(x) for x in np.linspace(start = 3, stop = 60, num = 20)] 
#max_depth.append(None)
min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 10, num = 10)]
min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 5, num = 5)]
bootstrap = [True, False] 

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [166]:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,
                              n_iter = 500, cv = 5, verbose = 2, random_state = 6819,
                              n_jobs = -1)
rf_random.fit(train_features, train_labels)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 16.4min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed: 22.4min
[Parallel(n_jobs=-1)]: Done 2500 out of 2500 | elapsed: 27.7min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=500, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57, 60], 'min_samples_split': [2, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_leaf': [1, 2, 3, 4, 5], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=6819, refit=True,
          return_train_score='warn'

In [167]:
rf_random.best_params_

{'bootstrap': False,
 'max_depth': 27,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 2000}

In [168]:
rf = RandomForestClassifier(n_estimators = 2000, 
                            bootstrap = False,
                            max_depth = 27,
                            max_features = "auto",
                            min_samples_leaf = 2,
                            min_samples_split = 2,
                            random_state = 23156)
rf.fit(train_features, train_labels);

In [169]:
importances = list(rf.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: tot_donation         Importance: 0.2
Variable: popular_month_frac   Importance: 0.16
Variable: num_weeks            Importance: 0.12
Variable: popular_weekday_frac Importance: 0.11
Variable: min_donation         Importance: 0.09
Variable: week_donation        Importance: 0.07
Variable: start_month          Importance: 0.06
Variable: max_donation         Importance: 0.04
Variable: avg_donation         Importance: 0.03
Variable: med_donation         Importance: 0.03
Variable: max_weekday_Friday   Importance: 0.02
Variable: max_weekday_Wednesday Importance: 0.01
Variable: pop_month_3          Importance: 0.01
Variable: pop_weekday_Friday   Importance: 0.01
Variable: size_50.0            Importance: 0.01
Variable: size_1000.0          Importance: 0.01
Variable: max_weekday_Thursday Importance: 0.0
Variable: max_weekday_Tuesday  Importance: 0.0
Variable: pop_month_4          Importance: 0.0
Variable: pop_month_1          Importance: 0.0
Variable: pop_month_5          Importance: 0

In [170]:
sum(rf.predict(test_features) == test_labels)/len(test_labels)

0.9583333333333334

Getting a bunch of different parameters and getting same accuracy, dataset probably too small 