In [646]:
import pandas as pd 
import numpy as np 
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.tree import export_graphviz 
from sklearn.model_selection import KFold
from pydot import pydot
#cloned into https://github.com/erocarrera/pydot
#from pydot import dot_parser
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import norm

# Which types of companies are one time donors? 

In [695]:
food.columns

Index(['pounds', 'state_California', 'state_New York', 'state_Underdeveloped',
       'month_1', 'month_10', 'month_11', 'month_12', 'month_2', 'month_3',
       'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
       'year_2017', 'year_2018', 'weekday_Friday', 'weekday_Monday',
       'weekday_Saturday', 'weekday_Thursday', 'weekday_Tuesday',
       'weekday_Wednesday', 'size_10.0', 'size_1000.0', 'size_10000.0',
       'size_10001.0', 'size_200.0', 'size_50.0', 'size_500.0', 'size_5000.0',
       'type_applied', 'type_bus', 'type_comm', 'type_edu-health', 'type_fin',
       'type_food', 'type_other', 'type_sale', 'type_soft', 'type_web',
       'company', 'date'],
      dtype='object')

In [865]:
food = pd.read_csv("no_ell_outliers")
food = food.drop(["Unnamed: 0", "Unnamed: 0.1", "Id", "id", "number", "street", "city", 
                  "zip", "lat", "lon", "pickup", "planned", "month", "day", "year", "weekday"], axis = 1)
company = pd.read_csv("company_full.csv")
#company = company.drop("company", axis = 1)
company["multi_donor"] = 1 - company["one_donation"]
company = company.drop("one_donation", axis = 1)

In [866]:
food["type"].unique()

array(['food', 'other', 'comm', 'fin', 'soft', 'bus', 'edu-health',
       'applied', 'web', 'sale'], dtype=object)

### Combining company types based on proportion 

In [867]:
company_food = company[company["type_food"] == 1]["multi_donor"]
company_other = company[company["type_other"] == 1]["multi_donor"]
company_comm = company[company["type_comm"] == 1]["multi_donor"]
company_fin = company[company["type_fin"] == 1]["multi_donor"]
company_soft = company[company["type_soft"] == 1]["multi_donor"]
company_bus = company[company["type_bus"] == 1]["multi_donor"]
company_eh = company[company["type_edu-health"] == 1]["multi_donor"]
company_applied = company[company["type_applied"] == 1]["multi_donor"]
company_web = company[company["type_web"] == 1]["multi_donor"]
company_sale = company[company["type_sale"] == 1]["multi_donor"]
dists_name = ["Food", "Other", "Communication", "Finance", "Software", 
             "Business", "Education-Health", "Applied", "Web", "Sale"]
dists = [company_food, company_other, company_comm, company_fin, company_soft,
        company_bus, company_eh, company_applied, company_web, company_sale]

In [868]:
import random
props = []
for i in range(len(dists)):
    dist = [sum(dists[i].sample(1000, replace = True))/1000 for j in range(1000)]
    props.append(sum(dist)/1000)
props = np.array(props)

In [869]:
print("Company Type and Proportions:")
for i in range(len(props)):
    print(food["type"].unique()[i], props[i])

Company Type and Proportions:
food 0.628778
other 0.64721
comm 0.577347
fin 0.703347
soft 0.812676
bus 0.710036
edu-health 0.571004
applied 0.545552
web 0.578562
sale 0.646961


In [870]:
print()
for i in range(1, len(dists)): 
    n1 = 1000
    n2 = 1000
    p1 = props[0]
    p2 = props[i]
    p = (n1 * p1 + n2 * p2)/(n1 + n2)
    z = (p1 - p2)/(p * (1 - p) * (1/n1 + 1/n2))**0.5
    print("Food and " + dists_name[i] + ': {0:0.2f}'.format(
     1 - norm.cdf(abs(z))))


Food and Other: 0.20
Food and Communication: 0.01
Food and Finance: 0.00
Food and Software: 0.00
Food and Business: 0.00
Food and Education-Health: 0.00
Food and Applied: 0.00
Food and Web: 0.01
Food and Sale: 0.20


In [871]:
for i in range(2, len(dists)): 
    n1 = 1000
    n2 = 1000
    p1 = props[1]
    p2 = props[i]
    p = (n1 * p1 + n2 * p2)/(n1 + n2)
    z = (p1 - p2)/(p * (1 - p) * (1/n1 + 1/n2))**0.5
    print("Other and " + dists_name[i] + ': {0:0.2f}'.format(
     1 - norm.cdf(abs(z))))

Other and Communication: 0.00
Other and Finance: 0.00
Other and Software: 0.00
Other and Business: 0.00
Other and Education-Health: 0.00
Other and Applied: 0.00
Other and Web: 0.00
Other and Sale: 0.50


In [872]:
for i in range(3, len(dists)): 
    n1 = 1000
    n2 = 1000
    p1 = props[2]
    p2 = props[i]
    p = (n1 * p1 + n2 * p2)/(n1 + n2)
    z = (p1 - p2)/(p * (1 - p) * (1/n1 + 1/n2))**0.5
    print("Communication and " + dists_name[i] + ': {0:0.2f}'.format(
      1 - norm.cdf(abs(z))))

Communication and Finance: 0.00
Communication and Software: 0.00
Communication and Business: 0.00
Communication and Education-Health: 0.39
Communication and Applied: 0.08
Communication and Web: 0.48
Communication and Sale: 0.00


In [873]:
for i in range(4, len(dists)): 
    n1 = 1000
    n2 = 1000
    p1 = props[3]
    p2 = props[i]
    p = (n1 * p1 + n2 * p2)/(n1 + n2)
    z = (p1 - p2)/(p * (1 - p) * (1/n1 + 1/n2))**0.5
    print("Finance and " + dists_name[i] + ': {0:0.2f}'.format(
      1 - norm.cdf(abs(z))))

Finance and Software: 0.00
Finance and Business: 0.37
Finance and Education-Health: 0.00
Finance and Applied: 0.00
Finance and Web: 0.00
Finance and Sale: 0.00


In [874]:
for i in range(5, len(dists)): 
    n1 = 1000
    n2 = 1000
    p1 = props[4]
    p2 = props[i]
    p = (n1 * p1 + n2 * p2)/(n1 + n2)
    z = (p1 - p2)/(p * (1 - p) * (1/n1 + 1/n2))**0.5
    print("Software and " + dists_name[i] + ': {0:0.2f}'.format(
      1 - norm.cdf(abs(z))))

Software and Business: 0.00
Software and Education-Health: 0.00
Software and Applied: 0.00
Software and Web: 0.00
Software and Sale: 0.00


In [875]:
for i in range(6, len(dists)): 
    n1 = 1000
    n2 = 1000
    p1 = props[5]
    p2 = props[i]
    p = (n1 * p1 + n2 * p2)/(n1 + n2)
    z = (p1 - p2)/(p * (1 - p) * (1/n1 + 1/n2))**0.5
    print("Business and " + dists_name[i] + ': {0:0.2f}'.format(
      1 - norm.cdf(abs(z))))

Business and Education-Health: 0.00
Business and Applied: 0.00
Business and Web: 0.00
Business and Sale: 0.00


In [876]:
for i in range(7, len(dists)): 
    n1 = 1000
    n2 = 1000
    p1 = props[6]
    p2 = props[i]
    p = (n1 * p1 + n2 * p2)/(n1 + n2)
    z = (p1 - p2)/(p * (1 - p) * (1/n1 + 1/n2))**0.5
    print("Education-Health and " + dists_name[i] + ': {0:0.2f}'.format(
      1 - norm.cdf(abs(z))))

Education-Health and Applied: 0.13
Education-Health and Web: 0.37
Education-Health and Sale: 0.00


In [877]:
for i in range(8, len(dists)): 
    n1 = 1000
    n2 = 1000
    p1 = props[7]
    p2 = props[i]
    p = (n1 * p1 + n2 * p2)/(n1 + n2)
    z = (p1 - p2)/(p * (1 - p) * (1/n1 + 1/n2))**0.5
    print("Applied and " + dists_name[i] + ': {0:0.2f}'.format(
      1 - norm.cdf(abs(z))))

Applied and Web: 0.07
Applied and Sale: 0.00


In [878]:
for i in range(9, len(dists)): 
    n1 = 1000
    n2 = 1000
    p1 = props[8]
    p2 = props[i]
    p = (n1 * p1 + n2 * p2)/(n1 + n2)
    z = (p1 - p2)/(p * (1 - p) * (1/n1 + 1/n2))**0.5
    print("Web and " + dists_name[i] + ': {0:0.2f}'.format(
      1 - norm.cdf(abs(z))))

Web and Sale: 0.00


In [879]:
company.columns

Index(['company', 'size_10.0', 'size_1000.0', 'size_10000.0', 'size_10001.0',
       'size_200.0', 'size_50.0', 'size_500.0', 'size_5000.0',
       'state_California', 'state_New York', 'state_underdeveloped',
       'type_applied', 'type_bus', 'type_comm', 'type_edu-health', 'type_fin',
       'type_food', 'type_other', 'type_sale', 'type_soft', 'type_web',
       'multi_donor'],
      dtype='object')

In [880]:
company["food_other_sale"] = company.iloc[:, [17, 18, 19]].sum(axis = 1)
company["comm_edu/health_web"] = company.iloc[:, [14, 15, 21]].sum(axis = 1)
company["fin_bus"] = company.iloc[:, [13, 16]].sum(axis = 1)
company = company.drop([ "type_food", "type_other", "type_sale", "type_comm", 
                       "type_edu-health", "type_web", "type_fin", "type_bus"], axis = 1)

### Combining company sizes based on proportions

In [881]:
company.columns

Index(['company', 'size_10.0', 'size_1000.0', 'size_10000.0', 'size_10001.0',
       'size_200.0', 'size_50.0', 'size_500.0', 'size_5000.0',
       'state_California', 'state_New York', 'state_underdeveloped',
       'type_applied', 'type_soft', 'multi_donor', 'food_other_sale',
       'comm_edu/health_web', 'fin_bus'],
      dtype='object')

In [882]:
company_10 = company[company["size_10.0"] == 1]["multi_donor"]
company_50 = company[company["size_50.0"] == 1]["multi_donor"]
company_200 = company[company["size_200.0"] == 1]["multi_donor"]
company_500 = company[company["size_500.0"] == 1]["multi_donor"]
company_1000 = company[company["size_1000.0"] == 1]["multi_donor"]
company_5000 = company[company["size_5000.0"] == 1]["multi_donor"]
company_10000 = company[company["size_10000.0"] == 1]["multi_donor"]
company_10001 = company[company["size_10001.0"] == 1]["multi_donor"]
dists_name = ["10", "50", "200", "500", "1000", 
             "5000", "10000", "10001"]
dists = [company_10, company_50, company_200, company_500, company_1000,
        company_5000, company_10000, company_10001]

In [883]:
import random
props = []
for i in range(len(dists)):
    dist = [sum(dists[i].sample(1000, replace = True))/1000 for j in range(1000)]
    props.append(sum(dist)/1000)
props = np.array(props)

In [884]:
print("Company Size and Proportions:")
for i in range(len(props)):
    print(dists_name[i] + ": ", props[i])

Company Size and Proportions:
10:  0.583443
50:  0.593456
200:  0.661663
500:  0.644822
1000:  0.899745
5000:  0.590446
10000:  0.499443
10001:  0.443786


In [885]:
for i in range(1, len(dists)): 
    n1 = 1000
    n2 = 1000
    p1 = props[0]
    p2 = props[i]
    p = (n1 * p1 + n2 * p2)/(n1 + n2)
    z = (p1 - p2)/(p * (1 - p) * (1/n1 + 1/n2))**0.5
    print("10 and " + dists_name[i] + ': {0:0.2f}'.format(
     1 - norm.cdf(abs(z))))

10 and 50: 0.32
10 and 200: 0.00
10 and 500: 0.00
10 and 1000: 0.00
10 and 5000: 0.38
10 and 10000: 0.00
10 and 10001: 0.00


In [886]:
for i in range(2, len(dists)): 
    n1 = 1000
    n2 = 1000
    p1 = props[1]
    p2 = props[i]
    p = (n1 * p1 + n2 * p2)/(n1 + n2)
    z = (p1 - p2)/(p * (1 - p) * (1/n1 + 1/n2))**0.5
    print("50 and " + dists_name[i] + ': {0:0.2f}'.format(
     1 - norm.cdf(abs(z))))

50 and 200: 0.00
50 and 500: 0.01
50 and 1000: 0.00
50 and 5000: 0.45
50 and 10000: 0.00
50 and 10001: 0.00


In [887]:
for i in range(3, len(dists)): 
    n1 = 1000
    n2 = 1000
    p1 = props[2]
    p2 = props[i]
    p = (n1 * p1 + n2 * p2)/(n1 + n2)
    z = (p1 - p2)/(p * (1 - p) * (1/n1 + 1/n2))**0.5
    print("200 and " + dists_name[i] + ': {0:0.2f}'.format(
     1 - norm.cdf(abs(z))))

200 and 500: 0.21
200 and 1000: 0.00
200 and 5000: 0.00
200 and 10000: 0.00
200 and 10001: 0.00


In [888]:
for i in range(4, len(dists)): 
    n1 = 1000
    n2 = 1000
    p1 = props[3]
    p2 = props[i]
    p = (n1 * p1 + n2 * p2)/(n1 + n2)
    z = (p1 - p2)/(p * (1 - p) * (1/n1 + 1/n2))**0.5
    print("500 and " + dists_name[i] + ': {0:0.2f}'.format(
     1 - norm.cdf(abs(z))))

500 and 1000: 0.00
500 and 5000: 0.01
500 and 10000: 0.00
500 and 10001: 0.00


In [889]:
for i in range(5, len(dists)): 
    n1 = 1000
    n2 = 1000
    p1 = props[4]
    p2 = props[i]
    p = (n1 * p1 + n2 * p2)/(n1 + n2)
    z = (p1 - p2)/(p * (1 - p) * (1/n1 + 1/n2))**0.5
    print("1000 and " + dists_name[i] + ': {0:0.2f}'.format(
     1 - norm.cdf(abs(z))))

1000 and 5000: 0.00
1000 and 10000: 0.00
1000 and 10001: 0.00


In [890]:
for i in range(6, len(dists)): 
    n1 = 1000
    n2 = 1000
    p1 = props[5]
    p2 = props[i]
    p = (n1 * p1 + n2 * p2)/(n1 + n2)
    z = (p1 - p2)/(p * (1 - p) * (1/n1 + 1/n2))**0.5
    print("5000 and " + dists_name[i] + ': {0:0.2f}'.format(
     1 - norm.cdf(abs(z))))

5000 and 10000: 0.00
5000 and 10001: 0.00


In [891]:
for i in range(7, len(dists)): 
    n1 = 1000
    n2 = 1000
    p1 = props[6]
    p2 = props[i]
    p = (n1 * p1 + n2 * p2)/(n1 + n2)
    z = (p1 - p2)/(p * (1 - p) * (1/n1 + 1/n2))**0.5
    print("10000 and " + dists_name[i] + ': {0:0.2f}'.format(
     1 - norm.cdf(abs(z))))

10000 and 10001: 0.01


In [892]:
company.columns

Index(['company', 'size_10.0', 'size_1000.0', 'size_10000.0', 'size_10001.0',
       'size_200.0', 'size_50.0', 'size_500.0', 'size_5000.0',
       'state_California', 'state_New York', 'state_underdeveloped',
       'type_applied', 'type_soft', 'multi_donor', 'food_other_sale',
       'comm_edu/health_web', 'fin_bus'],
      dtype='object')

In [893]:
company["size_10/50"] = company.iloc[:, [1, 6]].sum(axis = 1)
company["size_200/500"] = company.iloc[:, [5, 7]].sum(axis = 1)
company = company.drop(["size_10.0", "size_50.0", "size_200.0", "size_500.0"], axis = 1)

### Looking at whether a company would be a one time donor before their first donation

In [894]:
#features = food.drop(["Unnamed: 0", "Id", "date", "number", "street", "city", "company", "zip", "lat", "lon", "pickup", "planned", "day", "year"], axis = 1)
#features.loc[(features["state"] != "California") & (features["state"] != "New York"), "state"] = "underdeveloped"
features = company.drop(["multi_donor", "company"], axis = 1) 

In [895]:
company.columns

Index(['company', 'size_1000.0', 'size_10000.0', 'size_10001.0', 'size_5000.0',
       'state_California', 'state_New York', 'state_underdeveloped',
       'type_applied', 'type_soft', 'multi_donor', 'food_other_sale',
       'comm_edu/health_web', 'fin_bus', 'size_10/50', 'size_200/500'],
      dtype='object')

In [896]:
features.shape

(242, 14)

In [897]:
y = company["multi_donor"]
#features = pd.get_dummies(features)
feature_list = list(features.columns)
features = np.array(features)

In [898]:
train_features, test_features, train_labels, test_labels = train_test_split(features, y, test_size = 0.25, random_state = 23156)

In [899]:
rf = RandomForestClassifier(n_estimators = 1000, random_state = 12356)
rf.fit(train_features, train_labels);

In [900]:
predictions = rf.predict(test_features)
predictions

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1], dtype=int64)

In [901]:
sum(test_labels == predictions)/len(test_labels)

0.6885245901639344

In [902]:
importances = list(rf.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: state_underdeveloped Importance: 0.11
Variable: size_200/500         Importance: 0.1
Variable: state_New York       Importance: 0.09
Variable: size_10/50           Importance: 0.09
Variable: state_California     Importance: 0.08
Variable: comm_edu/health_web  Importance: 0.08
Variable: size_1000.0          Importance: 0.07
Variable: fin_bus              Importance: 0.07
Variable: size_5000.0          Importance: 0.06
Variable: type_applied         Importance: 0.06
Variable: food_other_sale      Importance: 0.06
Variable: size_10000.0         Importance: 0.04
Variable: size_10001.0         Importance: 0.04
Variable: type_soft            Importance: 0.04


In [541]:
#tree = rf.estimators_[5]
#export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)
#(graph, ) = graph_from_dot_file('tree.dot')
#graph.write_png('tree.png')
#dot_parser.parse_dot_data("tree.dot")

In [542]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ["auto", "sqrt"]
max_depth = [int(x) for x in np.linspace(start = 3, stop = 60, num = 20)] 
#max_depth.append(None)
min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 10, num = 10)]
min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 5, num = 5)]
bootstrap = [True, False] 

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [903]:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,
                              n_iter = 50, cv = 10, verbose = 2, random_state = 6819,
                              n_jobs = -1)
rf_random.fit(train_features, train_labels)
rf_random.best_params_

Fitting 10 folds for each of 50 candidates, totalling 500 fits


KeyboardInterrupt: 

In [904]:
rf = RandomForestClassifier(bootstrap = True,
                            max_depth = 3,
                            max_features = "auto",
                            min_samples_leaf = 1,
                            min_samples_split = 8,
                            n_estimators = 1000,
                            random_state = 12356)
rf.fit(train_features, train_labels);
sum(rf.predict(test_features) == test_labels)/len(test_labels)

0.7213114754098361

In [905]:
importances = list(rf.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: state_underdeveloped Importance: 0.21
Variable: size_1000.0          Importance: 0.14
Variable: size_10/50           Importance: 0.08
Variable: state_California     Importance: 0.07
Variable: state_New York       Importance: 0.07
Variable: type_applied         Importance: 0.06
Variable: comm_edu/health_web  Importance: 0.06
Variable: fin_bus              Importance: 0.06
Variable: size_200/500         Importance: 0.06
Variable: size_5000.0          Importance: 0.05
Variable: size_10001.0         Importance: 0.04
Variable: type_soft            Importance: 0.04
Variable: food_other_sale      Importance: 0.04
Variable: size_10000.0         Importance: 0.03


In [908]:
from sklearn.metrics import precision_score, recall_score
precision_accuracy = precision_score(test_labels, rf.predict(test_features))
recall_accuracy = recall_score(test_labels, rf.predict(test_features))

print('Precision Score: {0:0.2f}'.format(
      precision_accuracy))
print('Recall Score: {0:0.2f}'.format(
      recall_accuracy))

Precision Score: 0.73
Recall Score: 0.95


In [909]:
preds = rf.predict(test_features)
predicted_one_time = [i for i in range(len(preds)) if preds[i] == 1]
predicted_multi_time = [i for i in range(len(preds)) if preds[i] == 0]
true_one_time = [i for i in range(len(test_labels)) if np.array(test_labels)[i] == 1]
true_multi_time = [i for i in range(len(test_labels)) if np.array(test_labels)[i] == 0]
true_positives = len(np.intersect1d(predicted_one_time, true_one_time))
false_positives = len(np.intersect1d(predicted_one_time, true_multi_time))
true_negatives = len(np.intersect1d(predicted_multi_time, true_multi_time))
false_negatives = len(np.intersect1d(predicted_multi_time, true_one_time))

In [910]:
precision = true_positives/(true_positives + false_positives)
recall = true_positives/(true_positives + false_negatives)

In [911]:
print(precision, recall)

0.7272727272727273 0.9523809523809523


4

### Looking at whether a company would be a one time donor after donation 

In [819]:
food["date"] = pd.to_datetime(food["date"])
food["size"] = food["size"].astype("str")
company2_num_donation = food.groupby("company")["company"].count()
company2_one_donation = (company2_num_donation == 1).astype("int")
company2_size = food.groupby("company")["size"].first()
company2_type = food.groupby("company")["type"].first()
company2_state = food.groupby("company")["state"].first()
company2_last_donation_weeks = (np.max(food["date"]) - food.groupby("company")["date"].agg(np.max)).apply(lambda x: x.days)//7
#company2_last_donation = food[food.groupby("company").date.transform("max") == food["date"]].groupby("company").first()["pounds"]
company2 = pd.DataFrame({"multi_donor": 1 - company2_one_donation,
                       "size": company2_size, "type": company2_type,
                       "state": company2_state, 
#                        "last_donation": company2_last_donation,
                        "last_donation_weeks": company2_last_donation_weeks,
                       })

In [820]:
company2 = pd.get_dummies(company2)

In [821]:
company2.columns

Index(['last_donation_weeks', 'multi_donor', 'size_10.0', 'size_1000.0',
       'size_10000.0', 'size_10001.0', 'size_200.0', 'size_50.0', 'size_500.0',
       'size_5000.0', 'state_California', 'state_New York',
       'state_Underdeveloped', 'type_applied', 'type_bus', 'type_comm',
       'type_edu-health', 'type_fin', 'type_food', 'type_other', 'type_sale',
       'type_soft', 'type_web'],
      dtype='object')

In [839]:
company2["food_other_sale"] = company2.iloc[:, [19, 20, 21]].sum(axis = 1)
company2["comm_edu/health_web"] = company2.iloc[:, [16, 17, 23]].sum(axis = 1)
company2["fin_bus"] = company2.iloc[:, [18, 15]].sum(axis = 1)
company2 = company2.drop([ "type_food", "type_other", "type_sale", "type_comm", 
                       "type_edu-health", "type_web", "type_fin", "type_bus"], axis = 1)
#company2["size_10/50"] = company2.iloc[:, [3, 8]].sum(axis = 1)
#company2["size_200/500"] = company2.iloc[:, [7, 9]].sum(axis = 1)
#company2 = company2.drop(["size_10.0", "size_50.0", "size_200.0", "size_500.0"], axis = 1)

IndexError: positional indexers are out-of-bounds

In [840]:
features = company2.drop(["multi_donor"], axis = 1) 
y = company2["multi_donor"]
#features = pd.get_dummies(features)
feature_list = list(features.columns)
features = np.array(features)

In [841]:
train_features, test_features, train_labels, test_labels = train_test_split(features, y, test_size = 0.25, random_state = 23156)

In [842]:
rf = RandomForestClassifier(n_estimators = 1000, random_state = 12356)
rf.fit(train_features, train_labels);

In [843]:
predictions = rf.predict(test_features)
predictions

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 1])

In [844]:
sum(test_labels == predictions)/len(test_labels)

0.7083333333333334

In [845]:
importances = list(rf.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: last_donation_weeks  Importance: 0.56
Variable: size_50.0            Importance: 0.05
Variable: state_Underdeveloped Importance: 0.04
Variable: comm_edu/health_web  Importance: 0.04
Variable: fin_bus              Importance: 0.04
Variable: size_10.0            Importance: 0.03
Variable: size_1000.0          Importance: 0.03
Variable: size_200.0           Importance: 0.03
Variable: state_California     Importance: 0.03
Variable: state_New York       Importance: 0.03
Variable: type_applied         Importance: 0.03
Variable: food_other_sale      Importance: 0.03
Variable: size_500.0           Importance: 0.02
Variable: size_5000.0          Importance: 0.02
Variable: size_10000.0         Importance: 0.01
Variable: size_10001.0         Importance: 0.01
Variable: type_soft            Importance: 0.01


In [846]:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,
                              n_iter = 50, cv = 10, verbose = 2, random_state = 6819,
                              n_jobs = -1)
rf_random.fit(train_features, train_labels)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   53.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.3min finished


RandomizedSearchCV(cv=10, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=50, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57, 60], 'min_samples_split': [2, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_leaf': [1, 2, 3, 4, 5], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=6819, refit=True,
          return_train_score='warn'

In [847]:
rf_random.best_params_

{'bootstrap': True,
 'max_depth': 3,
 'max_features': 'sqrt',
 'min_samples_leaf': 5,
 'min_samples_split': 7,
 'n_estimators': 800}

In [858]:
rf = RandomForestClassifier(bootstrap = True,
                            max_depth = 3,
                            max_features = "sqrt",
                            min_samples_leaf = 5,
                            min_samples_split = 7,
                            n_estimators = 800,
                            random_state = 12356)
rf.fit(train_features, train_labels);
sum(rf.predict(test_features) == test_labels)/len(test_labels)

0.6458333333333334

In [859]:
importances = list(rf.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: last_donation_weeks  Importance: 0.51
Variable: size_50.0            Importance: 0.09
Variable: state_Underdeveloped Importance: 0.07
Variable: size_10.0            Importance: 0.04
Variable: size_1000.0          Importance: 0.04
Variable: state_New York       Importance: 0.04
Variable: size_200.0           Importance: 0.03
Variable: state_California     Importance: 0.03
Variable: fin_bus              Importance: 0.03
Variable: size_500.0           Importance: 0.02
Variable: size_5000.0          Importance: 0.02
Variable: type_applied         Importance: 0.02
Variable: food_other_sale      Importance: 0.02
Variable: comm_edu/health_web  Importance: 0.02
Variable: type_soft            Importance: 0.01
Variable: size_10000.0         Importance: 0.0
Variable: size_10001.0         Importance: 0.0


In [860]:
preds = rf.predict(test_features)
predicted_one_time = [i for i in range(len(preds)) if preds[i] == 1]
predicted_multi_time = [i for i in range(len(preds)) if preds[i] == 0]
true_one_time = [i for i in range(len(test_labels)) if np.array(test_labels)[i] == 1]
true_multi_time = [i for i in range(len(test_labels)) if np.array(test_labels)[i] == 0]
true_positives = len(np.intersect1d(predicted_one_time, true_one_time))
false_positives = len(np.intersect1d(predicted_one_time, true_multi_time))
true_negatives = len(np.intersect1d(predicted_multi_time, true_multi_time))
false_negatives = len(np.intersect1d(predicted_multi_time, true_one_time))

In [861]:
false_negatives

2

In [862]:
true_positives/(true_positives + false_negatives)

0.9354838709677419

In [863]:
true_positives + false_negatives

31

In [912]:
true_positives/(true_positives + false_positives)

0.7272727272727273