In [438]:
# Imports
%matplotlib inline

import os
import numpy as np
import pandas as pd
import numpy.random as rand
from itertools import islice
from sklearn.ensemble import (GradientBoostingRegressor, 
                              GradientBoostingClassifier, 
                              AdaBoostClassifier,
                              RandomForestClassifier)
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVC
import math
import sklearn.datasets as datasets
# import sklearn.cross_validation as cv
import sklearn.model_selection as cv
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import log_loss, make_scorer, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.datasets import load_breast_cancer, load_iris
from sklearn.linear_model import LogisticRegression

plt.style.use('ggplot')

from pylab import rcParams
rcParams['figure.figsize'] = (12, 8)
# from IPython.display import HTML

In [413]:
def score_matrix(obj):
    name = str(obj.__class__).split(".")[-1][:-2]
    score = obj.score(X_test,y_test)
    y_pred = obj.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel()
    precision = tp / (tp +fp)
    recall = tp / (tp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    print("Model: {}".format(name))
    print("precision = {:.2%}".format(precision))
    print("recall = {:.2%}".format(recall))
    print('accuracy = {:.2%}, score = {:.2%}'.format(accuracy,score))

In [414]:
user_df = pd.read_csv('Data/Donations_File.csv', parse_dates=['Date'])

In [415]:
user_df.drop(columns=['Donation ID','User ID','Givz Everywehre?','To Charity','Advised Charity',
                     'Unnamed: 10','Recurring Status','Advised Charity EIN','To Charity EIN'],inplace=True)

In [416]:
user_df['Event?'] = user_df['Event?'].apply(lambda x: False if math.isnan(x) else True)

In [417]:
# Drop Negative Tip Amounts
user_df = user_df[user_df.Tip >= 0]

In [418]:
#Calculate Proportion of Donations that Include Tip
tip_df = user_df[user_df.Tip > 0]

user_tip_proportion = (tip_df.shape[0])/(user_df.shape[0])

In [419]:
user_df['Tip'] = user_df['Tip'].apply(lambda x: False if x == 0.0 else True)

In [420]:
# Number to Beat!
user_tip_proportion

0.10453648915187377

In [421]:
#user_df['Date'] = pd.to_datetime(user_df['Date'])
user_df['Year'] = user_df['Date'].apply(lambda x: x.year)
user_df['Month'] = user_df['Date'].apply(lambda x: x.month)
user_df['Day'] = user_df['Date'].apply(lambda x: x.day)
user_df['Hour'] = user_df['Date'].apply(lambda x: x.hour)
user_df['Minute'] = user_df['Date'].apply(lambda x: x.minute)


In [422]:
#user_df['Advised Charity EIN'] = user_df['Advised Charity EIN'].apply(lambda x: int(x.replace(',', '')))

In [423]:
user_df['Message to Charity2'] = user_df['Message to Charity2'].apply(lambda x: True if type(x) == str else False)
user_df['Message to Charity1'] = user_df['Message to Charity1'].apply(lambda x: True if type(x) == str else False)
user_df['Message'] = (user_df['Message to Charity2'] == True) | (user_df['Message to Charity1']==True)

In [424]:
user_df.drop(columns=['Date','Message to Charity1','Message to Charity2'],inplace=True)

In [425]:
user_df.head(100)

Unnamed: 0,Amount,In Honor Of?,Event?,Cover fee?,Tip,Year,Month,Day,Hour,Minute,Message
0,25.00,False,False,False,False,2019,9,11,3,17,False
1,20.00,False,False,False,False,2019,9,10,22,55,False
2,21.00,False,False,False,False,2019,9,10,22,55,False
3,140.00,False,False,False,False,2019,9,10,22,29,False
4,1.00,False,False,False,False,2019,9,10,16,52,False
5,5.00,False,False,False,True,2019,9,10,10,13,False
6,1400.00,False,False,False,False,2019,9,10,9,49,False
7,1.00,False,False,False,False,2019,9,9,17,17,False
8,12.00,False,False,False,False,2019,9,9,16,17,False
9,117.00,False,False,False,False,2019,9,9,12,3,False


In [426]:
# df['month'] = df['start_date'].dt.month
# df['year'] = df['start_date'].dt.year
# df['day_of_week'] = df['start_date'].dt.dayofweek
# df['hour'] = df['start_date'].dt.hour
# df.head()

In [427]:
y = user_df['Tip']
X = user_df.drop(columns=['Tip'])

In [428]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Random Forest

In [435]:
forest = RandomForestClassifier(oob_score=True,n_estimators=300)
forest.fit(X_train,y_train)
score_matrix(forest)

Model: RandomForestClassifier
precision = 67.35%
recall = 50.00%
accuracy = 92.27%, score = 92.27%


# Logistic Regression

In [439]:
log_reg = LogisticRegression()
log_reg.fit(X_train,y_train)
score_matrix(log_reg)

Model: LogisticRegression
precision = 41.18%
recall = 10.61%
accuracy = 89.12%, score = 89.12%


