In [None]:
# Mount Google Drive (Colab only) so files under /content/drive can be read below
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, recall_score, f1_score, r2_score, mean_squared_error

from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE, RFECV

# Propensity Model

In [None]:
# Load the already-manipulated modelling table from the project folder
path = '/content/drive/My Drive/' # set to your local project path
csv_location = path + 'df_original.csv'
df_original = pd.read_csv(csv_location)

In [None]:
# Fitting data for the propensity model: every row whose Year is not 2023
not_2023 = df_original['Year'].ne(2023)
df_prop = df_original.loc[not_2023]
df_prop

Unnamed: 0,CONSTITUENTLOOKUPID,USR_STDNT_IND,USR_CMMTT_MBR_IND,USR_STAFF_IND,USR_FRIEND_IND,USR_UM_OTHER_DGR_IND,DEN_IND,LAW_IND,NUR_IND,PHR_IND,...,ETHNICITY_Native American,ETHNICITY_Not Indic,ETHNICITY_Other,ETHNICITY_Unknown,ETHNICITY_White,Capacity_Scale,GENDER_Male,GENDER_Other,GENDER_Unknown,DonationThisYear
301622,1002090.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,8.0,1.0,0.0,0.0,1
301623,1002363.0,0,1,0,1,0,0,1,0,0,...,0.0,0.0,0.0,1.0,0.0,6.0,0.0,0.0,1.0,0
301624,1002545.0,0,1,0,1,0,0,1,0,0,...,0.0,0.0,0.0,1.0,0.0,7.0,0.0,0.0,1.0,0
301625,1012666.0,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,1.0,0.0,7.0,0.0,0.0,0.0,1
301626,1016659.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,6.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1259335,991016.0,0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,1.0,0.0,7.0,0.0,0.0,0.0,0
1259336,991387.0,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,3.0,1.0,0.0,0.0,0
1259337,991405.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,7.0,0.0,0.0,0.0,0
1259338,992461.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,4.0,1.0,0.0,0.0,1


In [None]:
# Prediction data for the propensity model: only the rows whose Year is 2023
is_2023 = df_original['Year'].eq(2023)
df_prop_pred = df_original.loc[is_2023]
df_prop_pred

Unnamed: 0,CONSTITUENTLOOKUPID,USR_STDNT_IND,USR_CMMTT_MBR_IND,USR_STAFF_IND,USR_FRIEND_IND,USR_UM_OTHER_DGR_IND,DEN_IND,LAW_IND,NUR_IND,PHR_IND,...,ETHNICITY_Native American,ETHNICITY_Not Indic,ETHNICITY_Other,ETHNICITY_Unknown,ETHNICITY_White,Capacity_Scale,GENDER_Male,GENDER_Other,GENDER_Unknown,DonationThisYear
0,1002090.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,8.0,1.0,0.0,0.0,0
1,1002363.0,0,1,0,1,0,0,1,0,0,...,0.0,0.0,0.0,1.0,0.0,6.0,0.0,0.0,1.0,0
2,1002545.0,0,1,0,1,0,0,1,0,0,...,0.0,0.0,0.0,1.0,0.0,7.0,0.0,0.0,1.0,0
3,1012666.0,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,1.0,0.0,7.0,0.0,0.0,0.0,0
4,1016659.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,6.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301617,984322.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,1.0,1
301618,991016.0,0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,1.0,0.0,7.0,0.0,0.0,0.0,0
301619,991387.0,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,3.0,1.0,0.0,0.0,1
301620,991405.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,7.0,0.0,0.0,0.0,0


## Random Forest

### Fit model

In [None]:
# Columns fed to the random forest propensity model.
# The same list selects both the training matrix and the 2023 prediction matrix,
# so the order here is the column order the model sees.
features_selected_rf = [
    'y_f_Firstdonation',
    'log_amount_1y_prior',
    'log_amount_2y_prior',
    'log_max_amount',
    'log_amount_3y_prior',
    'comp_age',
    'log_first_amount',
    'log_amount_4y_prior',
    'Capacity_Scale',
    'log_amount_5y_prior',
    'log_amount_6y_prior',
    'log_amount_7y_prior',
    'log_amount_8y_prior',
    'log_amount_9y_prior',
    'log_amount_10y_prior',
]

In [None]:
# define X and y
X = df_prop[features_selected_rf]
y = df_prop['DonationThisYear']

# hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
# NOTE: X, y and the four split variables are reassigned by the later model sections,
# so the evaluation cell for this model has to run before those cells do.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# n_jobs=-1 grows the 200 trees on all available cores. The per-tree seeds still come
# from random_state, so the fitted forest is unchanged; it only finishes sooner.
# (The single-threaded fit was interrupted by hand in the saved run.)
rfc = RandomForestClassifier(n_estimators=200, random_state=0, max_features = 'sqrt', n_jobs=-1)

rfc.fit(X_train, y_train)

# hard 0/1 predictions on the held-out rows, consumed by the evaluation cell
rfc_predict = rfc.predict(X_test)

KeyboardInterrupt: ignored

In [None]:
# evaluation: compute the three summaries first, then print them in the usual layout
rf_conf_matrix = confusion_matrix(y_test, rfc_predict)
rf_class_report = classification_report(y_test, rfc_predict)
# AUC is computed from column 1 of predict_proba rather than from the hard labels
rf_auc = roc_auc_score(y_test, rfc.predict_proba(X_test)[:,1])

print("=== Confusion Matrix ===")
print(rf_conf_matrix)
print('\n')
print("=== Classification Report ===")
print(rf_class_report)
print('\n')
print("=== AUC ===")
print(rf_auc)

### Predict 2023

In [None]:
# 2023 feature matrix, restricted to the columns the random forest was fitted on
X_2023_prop_rf = df_prop_pred.loc[:, features_selected_rf]

In [None]:
# prediction: class probabilities for every 2023 row
# (column 1 is the one kept as 'Prob_Pred' in the next cell)
Prob_Pred_rf = rfc.predict_proba(X_2023_prop_rf)

In [None]:
# Pair each 2023 constituent ID with its random forest probability (column 1)
ids_2023_rf = df_prop_pred['CONSTITUENTLOOKUPID'].tolist()
donate_prob_rf = Prob_Pred_rf[:,1]
prop_pred_23_rf = pd.DataFrame({'CONSTITUENTLOOKUPID': ids_2023_rf,
                                'Prob_Pred': donate_prob_rf})
prop_pred_23_rf

### Output prediction result (optional)

In [None]:
# prop_pred_23_rf.to_csv('prop_pred_23(rf).csv', index = False)

## LightGBM

### Fit model

In [None]:
# Columns fed to the LightGBM propensity model.
# The same list selects both the training matrix and the 2023 prediction matrix.
features_selected_lgb = [
    'USR_CMMTT_MBR_IND',
    'USR_STAFF_IND',
    'USR_FRIEND_IND',
    'comp_age',
    'USR_ALUM_IND',
    'USR_PRNT_IND',
    'USR_FACL_IND',
    'USR_ACTV_EMAIL_IND',
    'y_f_Firstdonation',
    'log_max_amount',
    'log_first_amount',
    'log_amount_1y_prior',
    'log_amount_2y_prior',
    'log_amount_3y_prior',
    'log_amount_4y_prior',
    'log_amount_5y_prior',
    'log_amount_6y_prior',
    'log_amount_7y_prior',
    'log_amount_8y_prior',
    'log_amount_9y_prior',
    'log_amount_10y_prior',
    'Capacity_Scale',
    'GENDER_Male',
]

In [None]:
# Feature matrix and binary target for the LightGBM propensity model
X = df_prop.loc[:, features_selected_lgb]
y = df_prop.loc[:, 'DonationThisYear']

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# hyper-parameters kept together so they are easy to spot and tune
lgbm_params = {'learning_rate': 0.1, 'num_leaves': 127}
lgbm = LGBMClassifier(**lgbm_params)

lgbm.fit(X_train, y_train)

# hard-label predictions on the held-out rows, consumed by the evaluation cell
lgbm_predict = lgbm.predict(X_test)

In [None]:
# evaluation: compute the three summaries first, then print them in the usual layout
lgbm_conf_matrix = confusion_matrix(y_test, lgbm_predict)
lgbm_class_report = classification_report(y_test, lgbm_predict)
# AUC is computed from column 1 of predict_proba rather than from the hard labels
lgbm_auc = roc_auc_score(y_test, lgbm.predict_proba(X_test)[:,1])

print("=== Confusion Matrix ===")
print(lgbm_conf_matrix)
print('\n')
print("=== Classification Report ===")
print(lgbm_class_report)
print('\n')
print("=== AUC ===")
print(lgbm_auc)

### Predict 2023

In [None]:
# 2023 feature matrix, restricted to the columns LightGBM was fitted on
X_2023_prop_lgb = df_prop_pred.loc[:, features_selected_lgb]

In [None]:
# prediction: class probabilities for every 2023 row
# (column 1 is the one kept as 'Prob_Pred' in the next cell)
Prob_Pred_lgb = lgbm.predict_proba(X_2023_prop_lgb)

In [None]:
# Pair each 2023 constituent ID with its LightGBM probability (column 1)
ids_2023_lgb = df_prop_pred['CONSTITUENTLOOKUPID'].tolist()
donate_prob_lgb = Prob_Pred_lgb[:,1]
prop_pred_23_lgb = pd.DataFrame({'CONSTITUENTLOOKUPID': ids_2023_lgb,
                                 'Prob_Pred': donate_prob_lgb})
prop_pred_23_lgb

### Output prediction result (optional)

In [None]:
# prop_pred_23_lgb.to_csv('prop_pred_23(lgb).csv', index = False)

# Amount Model

In [None]:
# get data for amount model fitting

# keep rows from years other than 2023 that have a positive log amount for the year
amt = df_original[df_original['Year'].ne(2023) & df_original['log_amount_thisyear'].gt(0)]

# Drop (not clip) every row where the current-year amount or any of the ten
# prior-year amounts reaches log(25000 + 1).
log_amount_cap = np.log(25000+1)
capped_columns = [
    'log_amount_thisyear',
    'log_amount_1y_prior',
    'log_amount_2y_prior',
    'log_amount_3y_prior',
    'log_amount_4y_prior',
    'log_amount_5y_prior',
    'log_amount_6y_prior',
    'log_amount_7y_prior',
    'log_amount_8y_prior',
    'log_amount_9y_prior',
    'log_amount_10y_prior',
]
# a row survives only if every listed column is strictly below the cap
below_cap = (amt[capped_columns] < log_amount_cap).all(axis=1)
df_amt = amt[below_cap]
df_amt

Unnamed: 0,CONSTITUENTLOOKUPID,USR_STDNT_IND,USR_CMMTT_MBR_IND,USR_STAFF_IND,USR_FRIEND_IND,USR_UM_OTHER_DGR_IND,DEN_IND,LAW_IND,NUR_IND,PHR_IND,...,ETHNICITY_Native American,ETHNICITY_Not Indic,ETHNICITY_Other,ETHNICITY_Unknown,ETHNICITY_White,Capacity_Scale,GENDER_Male,GENDER_Other,GENDER_Unknown,DonationThisYear
301622,1002090.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,8.0,1.0,0.0,0.0,1
301625,1012666.0,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,1.0,0.0,7.0,0.0,0.0,0.0,1
301629,1021011.0,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,6.0,1.0,0.0,0.0,1
301630,1025053.0,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,6.0,0.0,0.0,1.0,1
301632,1030449.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,5.0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1259319,950403.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,6.0,1.0,0.0,0.0,1
1259320,951948.0,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,6.0,0.0,0.0,0.0,1
1259332,982039.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,1.0,1
1259334,984322.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,1.0,1


In [None]:
# Prediction data for the amount model: only the rows whose Year is 2023
is_2023_amt = df_original['Year'].eq(2023)
df_amt_pred = df_original.loc[is_2023_amt]
df_amt_pred

Unnamed: 0,CONSTITUENTLOOKUPID,USR_STDNT_IND,USR_CMMTT_MBR_IND,USR_STAFF_IND,USR_FRIEND_IND,USR_UM_OTHER_DGR_IND,DEN_IND,LAW_IND,NUR_IND,PHR_IND,...,ETHNICITY_Native American,ETHNICITY_Not Indic,ETHNICITY_Other,ETHNICITY_Unknown,ETHNICITY_White,Capacity_Scale,GENDER_Male,GENDER_Other,GENDER_Unknown,DonationThisYear
0,1002090.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,8.0,1.0,0.0,0.0,0
1,1002363.0,0,1,0,1,0,0,1,0,0,...,0.0,0.0,0.0,1.0,0.0,6.0,0.0,0.0,1.0,0
2,1002545.0,0,1,0,1,0,0,1,0,0,...,0.0,0.0,0.0,1.0,0.0,7.0,0.0,0.0,1.0,0
3,1012666.0,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,1.0,0.0,7.0,0.0,0.0,0.0,0
4,1016659.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,6.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301617,984322.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,1.0,1
301618,991016.0,0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,1.0,0.0,7.0,0.0,0.0,0.0,0
301619,991387.0,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,3.0,1.0,0.0,0.0,1
301620,991405.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,7.0,0.0,0.0,0.0,0


## Random Forest Regression

In [None]:
# Columns fed to the random forest amount regressor.
# The same list selects both the training matrix and the 2023 prediction matrix,
# so the order here is the column order the model sees.
features_selected_rf_amt = [
    'log_max_amount',
    'log_amount_1y_prior',
    'y_f_Firstdonation',
    'log_amount_2y_prior',
    'comp_age',
    'log_first_amount',
    'log_amount_3y_prior',
    'Capacity_Scale',
    'log_amount_4y_prior',
    'log_amount_5y_prior',
    'log_amount_6y_prior',
    'log_amount_9y_prior',
    'log_amount_7y_prior',
    'log_amount_10y_prior',
    'log_amount_8y_prior',
    'GENDER_Male',
    'USR_FRIEND_IND',
    'USR_PRNT_IND',
    'USR_STAFF_IND',
    'USR_ACTV_EMAIL_IND',
    'GENDER_Unknown',
    'LSA_IND',
    'USR_UM_UG_DGR_IND',
    'USR_UM_GRAD_DGR_IND',
    'USR_MUA_ALUM_IND',
    'USR_ALUM_IND',
    'USR_CMMTT_MBR_IND',
    'ROSS_IND',
    'USR_FACL_IND',
]

In [None]:
# Feature matrix and log-amount target for the amount model
X = df_amt.loc[:, features_selected_rf_amt]
y = df_amt.loc[:, 'log_amount_thisyear']

In [None]:
# train-test-split: hold out 20% of the rows, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# random forest model: fit and predict
# max_features=1.0 (a float, meaning 100% of the features) considers every feature
# at each split. That is what the deprecated 'auto' option meant for regressors, and
# newer scikit-learn releases reject 'auto'. Do not write the integer 1 here: that
# would mean "one feature per split".
rfr = RandomForestRegressor(n_estimators=200, random_state=0, max_features = 1.0)
rfr.fit(X_train, y_train)

# predictions on the held-out rows (log-amount scale, same as y)
rfr_predict = rfr.predict(X_test)

  warn(


In [None]:
# evaluation: R^2 on the held-out test split, measured on the log-amount scale
# (the target y is 'log_amount_thisyear', not the raw amount)
r2_score(y_test, rfr_predict)

0.8296994003039417

## Predict 2023 

In [None]:
# 2023 feature matrix, restricted to the columns the amount regressor was fitted on
X_2023 = df_amt_pred.loc[:, features_selected_rf_amt]

In [None]:
# prediction (on the log-amount scale the regressor was trained on)
predict_2023 = rfr.predict(X_2023)

# Back-transform to an amount. np.expm1(x) computes exp(x) - 1 without the precision
# loss of doing the subtraction separately when x is close to 0.
# NOTE(review): presumably the log columns were built as log(amount + 1) - confirm
# against the data-preparation step.
Amount_Pred = np.expm1(predict_2023)

In [None]:
# Pair each 2023 constituent ID with its predicted donation amount
ids_2023_amt = df_amt_pred['CONSTITUENTLOOKUPID'].tolist()
amt_pred_23 = pd.DataFrame({'CONSTITUENTLOOKUPID': ids_2023_amt,
                            'Amount_Pred': Amount_Pred})
amt_pred_23

Unnamed: 0,CONSTITUENTLOOKUPID,Amount_Pred
0,1002090.0,63.369477
1,1002363.0,224.044398
2,1002545.0,89.346753
3,1012666.0,38.267005
4,1016659.0,18.029937
...,...,...
301617,984322.0,45.859901
301618,991016.0,944.310941
301619,991387.0,66.726100
301620,991405.0,686.466919


## Output prediction result (optional)

In [None]:
# amt_pred_23.to_csv('amt_pred_23.csv', index = False)

# Score & Grade

In [None]:
# Choose either random forest or lightgbm as predicted propensity
# (keep exactly one of the two assignments below active; LightGBM is the one in use)
# prop_pred_23 = prop_pred_23_rf
prop_pred_23 = prop_pred_23_lgb

In [None]:
# concat results
# Both frames are built from plain lists/arrays and so carry a fresh 0..n-1 index,
# which means the concat below pairs rows purely by position. Fail loudly if the
# constituent IDs do not line up, rather than letting join='inner' silently drop rows
# or pair one constituent's probability with another constituent's amount.
if not prop_pred_23['CONSTITUENTLOOKUPID'].equals(amt_pred_23['CONSTITUENTLOOKUPID']):
    raise ValueError('prop_pred_23 and amt_pred_23 do not list the same constituents in the same order')
result = pd.concat([prop_pred_23, amt_pred_23['Amount_Pred']], axis=1, join='inner')

## Compute Donation Score

In [None]:
# Donation Score = predicted probability multiplied by predicted amount, row by row
result['Donation_Score'] = result['Prob_Pred'].mul(result['Amount_Pred'])

## Add AG Grade

In [None]:
# create empirical cumulative distribution of donation score
# (ecdf is called on the same scores further down to get each row's cumulative fraction)
ecdf = sm.distributions.ECDF(result['Donation_Score'])

In [None]:
# grade level definition
def score(x):
    """Map a cumulative-fraction value to a letter grade from 'A' down to 'J'.

    The thresholds step down by 0.1 from 0.9 to 0.1. A value gets the grade of
    the first threshold it strictly exceeds, so exactly 0.9 is a 'B', not an 'A'.
    Anything that exceeds no threshold (including NaN, for which every
    comparison is false) is graded 'J'.
    """
    grade_floors = (
        (0.9, 'A'),
        (0.8, 'B'),
        (0.7, 'C'),
        (0.6, 'D'),
        (0.5, 'E'),
        (0.4, 'F'),
        (0.3, 'G'),
        (0.2, 'H'),
        (0.1, 'I'),
    )
    # floors are listed from highest to lowest, so the first hit is the best grade
    for floor, grade in grade_floors:
        if x > floor:
            return grade
    return 'J'

In [None]:
# Turn every donation score into its cumulative fraction, then grade that fraction
donation_fraction = ecdf(result['Donation_Score'])
result['Donation_normalized'] = donation_fraction
result['AG Grade'] = result['Donation_normalized'].map(score)

In [None]:
# the cumulative fraction was only needed to assign grades; leave it out of the export
result = result.drop(columns=['Donation_normalized'])
# final output
result.to_csv('model_pred.csv', index = False)