# Charity data analysis

This project was originally developed in R for a machine learning course.
Redeveloping in Python3 as a programming exercise.

The goal is to maximize the return on investment for a targeted mailing by targeting likely donors.

**To do:**
* Can x_df and c_df creation be captured in a function (DRY)?
* Outlier and influential point detection and management
* Missing value handling
* Adjustments for non-normal distributions in predictors and target
* Logistic regression
* GAM
* LDA (not usually used for qualitative variables, but use anyway as given method to try)
* QDA
* KNN classifier
* Decision tree
* Bagging and random forest
* Boosting
* SVC

In [30]:
import numpy as np
import pandas as pd

from sklearn import preprocessing

In [3]:
charity = pd.read_csv('charity.csv')

In [4]:
# Pause EDA until environmental issue with pandas_profiling can be worked out.
# Work from known facts about the data from previous project version; dataset is the same file from 2016.

charity.head(5)

Unnamed: 0,ID,reg1,reg2,reg3,reg4,home,chld,hinc,genf,wrat,...,npro,tgif,lgif,rgif,tdon,tlag,agif,donr,damt,part
0,1,0,0,1,0,1,1,4,1,8,...,20,81,81,19,17,6,21.05,0.0,0.0,train
1,2,0,0,1,0,1,2,4,0,8,...,95,156,16,17,19,3,13.26,1.0,15.0,train
2,3,0,0,1,0,1,1,5,1,8,...,64,86,15,10,22,8,17.37,,,test
3,4,0,0,0,0,1,1,4,0,8,...,51,56,18,7,14,7,9.59,,,test
4,5,0,0,1,0,1,0,4,1,4,...,85,132,15,10,10,6,12.07,1.0,17.0,valid


In [5]:
charity.shape

(8009, 24)

In [6]:
charity.columns

Index(['ID', 'reg1', 'reg2', 'reg3', 'reg4', 'home', 'chld', 'hinc', 'genf',
       'wrat', 'avhv', 'incm', 'inca', 'plow', 'npro', 'tgif', 'lgif', 'rgif',
       'tdon', 'tlag', 'agif', 'donr', 'damt', 'part'],
      dtype='object')

In [7]:
# Dataset pre-prepared with designations for training/validation/testing split.
# Normal response rate is around 10%, and training/validation sets have oversampled donors to address class imbalance.

charity_train = charity.loc[charity['part'] == 'train']

In [8]:
charity_train.shape

(3984, 24)

In [9]:
# Select predictive features, dropping ID value and target.
# Python handles numbers differently, so cols are 1-21.

x_train = charity_train.iloc[:, 1:21]

In [10]:
x_train.head(5)

Unnamed: 0,reg1,reg2,reg3,reg4,home,chld,hinc,genf,wrat,avhv,incm,inca,plow,npro,tgif,lgif,rgif,tdon,tlag,agif
0,0,0,1,0,1,1,4,1,8,302,76,82,0,20,81,81,19,17,6,21.05
1,0,0,1,0,1,2,4,0,8,262,130,130,1,95,156,16,17,19,3,13.26
5,0,1,0,0,1,1,5,0,9,114,17,25,44,83,131,5,3,13,4,4.12
9,0,0,0,0,1,3,4,1,7,200,38,58,5,42,63,12,10,19,3,9.42
11,0,0,0,1,1,3,4,1,6,272,69,69,0,98,169,29,36,23,7,8.97


In [11]:
# Create a label vector to hold donr values

c_train = charity_train.iloc[:, 21]

In [12]:
c_train.head(10)

0     0.0
1     1.0
5     1.0
9     0.0
11    1.0
12    1.0
16    1.0
18    1.0
23    0.0
25    0.0
Name: donr, dtype: float64

In [13]:
c_train_len = len(c_train)
c_train_len

3984

In [14]:
# Create response variable showing donation amounts for known donors.

y_train = charity_train[(charity_train.donr == 1)][['damt']]

In [15]:
y_train_len = len(y_train)
y_train_len

1995

In [16]:
charity_valid = charity.loc[charity['part'] == 'valid']

In [17]:
charity_valid.shape

(2018, 24)

In [18]:
x_valid = charity_valid.iloc[:, 1:21]

In [19]:
x_valid.head()

Unnamed: 0,reg1,reg2,reg3,reg4,home,chld,hinc,genf,wrat,avhv,incm,inca,plow,npro,tgif,lgif,rgif,tdon,tlag,agif
4,0,0,1,0,1,0,4,1,4,295,39,71,14,85,132,15,10,10,6,12.07
6,0,0,0,0,1,3,4,0,8,145,39,42,10,50,74,6,5,22,3,6.5
7,0,0,0,0,1,3,2,0,5,165,34,35,19,11,41,4,2,20,7,3.45
10,0,0,1,0,1,3,2,1,8,152,46,46,20,100,414,25,14,39,7,10.12
13,0,0,0,1,1,0,4,0,8,108,21,36,32,54,117,5,4,15,9,5.11


In [20]:
c_valid = charity_valid.iloc[:, 21]

In [21]:
c_valid.head()

4     1.0
6     0.0
7     0.0
10    0.0
13    1.0
Name: donr, dtype: float64

In [22]:
y_valid = charity_valid[(charity_valid.donr == 1)][['damt']]

In [26]:
y_valid_len = len(y_valid)
y_valid_len

999

In [23]:
charity_test = charity.loc[charity['part'] == 'test']

In [24]:
charity_test.shape

(2007, 24)

In [27]:
x_test = charity_test.iloc[:, 1:21]

In [29]:
x_test.head(5)

Unnamed: 0,reg1,reg2,reg3,reg4,home,chld,hinc,genf,wrat,avhv,incm,inca,plow,npro,tgif,lgif,rgif,tdon,tlag,agif
2,0,0,1,0,1,1,5,1,8,303,61,90,6,64,86,15,10,22,8,17.37
3,0,0,0,0,1,1,4,0,8,317,121,121,0,51,56,18,7,14,7,9.59
8,0,0,1,0,1,2,3,1,5,194,112,112,0,75,160,28,34,14,4,14.0
15,1,0,0,0,0,2,3,1,8,127,24,33,15,39,72,5,4,15,9,5.24
19,0,0,1,0,0,2,4,0,3,137,21,40,17,95,186,32,35,23,28,11.7


In [42]:
# Standardize features to zero mean and unit standard deviation for algorithms that require standardization.

df_list = [x_train, x_test, x_valid]

In [32]:
# Get mean of every column in the predictor feature tables.

In [40]:
for i in df_list:
    print(i.mean(axis=0))

reg1      0.204819
reg2      0.336094
reg3      0.123494
reg4      0.134789
home      0.883283
chld      1.577058
hinc      3.946285
genf      0.604920
wrat      7.053213
avhv    185.184237
incm     44.288404
inca     57.135542
plow     13.732681
npro     61.629267
tgif    116.744729
lgif     23.193022
rgif     15.546687
tdon     18.814759
tlag      6.301958
agif     11.659239
dtype: float64
reg1      0.197309
reg2      0.235177
reg3      0.170902
reg4      0.160438
home      0.812656
chld      2.115595
hinc      3.817638
genf      0.597907
wrat      6.587942
avhv    178.879920
incm     42.054808
inca     55.347783
plow     15.274539
npro     56.074240
tgif    104.905830
lgif     22.818137
rgif     15.685102
tdon     19.086198
tlag      6.496761
agif     11.689616
dtype: float64
reg1      0.194747
reg2      0.368682
reg3      0.116947
reg4      0.127849
home      0.887017
chld      1.597621
hinc      3.924678
genf      0.613479
wrat      6.963826
avhv    181.380575
incm     43.278494
i

In [43]:
# Get standard deviation for every column in feature sets.

for i in df_list:
    print(i.std(axis=0))

reg1     0.403620
reg2     0.472431
reg3     0.329045
reg4     0.341541
home     0.321123
chld     1.410511
hinc     1.397112
genf     0.488929
wrat     2.320034
avhv    74.700078
incm    25.175555
inca    25.276045
plow    13.093882
npro    30.339701
tgif    85.459111
lgif    31.278131
rgif    12.077199
tdon     5.582317
tlag     3.598049
agif     6.411263
dtype: float64
reg1     0.398067
reg2     0.424215
reg3     0.376517
reg4     0.367104
home     0.390285
chld     1.289806
hinc     1.644238
genf     0.490443
wrat     2.673886
avhv    72.456313
incm    24.784213
inca    24.744207
plow    14.242448
npro    30.127131
tgif    81.692848
lgif    31.547039
rgif    12.468045
tdon     6.305613
tlag     3.881837
agif     6.683308
dtype: float64
reg1     0.396104
reg2     0.482567
reg3     0.321437
reg4     0.334005
home     0.316651
chld     1.416537
hinc     1.411503
genf     0.487073
wrat     2.351843
avhv    68.750639
incm    23.614629
inca    23.927271
plow    13.123934
npro    30.23699

In [45]:
scaler = preprocessing.StandardScaler()

In [47]:
x_train_scl = scaler.fit_transform(x_train[x_train.columns]) # Need to send to dataframe

array([[-0.50751922, -0.71150417,  2.6641248 , ..., -0.32513145,
        -0.08393319,  1.4649126 ],
       [-0.50751922, -0.71150417,  2.6641248 , ...,  0.03318769,
        -0.91782306,  0.2497108 ],
       [-0.50751922,  1.40547314, -0.37535779, ..., -1.04176975,
        -0.63985977, -1.17608439],
       ..., 
       [ 1.97036873, -0.71150417, -0.37535779, ...,  0.92898556,
        -0.08393319,  3.52092797],
       [ 1.97036873, -0.71150417, -0.37535779, ...,  1.46646428,
         0.1940301 , -0.42730792],
       [-0.50751922,  1.40547314, -0.37535779, ...,  0.74982599,
        -0.63985977, -1.11680625]])