In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import iqr
from sklearn import preprocessing as pp
from sklearn.linear_model import SGDClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix

%matplotlib inline

In [2]:
charity = pd.read_csv('charity.csv')

In [3]:
# Dataset pre-prepared with designations for training/validation/testing split.
# Normal response rate is around 10%, and training/validation sets have oversampled donors to address class imbalance.

charity_train = charity.loc[charity['part'] == 'train']
charity_train = charity_train.drop(columns = ['ID', 'part'])
charity_train.shape

(3984, 22)

In [4]:
charity_train_num = charity_train[['avhv', 'incm', 'inca', 'tgif', 'lgif', 'rgif', 'agif']]
charity_train_num.shape

(3984, 7)

In [5]:
# Find unusual values with IQR method
# Code reference: https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba

Q1 = charity_train_num.quantile(0.25)
Q3 = charity_train_num.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
print(Q3)
print(Q1)

avhv    85.0
incm    28.0
inca    28.0
tgif    78.0
lgif    15.0
rgif    13.0
agif     7.8
dtype: float64
avhv    219.00
incm     55.00
inca     68.00
tgif    143.00
lgif     25.00
rgif     20.00
agif     14.79
Name: 0.75, dtype: float64
avhv    134.00
incm     27.00
inca     40.00
tgif     65.00
lgif     10.00
rgif      7.00
agif      6.99
Name: 0.25, dtype: float64


In [6]:
# Remove outliers from numeric dataframe
# Avoid hard-coding relevant values
ct_orm = charity_train_num[~((charity_train_num < (Q1 - 1.5 * IQR)) | (charity_train_num > (Q3 + 1.5 * IQR))).any(axis=1)]
ct_orm.shape

(3171, 7)

In [7]:
# Filter training dataframe by numeric dataframe index to drop outliers

ct_dropout = charity_train[charity_train.index.isin(ct_orm.index)]
ct_dropout.shape
c_train = ct_dropout.pop('donr').values

In [8]:
# Paper uses log transform to normalize data; sklearn has Box-Cox and Yeo-Johnson transforms.

column_trans = ColumnTransformer(
    [('incm_bc', pp.PowerTransformer(method='box-cox', standardize=False), ['incm']),
     ('inca_bc', pp.PowerTransformer(method='box-cox', standardize=False), ['inca']),
    ('tgif_bc', pp.PowerTransformer(method='box-cox', standardize=False), ['tgif']),
    ('lgif_bc', pp.PowerTransformer(method='box-cox', standardize=False), ['lgif']),
    ('rgif_bc', pp.PowerTransformer(method='box-cox', standardize=False), ['rgif']),
    ('agif_bc', pp.PowerTransformer(method='box-cox', standardize=False), ['agif'])],
    remainder='passthrough')

charity_train_pt = ct_dropout.copy()

man_trns = column_trans.fit_transform(charity_train_pt)
man_trns

array([[  4.33722069,   5.47521015,   3.9741291 , ...,  13.        ,
          4.        ,  12.        ],
       [  6.34412396,   8.03848233,   3.47930707, ...,  19.        ,
          3.        ,   0.        ],
       [  8.1556214 ,   8.65446248,   4.13903945, ...,  23.        ,
          7.        ,  17.        ],
       ..., 
       [  5.52817331,   7.33498763,   3.67752228, ...,  14.        ,
          7.        ,   0.        ],
       [  7.37420468,   8.65446248,   4.1863403 , ...,  27.        ,
          7.        ,   0.        ],
       [  8.52320387,   9.57397731,   3.47930707, ...,  23.        ,
          4.        ,  12.        ]])

In [9]:
# PolynomialFeatures to generate polynomial features and interaction terms

poly = pp.PolynomialFeatures(2) # Default value accepted to avoid overfitting
poly_trns = poly.fit_transform(man_trns)
poly_trns


array([[   1.        ,    4.33722069,    5.47521015, ...,   16.        ,
          48.        ,  144.        ],
       [   1.        ,    6.34412396,    8.03848233, ...,    9.        ,
           0.        ,    0.        ],
       [   1.        ,    8.1556214 ,    8.65446248, ...,   49.        ,
         119.        ,  289.        ],
       ..., 
       [   1.        ,    5.52817331,    7.33498763, ...,   49.        ,
           0.        ,    0.        ],
       [   1.        ,    7.37420468,    8.65446248, ...,   49.        ,
           0.        ,    0.        ],
       [   1.        ,    8.52320387,    9.57397731, ...,   16.        ,
          48.        ,  144.        ]])

In [10]:
scaler = pp.StandardScaler().fit(poly_trns)

In [11]:
scaled_train = scaler.transform(poly_trns)
scaled_train

array([[ 0.        , -1.49901066, -1.83318904, ..., -0.46397672,
         0.15741423,  0.41199722],
       [ 0.        ,  0.04529147,  0.39321866, ..., -0.5540138 ,
        -0.76287405, -0.90775346],
       [ 0.        ,  1.43922978,  0.92824689, ..., -0.03951621,
         1.51867398,  1.74091284],
       ..., 
       [ 0.        , -0.58257851, -0.21782297, ..., -0.03951621,
        -0.76287405, -0.90775346],
       [ 0.        ,  0.83793348,  0.92824689, ..., -0.03951621,
        -0.76287405, -0.90775346],
       [ 0.        ,  1.72208268,  1.72691935, ..., -0.46397672,
         0.15741423,  0.41199722]])

In [15]:
clf = SGDClassifier(max_iter=1000, tol=1e-3)
clf.fit(scaled_train, c_train)

In [24]:
en_preds = clf.predict(scaled_train)

In [25]:
confusion_matrix(c_train, en_preds)

array([[1643,    0],
       [   4, 1524]])

In [16]:
charity_valid = charity.loc[charity['part'] == 'valid']

In [17]:
x_valid = charity_valid.iloc[:, 1:21]

In [18]:
c_valid = charity_valid.iloc[:, 21]

In [19]:
y_valid = charity_valid[(charity_valid.donr == 1)][['damt']]

In [20]:
y_valid_len = len(y_valid)
y_valid_len

999

In [None]:
charity_test = charity.loc[charity['part'] == 'test']

In [None]:
x_test = charity_test.iloc[:, 1:21]

In [None]:
# Standardize features to zero mean and unit standard deviation for algorithms that require standardization.

df_list = [x_train, x_test, x_valid]

In [None]:
x_train_std = scaler.fit_transform(x_train[x_train.columns]) # Need to send to dataframe

In [None]:
x_valid_std = scaler.fit_transform(x_valid[x_valid.columns])

In [None]:
x_valid_std = scaler.fit_transform(x_test[x_test.columns])