In [None]:
# TO-DO: Try PolynomialFeatures plus ridge regression/the lasso for feature selection

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import iqr
from sklearn import preprocessing as pp
from sklearn.linear_model import LogisticRegressionCV
from sklearn.compose import ColumnTransformer

%matplotlib inline

In [None]:
# Load the full dataset; the relative path means charity.csv must sit in the working directory.
charity = pd.read_csv(filepath_or_buffer='charity.csv')

In [None]:
def triola_id(data, whisker=1.5):
    
    '''
    Compute the bounds beyond which values in a feature column are unusual, per Mario Triola's
    Elementary Statistics. Data may be non-normally distributed.

    A value greater than Q3 + (whisker * IQR) is considered unusually high.
    A value less than Q1 - (whisker * IQR) is considered unusually low.
    This function only returns the two bounds; truncating a feature at them is left to the caller.
    Function follows code at https://machinelearningmastery.com/how-to-use-statistics-to-identify-outliers-in-data/.

    Parameters:
        data: pandas Series (one feature column); only its .quantile method is used.
        whisker: multiplier applied to the IQR; defaults to the conventional 1.5.

    Returns:
        (min_us, max_us): lowest and highest values still considered usual.
    
    TO-DO: Try a version following 
    https://stackoverflow.com/questions/22354094/pythonic-way-of-detecting-outliers-in-one-dimensional-observation-data/22357811#22357811
    '''
    
    q1_val, q3_val = data.quantile(.25), data.quantile(.75)
    # Derive the IQR from the same quartiles: Series.quantile skips missing values, whereas
    # scipy.stats.iqr propagates NaN by default and would turn both bounds into NaN.
    iqr_cut = (q3_val - q1_val) * whisker
    min_us, max_us = q1_val - iqr_cut, q3_val + iqr_cut
    return min_us, max_us

# Scratch check: usual-value bounds for three features.
test = charity[['plow', 'incm', 'avhv']].copy()
test2 = charity.copy()
# triola_id is the helper defined in this cell (no triola_trunc exists); label the two returned
# bounds so the result is a small frame with one column per feature.
test.apply(lambda col: pd.Series(triola_id(col), index=['min_us', 'max_us']), axis=0)

In [None]:
# Pause EDA until environmental issue with pandas_profiling can be worked out.
# Work from known facts about the data from previous project version; dataset is the same file from 2016.

In [None]:
# Dataset pre-prepared with designations for training/validation/testing split.
# Normal response rate is around 10%, and training/validation sets have oversampled donors to address class imbalance.

# Training partition without the identifier and the partition label.
charity_train = (
    charity
    .loc[charity['part'] == 'train']
    .drop(columns=['ID', 'part'])
)
# pop removes 'donr' from charity_train in place and hands back the column as the label array.
c_train = charity_train.pop('donr').values

In [None]:
# Create pipeline manually specifying variables and transformations specified in paper.
# Found a copy-paste error in original code with variables run as "model 3" over validation set in paper;
# this set produced the lowest AIC on the *training* data in original project after correlation checks,
# VIF checks, and fwd/bwd/stepwise selection.

# Features specified with manual variable selection:
#     -reg1 and reg2: Geographic regions in which potential donors live
#     -hm_ch_int: Term indicating an interaction between home ownership and number of children
#     -incm_tgif_int: Term indicating an interaction between median family income in the neighborhood
#                     and total gifts given by the donor over his/her lifetime
#     -hinc_sq: Square transform of household income categorization--in retrospect, this transform doesn't make
#               sense to me. The square transform is used to reduce left skew 
#               (http://fmwww.bc.edu/repec/bocode/t/transint.html).
#               This is really a categorical variable representing buckets of income values, not continuous.
#               Removing for the purposes of this exercise. Might have been a given in project assignment.
#     -wrat: Index of relative wealth within each state based on median family income and population stats. 
#            EDA section of paper indicates that more than half of all potential donors are in the top two 
#            categories, something that says to me now that it might be interesting to explore two buckets instead.
#     -tdon: Time since last donation
#     -tlag: Number of months between first and second gifts to the charity

# Work on a copy so charity_train itself is not modified by the added interaction columns.
man_train = charity_train.copy()
man_train['hm_ch_int'] = man_train['home'] * man_train['chld']
man_train['incm_tgif_int'] = man_train['incm'] * man_train['tgif']
# hinc_sq is deliberately not built: it is excluded from the selection below (see the notes above).
# A list of labels (double brackets) selects several columns; a bare comma-separated key is a
# single tuple label and raises KeyError.
man_train = man_train[['reg1', 'reg2', 'hm_ch_int', 'incm_tgif_int', 'wrat', 'tdon', 'tlag']]
man_train.head()

In [None]:
# TESTING: ColumnTransformer with logistic regression features identified in my course paper
# Paper uses log transform to normalize data; sklearn has Box-Cox and Yeo-Johnson transforms.

# Box-Cox transform (without standardization) for 'incm' and 'tgif'; all other columns pass through.
# NOTE(review): man_train as built in the previous cell keeps only reg1, reg2, hm_ch_int,
# incm_tgif_int, wrat, tdon and tlag, so 'incm' and 'tgif' do not appear to be columns of it --
# confirm which frame should be transformed here.
column_trans = ColumnTransformer(
    transformers=[
        ('incm_bc', pp.PowerTransformer(method='box-cox', standardize=False), ['incm']),
        ('tgif_bc', pp.PowerTransformer(method='box-cox', standardize=False), ['tgif']),
    ],
    remainder='passthrough',
)

man_trns = column_trans.fit_transform(man_train)
man_trns

In [None]:
# Select predictive features, dropping ID value and target.
# iloc is zero-based and end-exclusive, so 1:21 picks positions 1 through 20.
# Slice the untouched training rows of `charity`: charity_train has already had ID, part and
# donr removed, so its positions no longer match the ones used for the validation/test slices.

x_train = charity.loc[charity['part'] == 'train'].iloc[:, 1:21]

In [None]:
# Create a label vector to hold donr values.
# Selected by name from `charity`: 'donr' was popped out of charity_train earlier, so a
# positional lookup on charity_train no longer points at it.

c_train = charity.loc[charity['part'] == 'train', 'donr']

In [None]:
# Number of training labels.
c_train_len = c_train.shape[0]

In [None]:
# Create response variable showing donation amounts for known donors.
# Filter on `charity` directly: charity_train no longer carries a 'donr' column (it was popped
# when the labels were first extracted), so charity_train.donr would raise AttributeError.

y_train = charity.loc[(charity['part'] == 'train') & (charity['donr'] == 1), ['damt']]

In [None]:
# Number of known donors in the training partition.
y_train_len = y_train.shape[0]

In [None]:
# Validation partition (all original columns kept).
charity_valid = charity[charity['part'] == 'valid']

In [None]:
# Validation features: column positions 1 through 20 (zero-based, end-exclusive slice).
x_valid = charity_valid.iloc[:, slice(1, 21)]

In [None]:
# Validation labels: column position 21 (presumably donr -- confirm against the file layout).
label_position = 21
c_valid = charity_valid.iloc[:, label_position]

In [None]:
# Donation amounts for validation rows flagged as donors, kept as a one-column frame.
y_valid = charity_valid.loc[charity_valid['donr'] == 1, ['damt']]

In [None]:
# Number of known donors in the validation partition; shown as the cell output.
y_valid_len = y_valid.shape[0]
y_valid_len

In [None]:
# Test partition (all original columns kept).
charity_test = charity[charity['part'] == 'test']

In [None]:
# Test features: column positions 1 through 20 (zero-based, end-exclusive slice).
x_test = charity_test.iloc[:, slice(1, 21)]

In [None]:
# Standardize features to zero mean and unit standard deviation for algorithms that require standardization.

# The three feature frames, in train / test / valid order.
df_list = list((x_train, x_test, x_valid))

In [None]:
# sklearn.preprocessing is imported under the alias `pp`; the bare name `preprocessing` is not bound.
scaler = pp.StandardScaler()

In [None]:
# Fit the scaler on the training features and scale them in one step.
# TO-DO: result is a NumPy array; still needs to be sent back to a dataframe.
x_train_std = scaler.fit_transform(x_train.loc[:, x_train.columns])

In [None]:
# Scale validation features with the already-fitted scaler. Refitting here would compute new
# means/variances from the validation data, leaking it and putting the splits on different scales.
x_valid_std = scaler.transform(x_valid[x_valid.columns])

In [None]:
# Keep the scaled test features in their own variable so x_valid_std is not overwritten, and
# reuse the already-fitted scaler instead of refitting it on test data.
x_test_std = scaler.transform(x_test[x_test.columns])