In [196]:
import os
import re
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from collections import Counter
from collections import defaultdict
from collections import deque


In [197]:
def marital_status_to_num(val):
    if val == 'M':
        return 0
    elif val == 'S':
        return 1
    elif val == 'D':
        return 2
    elif val == 'W':
        return 3
    else:
        return np.nan

def marital_status_to_str(val):
    if val == 0:
        return 'married'
    elif val == 1:
        return 'single'
    elif val == 2:
        return 'divorced'
    elif val == 3:
        return 'widowed'
    else:
        return np.nan

data_dict = {
    'data': {
        'rename': {
            'ID Number': 'id',
            'Lifetime HC': 'cum_donation',
            'Email Present': 'has_email',
            'BusPhone Present': 'has_business_phone',
            'Grad Year': 'grad_year',
            'Marital Status': 'marital_status',
            'SpouseID Present': 'has_spousal_record',
            'JobTitle Present': 'has_job_title',
            'VarsityAth Present': 'has_activity_athlete',
            'StudGovt Present': 'has_activity_government',
            'OtherStudActs Present': 'has_activity_other',
            'Greek Present': 'has_activity_greek',
            'Prefix is Mr.': 'is_mr',
            'Prefix is Ms.': 'is_ms',
            'Prefix is Dr.': 'is_dr',
            'Prefix is Mrs.': 'is_mrs'
        },
        'dtype': {
            'ID Number': str,
            'Lifetime HC': float,
            'Grad Year': int,
            'Marital Status': str
        }
    },
    'dummy': {
        'list': [
            'cum_range',
            'grad_decade',
            'imp_marital_status',
        ],
        'prefix': {
            'cum_range': 'bin_cum_range',
            'grad_decade': 'bin_grad_decade',
            'imp_marital_status': 'bin_imp_marital_status'
        },
        'giving': {
            'bins': [
                0.00,
                1.00,
                1000.00,
                10000.00,
                25000.00,
                50000.00,
                100000.00,
                250000.00,
                500000.00,
                1000000.00,
                2500000.00,
                5000000.00,
                10000000.00,
                15000000.00
            ],
            'labels': [
                '$0',
                '$1-$999.99',
                '$1K-$9.99K',
                '$10K-$24.99K',
                '$25K-$49.99K',
                '$50K-$99.99K',
                '$100K-$249.99K',
                '$250K-$499.99K',
                '$500K-$999.99K',
                '$1M-$2.49M',
                '$2.5M-$4.99M',
                '$5M-$9.99M',
                '$10M-$14.99M'
            ]
        },
        'grad': {
            'bins': [
                1900,
                1910,
                1920,
                1930,
                1940,
                1950,
                1960,
                1970,
                1980,
                1990,
                2000,
                2010,
                2020
            ],
            'labels': [
                '1900s',
                '1910s',
                '1920s',
                '1930s',
                '1940s',
                '1950s',
                '1960s',
                '1970s',
                '1980s',
                '1990s',
                '2000s',
                '2010s'
            ]
        }
    },
    'cols': {
        'bools': [
            'has_email',
            'has_business_phone',
            'has_spousal_record',
            'has_job_title',
            'has_activity_athlete',
            'has_activity_government',
            'has_activity_other',
            'has_activity_greek',
            'is_mr',
            'is_ms',
            'is_dr',
            'is_mrs',
            'has_donated'
        ],
        'bin_grad': [
            'bin_grad_decade_1900s',
            'bin_grad_decade_1910s',
            'bin_grad_decade_1920s',
            'bin_grad_decade_1930s',
            'bin_grad_decade_1940s',
            'bin_grad_decade_1950s',
            'bin_grad_decade_1960s',
            'bin_grad_decade_1970s',
            'bin_grad_decade_1980s',
            'bin_grad_decade_1990s',
            'bin_grad_decade_2000s',
            'bin_grad_decade_2010s'
        ],
        'bin_giving': [
            'bin_cum_range_$0',
            'bin_cum_range_$1-$999.99',
            'bin_cum_range_$1K-$9.99K',
            'bin_cum_range_$10K-$24.99K',
            'bin_cum_range_$25K-$49.99K',
            'bin_cum_range_$50K-$99.99K',
            'bin_cum_range_$100K-$249.99K',
            'bin_cum_range_$250K-$499.99K',
            'bin_cum_range_$500K-$999.99K',
            'bin_cum_range_$1M-$2.49M',
            'bin_cum_range_$2.5M-$4.99M',
            'bin_cum_range_$5M-$9.99M',
            'bin_cum_range_$10M-$14.99M'
        ],
        'bin_marital': [
            'bin_imp_marital_status_divorced',
            'bin_imp_marital_status_married',
            'bin_imp_marital_status_single',
            'bin_imp_marital_status_widowed'
        ],
        'log': [
            'has_donated'
        ],
        'lin': [
            'cum_donation'
        ]
    }
}

In [198]:
project_dir = os.path.join(os.path.abspath(''),os.pardir)
data_dir = os.path.join(project_dir,'data')
data_raw_dir = os.path.join(data_dir,'raw')

file_list = []

for file in os.scandir(data_raw_dir):
    file_list.append(os.path.join(data_raw_dir,file.name))

df = pd.read_excel(io=file_list[0],
                   sheet_name='Sheet1',
                   dtype=data_dict['data']['dtype'])
df = df.rename(columns=data_dict['data']['rename'])

  warn(msg)


In [199]:
# drop irrelevant columns
df = df.drop(columns='id')

In [200]:
# create a logistic column version of cum_donation_value called has_donated as an additional target variable
df['has_donated'] = np.where(df['cum_donation']>0.0,True,False)

y_log = df[['has_donated']]
y_lin = df[['cum_donation']]

In [201]:
# imputing
df['marital_status'] = df['marital_status'].apply(marital_status_to_num)
df.loc[df['marital_status'].isnull(),'m_marital_status'] = 1
df.loc[df['marital_status'].notnull(),'m_marital_status'] = 0

df_marital_status = df[['m_marital_status']]
df = df.drop(columns='m_marital_status')

imputer = KNNImputer(n_neighbors=5)
df = pd.DataFrame(imputer.fit_transform(df),columns=df.columns)
df = df.rename(columns={'marital_status': 'imp_marital_status'})
df = pd.concat((df,df_marital_status),axis=1)

df['imp_marital_status'] = df['imp_marital_status'].round().astype(int)
df['imp_marital_status'] = df['imp_marital_status'].apply(marital_status_to_str)
df[data_dict['cols']['bools']] = df[data_dict['cols']['bools']].astype(int)

In [202]:
# binning
df.sort_values('cum_donation',inplace=True)
df['cum_range'] = \
    pd.cut(x=df['cum_donation'],
           bins=data_dict['dummy']['giving']['bins'],
           labels=data_dict['dummy']['giving']['labels'],
           right=False)
df.sort_values('grad_year',inplace=True)
df['grad_decade'] = pd.cut(x=df['grad_year'],
                           bins=data_dict['dummy']['grad']['bins'],
                           labels=data_dict['dummy']['grad']['labels'],
                           right=False)
df.sort_index(inplace=True)

In [203]:
# dummying
dummy_filter = df[data_dict['dummy']['list']]
dummy_df = pd.get_dummies(data=dummy_filter,
                          prefix=data_dict['dummy']['prefix'],
                          columns=data_dict['dummy']['list'])
df = pd.concat(([df,dummy_df]),axis=1)

# all possible lin/log features
X = df.drop(columns=['imp_marital_status','cum_range','grad_decade'])

Xy_dict = {
    'log': {
        'X': X,
        'y': y_log
    },
    'lin': {
        'X': sm.add_constant(X),
        'y': y_lin
    }
}

In [204]:
# Different models will be denoted through an encoded sting of
# bits from right to left. As more variety is introduced this
# bitwise representation will grow in length.

# 0th bit: is linear regression

dict_3bit = defaultdict()

for i in [1,0]:
    model_bit = deque()
    if i == 1:
        model_bit.appendleft('1')
        feature_list_lv0 = list((Counter(Xy_dict['lin']['X'].columns.tolist())
                                 -Counter(data_dict['cols']['lin']))
                                .elements())
    else:
        model_bit.appendleft('0')
        feature_list_lv0 = list((Counter(Xy_dict['log']['X'].columns.tolist())
                                 -Counter(data_dict['cols']['log']))
                                .elements())

    # 1st bit: is grad_year binned
    for j in [1,0]:
        if j == 1:
            model_bit.appendleft('1')
            feature_list_lv1 = list((Counter(feature_list_lv0)
                                     -Counter(['grad_year']))
                                    .elements())
        else:
            model_bit.appendleft('0')
            feature_list_lv1 = list((Counter(feature_list_lv0)
                                     -Counter(data_dict['cols']['bin_grad']))
                                    .elements())

        # 2nd bit: is cum_donation binned
        for k in [1,0]:
            feature_list_lv2 = feature_list_lv1
            if k == 1:
                model_bit.appendleft('1')
                feature_list_lv2 = list((Counter(feature_list_lv1)
                                         -Counter(['cum_donation']))
                                        .elements())
            else:
                model_bit.appendleft('0')
                feature_list_lv2 = list((Counter(feature_list_lv1)
                     -Counter(data_dict['cols']['bin_giving']))
                    .elements())
            bit_str = ''.join(model_bit)

            # filter linear regressions where linear target would be
            # present
            if not re.match('1\d1',bit_str):
                model_dict = {
                    bit_str: {
                        'features': feature_list_lv2
                    }
                }
                dict_3bit.update(model_dict)
            model_bit.popleft()
        model_bit.popleft()
    model_bit.popleft()

In [205]:
# 3rd bit: chi-square feature selection (linear is always 0)
test = SelectKBest(score_func=chi2,k='all')
np.set_printoptions(precision=3)

dict_4bit = defaultdict()

for k_model, v_data in dict_3bit.items():
    model_bit = deque(k_model)
    features = v_data.get('features')
    for i in [1,0]:
        # only calculate chi for logistic regressions
        if (i == 1) and (model_bit[-1] == '0'):
            model_bit.appendleft('1')
            chi_scores = test.fit(Xy_dict['log']['X'][features],y_log)
            chi_list = list(zip(list(features),chi_scores.scores_))
            chi_df = pd.DataFrame(chi_list,
                                         columns=['predictor','chi2'])
            chi_df = chi_df[chi_df['chi2']>3.8]
            sig_feat = list(chi_df['predictor'])
            out_data = {
                'features': sig_feat,
                'chi_scores': chi_df
            }
        else:
            model_bit.appendleft('0')
            out_data = {
                'features': features
            }
        bit_str = ''.join(model_bit)
        dict_4bit.update({bit_str: out_data})
        model_bit.popleft()

# for k,v in dict_4bit.items():
#     print(k)
#     for ki,vi in v.items():
#         print('\t',ki)

In [206]:
# 4th-5th bit: Scaling
# 00 - unscaled
# 01 - MinMaxScaler
# 10 - StandardScaler
# 11 - RobustScaler

dict_6bit = defaultdict()

for k_model, v_data in dict_4bit.items():
    for scale in [('00',None),('01',MinMaxScaler()),('10',StandardScaler()),
                  ('11',RobustScaler())]:
        model_bit = deque(k_model)
        leaves = v_data

        if model_bit[-1] == '0':
            reg = 'log'
        else:
            reg = 'lin'

        leaves.update({'y_scaled': Xy_dict[reg]['y'],
                       'X_scaled': Xy_dict[reg]['X']})

        if scale[0] == '00':
            model_bit.appendleft(scale[0])

        else:
            if reg == 'lin':
                y_scaled = scale[1].fit_transform(np.array(Xy_dict[reg]['y'])
                                                  .reshape(-1,1))
                leaves.update({'y_scaled':y_scaled})

            model_bit.appendleft(scale[0])
            X_scaled = scale[1].fit_transform(
                Xy_dict[reg]['X'][leaves['features']])
            leaves.update({'X_scaled': X_scaled})
        bit_str = ''.join(model_bit)
        dict_6bit.update({bit_str: leaves})

# for k,v in dict_6bit.items():
#     print(k)
#     for ki,vi in v.items():
#         print('\t',ki)
#         print('\t\t',vi)

000011
	 features
		 ['const', 'has_email', 'has_business_phone', 'has_spousal_record', 'has_job_title', 'has_activity_athlete', 'has_activity_government', 'has_activity_other', 'has_activity_greek', 'is_mr', 'is_ms', 'is_dr', 'is_mrs', 'has_donated', 'm_marital_status', 'bin_grad_decade_1900s', 'bin_grad_decade_1910s', 'bin_grad_decade_1920s', 'bin_grad_decade_1930s', 'bin_grad_decade_1940s', 'bin_grad_decade_1950s', 'bin_grad_decade_1960s', 'bin_grad_decade_1970s', 'bin_grad_decade_1980s', 'bin_grad_decade_1990s', 'bin_grad_decade_2000s', 'bin_grad_decade_2010s', 'bin_imp_marital_status_divorced', 'bin_imp_marital_status_married', 'bin_imp_marital_status_single', 'bin_imp_marital_status_widowed']
	 y_scaled
		 [[0.651]
 [0.   ]
 [0.   ]
 ...
 [0.122]
 [0.   ]
 [0.041]]
	 X_scaled
		 [[ 0.  0.  1. ...  0.  0.  0.]
 [ 0.  0.  1. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 ...
 [ 0. -1.  0. ...  0.  0.  0.]
 [ 0. -1.  0. ...  0.  0.  0.]
 [ 0. -1.  0. ... -1.  0.  1.]]
010011
	 fea

In [207]:
# 6th-7th bit: Automatic Feature Selection
# 00 - no automatic feature selection
# 01 - Random Forest Importances (rfi)
# 10 - Recursive Feature Elimination (rfe)
# 11 - Forward Feature Elimination (ffe)

def append_rfi(model_obj):
    pass

def append_rfe(model_obj):
    pass

def append_ffe(model_obj):
    pass

dict_8bit = defaultdict()

for k_model, v_data in dict_6bit.items():
    for auto in [('00', None),('01',append_rfi),('10',append_rfe),
                 ('11',append_ffe)]:
        model_bit = deque(k_model)
        data = v_data
        model_bit.appendleft(auto[0])
        
        if auto[0] != '00':
            data = auto[1](data)
        bit_str = ''.join(model_bit)
        dict_8bit.update({bit_str: data})

In [208]:
# 9th bit: Modeling
# 1 - Stacking
# 0 - Cross-fold validation

# Assignment 2

## Introduction

## Data Exploration

## Data-Preparation

### Identifying Significant Features

### Imputing and Variable Creation

## Data Modeling

### Scaling

## Model Evaluation

## Conclusion