In [551]:
# import libraries

import os
import re
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from collections import Counter
from collections import defaultdict
from collections import deque


In [552]:
# Set some options
# Print numpy floats with three digits after the decimal point.
np.set_printoptions(precision=3)
# Silence numpy's divide-by-zero and invalid-operation floating-point warnings.
# seterr returns the previous settings, which is what the cell displays.
np.seterr(**{'divide': 'ignore', 'invalid': 'ignore'})

{'divide': 'ignore', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [577]:
# Scratch check of the negative slice [-8:-6] used to read two characters of a
# model key: on a nine-character string it picks the characters at index 1 and 2.
s = ''.join(str(digit) for digit in range(9))
print(s[-8:-6])

12


In [553]:
# define data and util functions

def marital_status_to_num(val):
    """Translate a one-letter marital status code into its integer category.

    'M' -> 0, 'S' -> 1, 'D' -> 2, 'W' -> 3. Any other value (the comparison is
    case-sensitive) returns ``np.nan``.
    """
    for code, letter in enumerate(('M', 'S', 'D', 'W')):
        if val == letter:
            return code
    return np.nan

def marital_status_to_str(val):
    """Translate an integer marital status category into its readable label.

    0 -> 'married', 1 -> 'single', 2 -> 'divorced', 3 -> 'widowed'. Any other
    value returns ``np.nan``.
    """
    labels = ('married', 'single', 'divorced', 'widowed')
    for code, label in enumerate(labels):
        if val == code:
            return label
    return np.nan

# Central lookup of column names, dtypes, bin edges and dummy-column names used
# by the loading, binning, dummying and feature-building cells.

# Bin labels; every label names the interval between two consecutive edges of
# the matching 'bins' list (so there is one label fewer than there are edges).
_giving_labels = [
    '$0', '$1-$999.99', '$1K-$9.99K', '$10K-$24.99K', '$25K-$49.99K',
    '$50K-$99.99K', '$100K-$249.99K', '$250K-$499.99K', '$500K-$999.99K',
    '$1M-$2.49M', '$2.5M-$4.99M', '$5M-$9.99M', '$10M-$14.99M',
]
_grad_labels = [str(decade) + 's' for decade in range(1900, 2020, 10)]

# Categorical columns that are one-hot encoded.
_dummy_columns = ['cum_range', 'grad_decade', 'imp_marital_status']

data_dict = {
    'data': {
        # spreadsheet header -> column name used throughout the notebook
        'rename': {
            'ID Number': 'id',
            'Lifetime HC': 'cum_donation',
            'Email Present': 'has_email',
            'BusPhone Present': 'has_business_phone',
            'Grad Year': 'grad_year',
            'Marital Status': 'marital_status',
            'SpouseID Present': 'has_spousal_record',
            'JobTitle Present': 'has_job_title',
            'VarsityAth Present': 'has_activity_athlete',
            'StudGovt Present': 'has_activity_government',
            'OtherStudActs Present': 'has_activity_other',
            'Greek Present': 'has_activity_greek',
            'Prefix is Mr.': 'is_mr',
            'Prefix is Ms.': 'is_ms',
            'Prefix is Dr.': 'is_dr',
            'Prefix is Mrs.': 'is_mrs',
        },
        # dtypes requested for the spreadsheet headers when the file is read
        'dtype': {
            'ID Number': str,
            'Lifetime HC': float,
            'Grad Year': int,
            'Marital Status': str,
        },
    },
    'dummy': {
        'list': list(_dummy_columns),
        # each dummy column family is prefixed with 'bin_' + source column
        'prefix': {column: 'bin_' + column for column in _dummy_columns},
        'giving': {
            'bins': [
                0.0, 1.0, 1000.0, 10000.0, 25000.0, 50000.0, 100000.0,
                250000.0, 500000.0, 1000000.0, 2500000.0, 5000000.0,
                10000000.0, 15000000.0,
            ],
            'labels': list(_giving_labels),
        },
        'grad': {
            # decade edges 1900, 1910, ..., 2020
            'bins': list(range(1900, 2021, 10)),
            'labels': list(_grad_labels),
        },
    },
    'cols': {
        # indicator columns that are cast to int after imputation
        'bools': [
            'has_email',
            'has_business_phone',
            'has_spousal_record',
            'has_job_title',
            'has_activity_athlete',
            'has_activity_government',
            'has_activity_other',
            'has_activity_greek',
            'is_mr',
            'is_ms',
            'is_dr',
            'is_mrs',
            'has_donated',
        ],
        # dummy column names: prefix + '_' + bin label
        'bin_grad': ['bin_grad_decade_' + label for label in _grad_labels],
        'bin_giving': ['bin_cum_range_' + label for label in _giving_labels],
        'bin_marital': [
            'bin_imp_marital_status_' + status
            for status in ('divorced', 'married', 'single', 'widowed')
        ],
        # target column of the logistic / linear models
        'log': ['has_donated'],
        'lin': ['cum_donation'],
    },
}

In [554]:
# Raw data lives in <working directory>/../data/raw.
project_dir = os.path.join(os.path.abspath(''),os.pardir)
data_dir = os.path.join(project_dir,'data')
data_raw_dir = os.path.join(data_dir,'raw')

# os.scandir yields entries in arbitrary order, so the paths are sorted to make
# file_list[0] the same file on every run. Directories, hidden files and Excel
# lock files ('~$...') are skipped so they can never be picked up as the data
# file. The with-block closes the directory iterator.
with os.scandir(data_raw_dir) as entries:
    file_list = sorted(
        entry.path for entry in entries
        if entry.is_file() and not entry.name.startswith(('.', '~$'))
    )

if not file_list:
    raise FileNotFoundError('no raw data file found in ' + data_raw_dir)

# Read the first workbook with the configured dtypes, then switch the
# spreadsheet headers to the notebook's column names.
df = pd.read_excel(io=file_list[0],
                   sheet_name='Sheet1',
                   dtype=data_dict['data']['dtype'])
df = df.rename(columns=data_dict['data']['rename'])

  warn(msg)


In [555]:
# The record identifier is not used as a feature, so remove it.
df = df.drop(['id'], axis='columns')

In [556]:
# Derive a binary target, has_donated (1 when cum_donation is positive, else 0),
# to sit next to the continuous cum_donation target.
df['has_donated'] = np.where(df['cum_donation'].gt(0.0), 1, 0)

# Single-column frames holding the two targets: y_lin for the linear models,
# y_log for the logistic ones.
y_lin = df[['cum_donation']]
y_log = df[['has_donated']]

In [557]:
# imputing
# Turn the marital status letters into numeric codes; anything unrecognised
# (including missing values) becomes NaN and is filled in by the imputer below.
df['marital_status'] = df['marital_status'].apply(marital_status_to_num)

# Indicator frame: 1.0 where marital status was missing before imputation,
# 0.0 otherwise. It is kept out of the imputer input and re-attached afterwards.
df_marital_status = (df['marital_status']
                     .isnull()
                     .astype(float)
                     .to_frame(name='m_marital_status'))

# Fill the gaps from the five nearest rows; the imputer returns a bare array,
# so the column names are restored and the imputed column is renamed.
imputer = KNNImputer(n_neighbors=5)
df = (pd.DataFrame(imputer.fit_transform(df),columns=df.columns)
      .rename(columns={'marital_status': 'imp_marital_status'}))
df = pd.concat((df,df_marital_status),axis=1)

# Imputed codes can be fractional: round to the nearest category and convert
# it to its text label. The indicator columns come back as floats and are cast
# to int again.
df['imp_marital_status'] = (df['imp_marital_status']
                            .round()
                            .astype(int)
                            .apply(marital_status_to_str))
df[data_dict['cols']['bools']] = df[data_dict['cols']['bools']].astype(int)

In [558]:
# binning
# pd.cut bins each value independently and the result is assigned back by
# index, so the frame does not have to be sorted by the binned column first.
# right=False makes every bin closed on the left and open on the right; values
# outside the outermost edges get NaN.
df['cum_range'] = pd.cut(x=df['cum_donation'],
                         bins=data_dict['dummy']['giving']['bins'],
                         labels=data_dict['dummy']['giving']['labels'],
                         right=False)
df['grad_decade'] = pd.cut(x=df['grad_year'],
                           bins=data_dict['dummy']['grad']['bins'],
                           labels=data_dict['dummy']['grad']['labels'],
                           right=False)
# Leave the rows in index order, as before.
df = df.sort_index()

In [559]:
# dummying
# One-hot encode the categorical columns, using the configured prefix for each
# family of dummy columns, and append the result to the frame.
dummy_filter = df.loc[:, data_dict['dummy']['list']]
dummy_df = pd.get_dummies(dummy_filter,
                          columns=data_dict['dummy']['list'],
                          prefix=data_dict['dummy']['prefix'])
df = pd.concat([df, dummy_df], axis='columns')

# Candidate feature matrix for both the linear and the logistic models: every
# column except the categorical sources that were just encoded.
X = df.drop(['imp_marital_status','cum_range','grad_decade'], axis='columns')

In [560]:
# Different models will be denoted through an encoded string of
# bits from right to left. As more variety is introduced this
# bitwise representation will grow in length.

# 0th bit [-1]:         0 - logistic regression
#                       1 - linear regression
# 1st bit [-2]:         0 - grad_year int
#                       1 - grad_year binned
# 2nd bit [-3]:         0 - cum_donation float
#                       1 - cum_donation binned
# 3rd-5th bit [-6:-3]:  000 - no automatic feature selection
#                       001 - chi square filtering (chi)
#                       010 - Random Forest Importance (rfi)
#                       011 - Recursive Feature Elimination Cross Validation (rfe)
#                       100 - Forward Feature Elimination (ffe)
# 6th-7th bit [-8:-6]:  00 - unscaled
#                       01 - MinMaxScaler
#                       10 - StandardScaler
#                       11 - RobustScaler
# 8th bit [-9]:         0 - Cross Fold Validation
#                       1 - Stacking

# 0th bit: is linear regression

# Build one entry per (target, grad_year encoding, cum_donation encoding)
# combination. Keys are three characters, read left to right as
# 2nd bit (k), 1st bit (j), 0th bit (i).
dict_3bit = dict()

# NOTE(review): only the model's own target column is removed from the
# features. has_donated therefore stays in X for the linear target, and
# cum_donation (k == 0) or its bins (k == 1) stay in X for the logistic target,
# even though has_donated is derived from cum_donation - confirm this is
# intended.

for i in [1,0]:
    if i == 1:
        y = y_lin
        # Add the intercept column to a separate frame. Reassigning X here
        # would leak the constant into the logistic pass that runs next and
        # make the cell depend on how often it has been executed.
        X_model = sm.add_constant(X)
        target_cols = data_dict['cols']['lin']
    else:
        y = y_log
        X_model = X
        target_cols = data_dict['cols']['log']
    # the target itself can never be a feature
    feature_list_lv0 = [col for col in X_model.columns
                        if col not in target_cols]

    # 1st bit: is grad_year binned
    for j in [1,0]:
        if j == 1:
            # binned: drop the raw year, keep the decade dummies
            unused_grad = ['grad_year']
        else:
            # raw: drop the decade dummies, keep the year
            unused_grad = data_dict['cols']['bin_grad']
        feature_list_lv1 = [col for col in feature_list_lv0
                            if col not in unused_grad]

        # 2nd bit: is cum_donation binned
        for k in [1,0]:
            if k == 1:
                unused_giving = ['cum_donation']
            else:
                unused_giving = data_dict['cols']['bin_giving']
            feature_list_lv2 = [col for col in feature_list_lv1
                                if col not in unused_giving]
            bit_str = '{}{}{}'.format(k,j,i)

            # filter linear regressions where the binned linear target would
            # be present among the features (keys '1?1')
            if k == 1 and i == 1:
                continue

            dict_3bit[bit_str] = {
                'y': y,
                'X': X_model[feature_list_lv2]
            }

In [561]:
# 3rd-5th bit: Automatic Feature Selection (afs)
# 000 - no automatic feature selection
# 001 - chi square filtering (chi)
# 010 - Random Forest Importance (rfi)
# 011 - Recursive Feature Elimination Cross Validation (rfe)
# 100 - Forward Feature Elimination (ffe)

def get_chi(bit_str:str, d, min_score=3.8):
    """Keep only the features whose chi-square score exceeds ``min_score``.

    Parameters
    ----------
    bit_str : model key; not used by this selector, accepted so that all
        selectors share the same call signature.
    d : dict with the label frame under 'y' and the feature frame under 'X'.
        chi2 requires non-negative feature values.
    min_score : chi-square statistic a feature must exceed to be kept.
        The default 3.8 presumably approximates the 0.05 critical value for
        one degree of freedom (3.84) - TODO confirm.

    Returns
    -------
    The same dict ``d``, updated in place: 'X' is reduced to the selected
    columns and 'afs' holds a frame with the columns 'feature' and 'chi' for
    the selected features.
    """
    label = d['y']
    features = d['X']

    feature_list = list(features.columns)
    # flatten the single-column label frame, as the other selectors do
    selector = SelectKBest(score_func=chi2,k='all')
    selector.fit(features,np.ravel(label))

    chi_df = pd.DataFrame(list(zip(feature_list,selector.scores_)),
                          columns=['feature','chi'])
    # features with an undefined (NaN) score fail the comparison and are dropped
    chi_df = chi_df[chi_df['chi']>min_score]

    filtered_features = chi_df['feature'].tolist()
    d.update({'X': features[filtered_features],
              'afs': chi_df})

    return d

def get_rfi(bit_str:str, d, n_estimators=1000, random_state=None):
    """Keep only the features with above-average random forest importance.

    Parameters
    ----------
    bit_str : model key; its last character picks the forest type
        ('0' -> classifier, anything else -> regressor).
    d : dict with the label frame under 'y' and the feature frame under 'X'.
    n_estimators : number of trees in the forest.
    random_state : seed forwarded to the forest. With the default ``None``
        the importances, and therefore the selection, can differ between
        runs; pass an int for a reproducible selection.

    Returns
    -------
    The same dict ``d``, updated in place: 'X' is reduced to the selected
    columns (ordered by decreasing importance) and 'afs' holds a frame with
    the columns 'feature' and 'rfi' for the selected features.
    """
    label = d['y']
    features = d['X']

    feature_list = list(features.columns)
    np_features = np.array(features)

    if bit_str[-1] == '0':
        rf = RandomForestClassifier(n_estimators=n_estimators,
                                    random_state=random_state)
    else:
        rf = RandomForestRegressor(n_estimators=n_estimators,
                                   random_state=random_state)

    rf.fit(np_features,label.values.ravel())
    rfi_df = pd.DataFrame(list(zip(feature_list,rf.feature_importances_)),
                          columns=['feature','rfi'])
    rfi_df = rfi_df.sort_values(by='rfi',ascending=False)

    # filter out features that do not exceed the average importance value
    importance_filter = (rfi_df['rfi'].sum())/(rfi_df['rfi'].count())
    rfi_df = rfi_df[rfi_df['rfi']>importance_filter]

    filtered_features = rfi_df['feature'].tolist()
    d.update({'X': features[filtered_features],
              'afs': rfi_df})

    return d

def get_rfecv(bit_str, d):
    """Keep only the features retained by cross-validated recursive elimination.

    The last character of ``bit_str`` picks the estimator that is eliminated
    against: '0' -> LogisticRegression(max_iter=500), anything else ->
    LinearRegression. ``d`` holds the label frame under 'y' and the feature
    frame under 'X'; it is updated in place ('X' reduced to the retained
    columns, 'afs' set to a frame with the columns 'feature' and 'rfe') and
    returned.
    """
    features = d['X']
    target = d['y'].values.ravel()

    if bit_str[-1] != '0':
        estimator = LinearRegression()
    else:
        estimator = LogisticRegression(max_iter=500)

    selector = RFECV(estimator=estimator)
    selector.fit(features,target)

    # support_ flags, per feature, whether the selector kept it
    kept_flags = list(selector.support_)
    rfecv_df = pd.DataFrame(list(zip(list(features.columns),kept_flags)),
                            columns=['feature','rfe'])

    # filter out features rfe considers insignificant
    rfecv_df = rfecv_df[rfecv_df['rfe'].eq(True)]

    d.update({'X': d['X'][rfecv_df['feature'].tolist()]})
    d.update({'afs': rfecv_df})
    return d

def get_ffe(bit_str:str, d):
    """Keep only the features whose F-statistic is above the average.

    Scores every feature against the label with ``f_regression`` and keeps the
    ones whose statistic exceeds the mean of the available statistics.
    ``bit_str`` is not used. ``d`` holds the label frame under 'y' and the
    feature frame under 'X'; it is updated in place ('X' reduced to the kept
    columns, 'afs' set to a frame with the columns 'feature' and 'ffe') and
    returned.
    """
    features = d['X']
    target = d['y'].values.ravel()

    # f_regression returns (F-statistics, p-values); only the former is used
    f_scores = f_regression(features,target)[0]
    ffe_df = pd.DataFrame([{'feature': name, 'ffe': score}
                           for name, score in zip(list(features.columns),
                                                  f_scores)])

    # mean over the non-missing statistics
    ffe_filter = ffe_df['ffe'].sum()/ffe_df['ffe'].count()
    ffe_df = ffe_df[ffe_df['ffe']>ffe_filter]

    d.update({'X': d['X'][ffe_df['feature'].tolist()]})
    d.update({'afs': ffe_df})
    return d

# Extend every 3-bit model with each automatic feature selection variant.
# Keys become six characters: three selection bits followed by the 3-bit key.
dict_6bit = dict()

for k_model, v_data in dict_3bit.items():
    for auto in [('000', None), ('001', get_chi), ('010', get_rfi),
                 ('011', get_rfecv), ('100', get_ffe)]:
        afs_bits, afs_func = auto

        # skips chi-squaring linear regressions: decided from the current
        # model key (last character '1' = linear) rather than from a key left
        # over from an earlier iteration. The unfiltered '000' entry already
        # covers these models.
        if (afs_func is get_chi) and (k_model[-1] == '1'):
            continue

        # shallow copy so the selectors' in-place update does not touch the
        # 3-bit entry
        data = v_data.copy()
        bit_str = afs_bits + k_model

        # '000' means no automatic feature selection: keep the data as is
        if afs_func is not None:
            data = afs_func(bit_str,data)
        dict_6bit[bit_str] = data

In [562]:
# for k,v in dict_6bit.items():
#     print(k)
#     for ki,vi in v.items():
#         print('\t',ki,'\t',np.shape(vi))

In [563]:
# 6th-7th bit: Scaling
# 00 - unscaled
# 01 - MinMaxScaler
# 10 - StandardScaler
# 11 - RobustScaler

# Extend every 6-bit model with each scaling variant. Keys become eight
# characters: two scaling bits followed by the 6-bit key.
dict_8bit = dict()

for k_model, v_data in dict_6bit.items():
    # Scaler classes, not instances: X and y each need their own fitted
    # object. With a shared instance the second fit overwrites the first, so
    # the stored y scaler would actually describe X.
    for scale in [('00',None),('01',MinMaxScaler),('10',StandardScaler),
                  ('11',RobustScaler)]:
        scale_bits, scaler_cls = scale
        data = v_data.copy()

        y_scale = data['y']
        y_scale_labels = list(y_scale.columns)
        X_scale = data['X']
        X_scale_labels = list(X_scale.columns)

        # defaults describe the unscaled variant
        data.update({'y_scale': y_scale,
                     'y_scale_labels': y_scale_labels,
                     'X_scale': X_scale,
                     'X_scale_labels': X_scale_labels,
                     'X_scaler': None,
                     'y_scaler': None})

        if scaler_cls is not None:
            # only the linear target (last key character '1') is scaled; the
            # logistic target stays 0/1
            if k_model[-1] == '1':
                y_scaler = scaler_cls()
                y_scale = y_scaler.fit_transform(np.array(y_scale)
                                                  .reshape(-1,1))
                data.update({'y_scale': y_scale,
                             'y_scaler': y_scaler})

            X_scaler = scaler_cls()
            X_scale = X_scaler.fit_transform(X_scale)
            data.update({'X_scale': X_scale,
                         'X_scaler': X_scaler})

        bit_str = scale_bits + k_model
        dict_8bit[bit_str] = data

In [564]:
# Number of model configurations, then for each one its key and the shape of
# every item stored under it.
print(len(dict_8bit))

for bit_key, bundle in dict_8bit.items():
    print(bit_key)
    for item_name, item in bundle.items():
        print('\t',item_name,'\t',np.shape(item))

112
00000011
	 y 	 (5000, 1)
	 X 	 (5000, 31)
	 y_scale 	 (5000, 1)
	 y_scale_labels 	 (1,)
	 X_scale 	 (5000, 31)
	 X_scale_labels 	 (31,)
01000011
	 y 	 (5000, 1)
	 X 	 (5000, 31)
	 y_scale 	 (5000, 1)
	 y_scale_labels 	 (1,)
	 X_scale 	 (5000, 31)
	 X_scale_labels 	 (31,)
10000011
	 y 	 (5000, 1)
	 X 	 (5000, 31)
	 y_scale 	 (5000, 1)
	 y_scale_labels 	 (1,)
	 X_scale 	 (5000, 31)
	 X_scale_labels 	 (31,)
11000011
	 y 	 (5000, 1)
	 X 	 (5000, 31)
	 y_scale 	 (5000, 1)
	 y_scale_labels 	 (1,)
	 X_scale 	 (5000, 31)
	 X_scale_labels 	 (31,)
00010011
	 y 	 (5000, 1)
	 X 	 (5000, 9)
	 afs 	 (9, 2)
	 y_scale 	 (5000, 1)
	 y_scale_labels 	 (1,)
	 X_scale 	 (5000, 9)
	 X_scale_labels 	 (9,)
01010011
	 y 	 (5000, 1)
	 X 	 (5000, 9)
	 afs 	 (9, 2)
	 y_scale 	 (5000, 1)
	 y_scale_labels 	 (1,)
	 X_scale 	 (5000, 9)
	 X_scale_labels 	 (9,)
10010011
	 y 	 (5000, 1)
	 X 	 (5000, 9)
	 afs 	 (9, 2)
	 y_scale 	 (5000, 1)
	 y_scale_labels 	 (1,)
	 X_scale 	 (5000, 9)
	 X_scale_labels 	 (9,)
11010011

In [565]:
# 8th bit [-9]: Modeling
# also rescaling
# 1 - Stacking
# 0 - Cross-fold validation

def get_stack(X,y,scaler,selection_name,selection,X_label,y_label):
    """Placeholder for the stacking model: not implemented yet, returns None."""
    return None

def get_cross_fold_log(X,y,X_scaler,y_scaler,selection_name,selection,X_label,
                       y_label,k_fold):
    """Cross-validate a logistic regression and summarise the fold metrics.

    Parameters
    ----------
    X : feature matrix (array or DataFrame), one row per observation.
    y : binary labels (1-D, or a single column), one per row of X.
    X_scaler : scaler that produced X, or None; reported back under 'scale'.
    y_scaler, selection_name, y_label : accepted for a uniform call signature;
        not used here.
    selection : feature selection summary; reported back under 'selection'.
    X_label : feature names, in the column order of X.
    k_fold : splitter object providing ``split(X)``.

    Returns
    -------
    dict with 'scale', 'selection', the mean ('*_avg') and standard deviation
    ('*_sd') of accuracy, precision, recall and f1 across the folds, and
    'coef', a frame with the per-feature coefficient averaged over the folds.
    """
    # Positional indexing with the fold indices needs plain arrays; a
    # DataFrame would interpret the indices as column labels.
    X = np.asarray(X)
    y = np.ravel(y)

    accuracy_list = list()
    precision_list = list()
    recall_list = list()
    f1_list = list()
    coef_list = list()

    for train_index, test_index in k_fold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        lm = LogisticRegression(fit_intercept=True,
                                solver='liblinear')
        lm.fit(X_train, y_train)

        y_pred = lm.predict(X_test)

        # zero_division=0: a fold without positive predictions/labels scores 0
        accuracy_list.append(metrics.accuracy_score(y_test,y_pred))
        precision_list.append(
            metrics.precision_score(y_test,y_pred,zero_division=0))
        recall_list.append(
            metrics.recall_score(y_test,y_pred,zero_division=0))
        f1_list.append(metrics.f1_score(y_test,y_pred,zero_division=0))
        coef_list.append(pd.DataFrame({"Feature":X_label,
                                       "Coefficients":lm.coef_[0]}))

    return {
        # the scaler passed in, not whatever a module-level variable named
        # 'scale' happens to hold
        'scale' : X_scaler,
        'selection' : selection,
        'accuracy_avg' : np.mean(accuracy_list),
        'accuracy_sd' : np.std(accuracy_list),
        'precision_avg' : np.mean(precision_list),
        'precision_sd' : np.std(precision_list),
        'recall_avg' : np.mean(recall_list),
        'recall_sd' : np.std(recall_list),
        'f1_avg' : np.mean(f1_list),
        'f1_sd' : np.std(f1_list),
        'coef' : pd.concat(coef_list).groupby('Feature',as_index=False,sort=False)['Coefficients'].mean()
    }

def get_cross_fold_lin(X,y,X_scaler,y_scaler,selection_name,selection,X_label,
                       y_label,k_fold):
    """Placeholder for the cross-validated linear regression.

    Not implemented yet: no model is fitted and an empty dict is returned.
    """
    return dict()

# Run every 8-bit configuration through both modeling variants. Keys become
# nine characters: the modeling bit followed by the 8-bit key.
# NOTE: get_stack and get_cross_fold_lin are still placeholders, so their
# entries hold None / an empty dict.
dict_9bit = dict()

# feature selection bits -> short name of the selection method
afs_names = {'000': None, '001': 'chi', '010': 'rfi',
             '011': 'rfecv', '100': 'ffe'}

for k_model, v_data in dict_8bit.items():
    for model in ['0','1']:
        data = v_data.copy()

        # unwrap the dict
        X = data['X_scale']
        y = data['y_scale']
        X_scaler = data['X_scaler']
        y_scaler = data['y_scaler']
        X_label = data['X_scale_labels']
        y_label = data['y_scale_labels']

        # Read the selection bits from the key of the model being processed.
        # Models without automatic selection carry no 'afs' entry, hence .get.
        selection_name = afs_names.get(k_model[-6:-3])
        selection = data.get('afs')

        bit_str = model + k_model

        if model == '0':
            k_fold = KFold(shuffle=True)
            # last key character: '0' = logistic target, '1' = linear target
            if k_model[-1] == '0':
                result = get_cross_fold_log(X,y,X_scaler,y_scaler,selection_name,selection,X_label,y_label,k_fold)
            else:
                result = get_cross_fold_lin(X,y,X_scaler,y_scaler,selection_name,selection,X_label,y_label,k_fold)
        else:
            result = get_stack(X,y,X_scaler,selection_name,selection,X_label,y_label)

        # keep the outcome; without this the results are discarded
        dict_9bit[bit_str] = result




# Assignment 2

## Introduction

## Data Exploration

## Data Preparation

### Identifying Significant Features

### Imputing and Variable Creation

## Data Modeling

### Scaling

## Model Evaluation

## Conclusion