In [84]:
import os
import pandas as pd
import numpy as np

In [85]:
def to_bool(val):
    """Convert a 0/1 flag value into a boolean, or ``pd.NA`` for anything else.

    Used as the per-column converter for the "... Present" and "Prefix is ..."
    spreadsheet columns listed in ``data_dict['data']['converters']``.

    Parameters
    ----------
    val : object
        Raw cell value. Anything comparing equal to 1 (``1``, ``1.0``,
        ``True``) maps to ``True``; anything comparing equal to 0 maps to
        ``False``.

    Returns
    -------
    bool or pandas.NA
        ``True`` / ``False`` for 1 / 0, ``pd.NA`` for every other value
        (empty strings, ``None``, NaN, ``pd.NA``, other numbers or text).
    """
    # pd.NA == 1 evaluates to pd.NA, and bool(pd.NA) raises TypeError, so an
    # already-missing value has to be short-circuited before the comparisons.
    if val is pd.NA:
        return pd.NA
    if val == 1:
        return True
    if val == 0:
        return False
    return pd.NA

# Central configuration for this notebook:
#   data_dict['data']    -> how the raw spreadsheet is read and renamed
#   data_dict['bin_don'] -> bin edges and labels for cumulative donations
data_dict = dict(
    data=dict(
        # spreadsheet header -> column name used in the DataFrame
        # (passed to DataFrame.rename after loading)
        rename={
            'ID Number': 'id',
            'Lifetime HC': 'cum_donation',
            'Email Present': 'has_email',
            'BusPhone Present': 'has_business_phone',
            'Grad Year': 'grad_year',
            'Marital Status': 'marital_status',
            'SpouseID Present': 'has_spousal_record',
            'JobTitle Present': 'has_job_title',
            'VarsityAth Present': 'has_activity_athlete',
            'StudGovt Present': 'has_activity_government',
            'OtherStudActs Present': 'has_activity_other',
            'Greek Present': 'has_activity_greek',
            'Prefix is Mr.': 'is_mr',
            'Prefix is Ms.': 'is_ms',
            'Prefix is Dr.': 'is_dr',
            'Prefix is Mrs.': 'is_mrs',
        },
        # explicit dtypes for the non-flag columns (keyed by spreadsheet header)
        dtype={
            'ID Number': str,
            'Lifetime HC': float,
            'Grad Year': int,
            'Marital Status': str,
        },
        # every 0/1 flag column goes through to_bool (keyed by spreadsheet header)
        converters=dict.fromkeys(
            [
                'Email Present',
                'BusPhone Present',
                'SpouseID Present',
                'JobTitle Present',
                'VarsityAth Present',
                'StudGovt Present',
                'OtherStudActs Present',
                'Greek Present',
                'Prefix is Mr.',
                'Prefix is Ms.',
                'Prefix is Dr.',
                'Prefix is Mrs.',
            ],
            to_bool,
        ),
    ),
    bin_don=dict(
        # 14 bin edges -> 13 intervals; consumed by pd.cut together with 'labels'
        bins=[
            0.0,
            1.0,
            1_000.0,
            10_000.0,
            25_000.0,
            50_000.0,
            100_000.0,
            250_000.0,
            500_000.0,
            1_000_000.0,
            2_500_000.0,
            5_000_000.0,
            10_000_000.0,
            15_000_000.0,
        ],
        # one label per interval, in the same order as the edges above
        labels=[
            '$0',
            '$1-$999.99',
            '$1K-$9.99K',
            '$10K-$24.99K',
            '$25K-$49.99K',
            '$50K-$99.99K',
            '$100K-$249.99K',
            '$250K-$499.99K',
            '$500K-$999.99K',
            '$1M-$2.49M',
            '$2.5M-$4.99M',
            '$5M-$9.99M',
            '$10M-$14.99M',
        ],
    ),
)

In [86]:
# Resolve <cwd>/../data/raw. os.path.abspath('') is the current working
# directory, so this assumes the notebook runs from a direct sub-folder of the
# project root -- TODO confirm.
project_dir = os.path.join(os.path.abspath(''), os.pardir)
data_dir = os.path.join(project_dir, 'data')
data_raw_dir = os.path.join(data_dir, 'raw')

# os.scandir yields entries in arbitrary order and also returns directories,
# hidden files and Excel lock files ("~$..."). Keep only regular, visible
# files and sort them so that file_list[0] is the same file on every run.
with os.scandir(data_raw_dir) as raw_entries:
    file_list = sorted(
        entry.path
        for entry in raw_entries
        if entry.is_file() and not entry.name.startswith(('.', '~$'))
    )

# Fail with a clear message instead of an IndexError on file_list[0].
if not file_list:
    raise FileNotFoundError(f'no data files found in {data_raw_dir}')

# Read the first raw file; dtypes and 0/1 -> bool converters come from
# data_dict, then the spreadsheet headers are mapped to snake_case names.
df = pd.read_excel(io=file_list[0],
                   sheet_name='Sheet1',
                   header=0,
                   dtype=data_dict['data']['dtype'],
                   converters=data_dict['data']['converters'])
df = df.rename(columns=data_dict['data']['rename'])

  warn(msg)


In [87]:
# remove columns that add no data
# (drop with errors='ignore' instead of `del` so the cell can be re-run after
# the column is already gone without raising KeyError)
df = df.drop(columns=['id'], errors='ignore')

# boolean version of cum_donation, used as an additional (classification) target:
# True when the cumulative donation is greater than zero
df['has_donated'] = df['cum_donation'] > 0.0

# linear (regression) target: cum_donation
y_lin = df['cum_donation']

# logistic (classification) target: has_donated
y_log = df['has_donated']

# imputing (not implemented yet)

# knnimputing (not implemented yet)

# binning
# NOTE: the previous `df.sort_values('cum_donation')` call discarded its result
# and therefore had no effect; pd.cut does not need sorted input, so it is removed.
# right=False makes every bin left-closed / right-open, e.g. [0, 1) -> '$0'.
# Values outside [0, 15,000,000) fall in no bin and become NaN.
df['bin_cum_don'] = pd.cut(
    x=df['cum_donation'],
    bins=data_dict['bin_don']['bins'],
    labels=data_dict['bin_don']['labels'],
    right=False,
)
df

Unnamed: 0,cum_donation,has_email,has_business_phone,grad_year,marital_status,has_spousal_record,has_job_title,has_activity_athlete,has_activity_government,has_activity_other,has_activity_greek,is_mr,is_ms,is_dr,is_mrs,has_donated,bin_cum_don
0,159.5,True,True,1960,M,True,False,False,False,False,False,False,False,True,False,True,$1-$999.99
1,0.0,True,True,1978,M,False,True,True,False,True,False,True,False,False,False,False,$0
2,0.0,True,False,2003,M,False,True,False,False,True,False,True,False,False,False,False,$0
3,1500.0,True,False,1973,S,False,True,False,False,False,False,False,False,True,False,True,$1K-$9.99K
4,773.0,True,True,1969,M,True,True,False,False,True,True,False,False,True,False,True,$1-$999.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,False,True,2000,U,False,False,False,False,False,False,False,False,True,False,False,$0
4996,0.0,False,False,1983,S,False,False,False,False,False,False,False,False,False,False,False,$0
4997,30.0,False,False,1976,U,False,False,False,False,False,False,False,True,False,False,True,$1-$999.99
4998,0.0,False,False,1969,U,False,False,False,False,False,False,False,True,False,False,False,$0


In [88]:
# marital_status codes: M - Married, S - Single, U - Unknown, D - Divorced, W - Widowed; P, E, Z - meaning not known
# TODO: map U, P, E, Z, and NaN to a single "Unknown" value, then categorize the column

# Assignment 2

## Introduction

## Data Exploration

## Data Preparation

### Identifying Significant Features

### Imputing and Variable Creation

## Data Modeling

### Scaling

## Model Evaluation

## Conclusion