In [4]:
import mysql.connector
import pandas as pd
import csv
from datetime import datetime
from itertools import product
import os
import json
import boto3
from io import BytesIO
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

### GLOBAL VARS ###

## AWS ##
# S3 client and bucket names
s3 = boto3.client('s3')
bucket1 = 'bootcampjanuary2018-1a'
bucket2 = 'bootcampaugust2017-1b'

## QUERIES & DB ##
# get all table names
table_name_query = "SELECT table_name FROM information_schema.tables WHERE table_type = 'base table'"

# get number of rows in a table (format with the table name)
get_row_count_query = "SELECT COUNT(*) FROM Bootcamp.{}"

# get full table (format with the table name)
get_table = "SELECT * FROM Bootcamp.{}"

# credentials for connecting to the MySQL db; the context manager closes the
# file handle as soon as the JSON has been parsed
with open('hidden/creds.json') as creds_file:
    db_creds = json.load(creds_file)

## DATA FRAME VARS ##

# columns that must be present in a joined table before it is cleaned
needed_cols = [
    'loan_amnt', 'open_acc', 'pub_rec', 'total_acc', 'inq_last_6mths', 'earliest_cr_line',
    'inactive_loans', 'bad_loans', 'pub_rec_zero', 'collections_12_mths_ex_med', 'home_ownership',
    'issue_d', 'status', 'grade', 'term', 'funded_amnt', 'funded_amnt_inv',
    'initial_list_status', 'sub_grade', 'is_inc_v', 'pymnt_plan', 'purpose', 'mths_since_last_delinq',
    'id', 'member_id'
]

grades = 'ABCDEFG' # loan grades

# sub grades: every grade letter combined with 1..5 (A1, A2, ..., G5)
sub_grade = [''.join([x[0], str(x[1])]) for x in product(grades, range(1,6))]

# map regions
regions_num = {
    1: 'NE', 
    2: 'SE', 
    3: 'NC', 
    4: 'NW', 
    5: 'SW'
}

# region -> state codes
# NOTE(review): 'NA' is not a state code; presumably a placeholder value in the data - confirm
regions = {
    'NE': ['ME', 'NA', 'VT', 'MA', 'RI', 'CT', 'NJ', 'PA', 'MD', 'VA', 
           'WV', 'KY', 'OH', 'IN', 'NY', 'NH', 'DC', 'DE'],
    'SE': ['NC', 'SC', 'GA', 'FL', 'AL', 'TN', 'MS', 'AR', 'LA', 'OK', 'TX'],
    'NC': ['ND', 'SD', 'NE', 'KS', 'MO', 'IA', 'MN', 'WI', 'MI', 'IL'],
    'NW': ['WA', 'OR', 'ID', 'MT', 'WY', 'AK'],
    'SW': ['CA', 'NV', 'UT', 'CO', 'NM', 'AZ', 'HI']
}

# inverse lookup: state code -> region
regionsFix = {state: region for region, states in regions.items() for state in states}

# allowed loan purposes
purposes = ["car", "credit_card", "other", "house", "debt_consolidation",
                "home_improvement", "small_business", "medical", "vacation",
                "moving", "wedding", "major_purchase"]

def getDBCursor(creds):
    """
    Open a MySQL connection and create a cursor on it.

    Input: creds (dictionary of login credentials, passed as keyword arguments
           to mysql.connector.connect)
    Output: tuple of (MySQL connection, MySQL cursor object)
    """
    connection = mysql.connector.connect(**creds)
    cursor = connection.cursor()
    return connection, cursor

def queryDB(cursor, query):
    """
    Run a sql query on the given cursor and fetch every row of the result.

    Input: cursor (db cursor object), query (string of sql query)
    Output: whatever cursor.fetchall() returns for the query (all result rows)
    """
    cursor.execute(query)
    rows = cursor.fetchall()
    return rows

def getAllYearsData(conn, start_year, end_year):
    """
    Get a dataframe with the data of all years from start_year to end_year
    (both inclusive). Each year is read from the table Bootcamp.<year>_Data.

    Input: conn (db connection), start_year (int), end_year (int);
           the two years may be given in either order
    Output: df (dataframe), the yearly tables stacked on top of each other
    """
    # accept the bounds in either order
    if start_year > end_year:
        start_year, end_year = end_year, start_year
    # read every year first and concatenate once; concatenating inside the loop
    # would re-copy all previously loaded rows on every iteration
    frames = [
        pd.read_sql('SELECT * FROM Bootcamp.{}_Data'.format(year), con=conn)
        for year in range(start_year, end_year + 1)
    ]
    return pd.concat(frames)

def getMemberInfo(conn):
    """
    Read the whole Bootcamp.Member_Information table.

    Input: conn (db connection)
    Output: dataframe with every row and column of the table
    """
    member_query = 'SELECT * FROM Bootcamp.Member_Information'
    members = pd.read_sql(member_query, con=conn)
    return members
    
def getBootcampData(creds, start_year, end_year):
    """
    Load the yearly loan tables for a range of years and join them with the
    member information.

    Input: creds (db credentials), start_year (int), end_year (int);
           the two years may be given in either order
    Output: dataframe with the combined member_id and year info
    """
    # accept the bounds in either order
    if start_year > end_year:
        start_year, end_year = end_year, start_year
    cnx, curs = getDBCursor(creds)
    try:
        year_df = getAllYearsData(cnx, start_year, end_year)
        mem_df = getMemberInfo(cnx)
    finally:
        # release the cursor before its connection, even if a read failed
        curs.close()
        cnx.close()
    # inner join: members without a loan row in the requested years are dropped
    # (ex. only a subset of the years is requested.)
    return pd.merge(year_df, mem_df, on='member_id', how='inner')

def executeQuery(creds, query):
    """
    Executes a query by connecting, querying, and closing the db connection.
    Errors are printed, not raised.

    Input: creds (db login credentials), query (string of sql query)
    Output: the rows returned by the query, or None if connecting or querying failed
    """
    result = None
    # pre-bind both names so the cleanup below is safe when connecting itself fails
    conn = None
    cursor = None
    try:
        conn, cursor = getDBCursor(creds)
        result = queryDB(cursor, query)
    except Exception as e:
        print(e)
    finally:
        # close the cursor before the connection it belongs to
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
    return result

def getTableRowCount(table):
    """
    Get the number of rows in a given table
    
    Input: table (str, name of a table in the Bootcamp schema)
    Output: number of rows (int), or None if the count could not be fetched
    """
    try:
        # first column of the first row holds COUNT(*)
        return executeQuery(db_creds, get_row_count_query.format(table))[0][0]
    except Exception:
        # executeQuery returns None on failure (not subscriptable); report that as "unknown"
        return None
    
def getDataFrame(table_name):
    """
    Create a pandas dataframe from a full table of the Bootcamp schema.
    
    Input: table_name (str)
    Output: pandas dataframe
    """
    # pre-bind both names so the cleanup below is safe when connecting itself fails
    conn = None
    cursor = None
    try:
        conn, cursor = getDBCursor(db_creds)
        result = pd.read_sql_query(get_table.format(table_name), conn)
    finally:
        # close the cursor before the connection it belongs to
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
    return result
    
def cleanData(table_name, mem_info):
    """
    Load a table from the db, join it with the member information on member_id
    and clean up the data.
    
    Input: table_name (str), mem_info (dataframe)
    Output: cleaned pandas dataframe; an empty dataframe if any column listed in
            needed_cols is missing after the join
    """
    df = getDataFrame(table_name)
    df = pd.merge(df, mem_info, on='member_id').apply(pd.to_numeric, errors='ignore')
    
    # check that all the needed columns are present; whole column names are
    # compared, a substring test on the joined names would let 'sub_grade'
    # stand in for 'grade' or 'member_id' for 'id'
    present = set(df.columns)
    if not all(col in present for col in needed_cols):
        return pd.DataFrame()
    
    # delete rows with nulls in these columns
    df = dropNanRowsColSpecific(df, ['loan_amnt', 'open_acc', 'pub_rec', 'total_acc', 'inq_last_6mths'])
    
    # fix dates: every column ending in '_d' plus earliest_cr_line
    for col in list(df.columns.values): 
        if col.endswith('_d') or col == 'earliest_cr_line':
            df[col] = df[col].apply(lambda x: cleanDate(str(x).strip()))
    
    # drop unneccessary columns (columns that do not exist are ignored)
    col_to_drop = ['id', 'loan_status', 'url', 'desc', 'title', 'revol_bal', 'revol_util', 
                   'policy_code', 'grade_num', 'sub_grade_num', 'mths_since_last_record',
                  'collections_12_mths_zero', 'emp_title', 'emp_length',
                  'mths_since_last_major_derog', 'delinq_2yrs_zero', 'zip_code']
    df.drop(col_to_drop, axis=1, inplace=True, errors='ignore')
    
    # specific fixes for certain columns; none of the later steps changes the
    # funded amounts, so the funded/applied filter only needs to run once
    df = fixFundedToApplied(df)
    df = fixTerm(df)
    df = fixInitListStatus(df)
    df = fixGrade(df)
    df = fixSubGrade(df)
    df = fixIsIncV(df)
    df = fixDeliquency(df)
    df = fixPaymentPlan(df)
    df = fixPurpose(df)
    df = fixStatus(df)
    df = fixIssueDate(df)
    df = fixRegion(df)
    
    # change columns to only have either a 0 or 1
    df = fixBinary01(df, 'inactive_loans')
    df = fixBinary01(df, 'bad_loans')
    df = fixBinary01(df, 'pub_rec_zero')
    
    # fill nan's with specified default values
    nan_fill = {
        'collections_12_mths_ex_med': 0,
    }
    df.fillna(nan_fill, inplace=True)
    
    df['return'] = df['total_pymnt']/df['funded_amnt'] #% return for each loan
    df['profit'] = df['total_pymnt'] - df['funded_amnt'] #calculate profit per loan
    
    return df

def fixRegion(df):
    """
    Add a 'region' column by looking up each row's addr_state in regionsFix.
    The dataframe is modified in place.
    
    Input: df (dataframe)
    Output: the same dataframe with the new column
    """
    region_by_row = df['addr_state'].map(regionsFix)
    df['region'] = region_by_row
    return df

def cleanDate(in_date):
    """
    Parse the leading YYYYMMDD part of a string into a date.
    
    Input: in_date (str)
    Output: datetime obj built from the first 8 characters when the string is
            longer than 8 characters, otherwise an empty string
    """
    if len(in_date) <= 8:
        return ''
    date_part = in_date[:8]
    return datetime.strptime(date_part, '%Y%m%d')
    
def fixIssueDate(df):
    """
    Eliminate impossible issue dates: keep only rows whose issue_d is a date
    after 2006-12-31 and before now. Values that are not dates at all (e.g. the
    empty string cleanDate returns for unparsable input) are dropped as well
    instead of raising a TypeError in the comparison.
    
    Input: df (dataframe)
    Output: dataframe
    """
    earliest = datetime(2006, 12, 31)
    latest = datetime.now()  # evaluated once so every row is compared against the same instant

    def isPlausible(value):
        return isinstance(value, datetime) and earliest < value < latest

    # astype(bool) keeps the mask usable for boolean indexing on an empty frame
    keep = df['issue_d'].apply(isPlausible).astype(bool)
    return df[keep]
    
def fixStatus(df):
    """
    Normalize the status column (strip whitespace, lower case) and add a
    status_binary column that is 1 for 'default' / 'charged off' and 0 for
    every other status. The dataframe is modified in place.
    
    Input: df (dataframe)
    Output: the same dataframe
    """
    def normalize(raw):
        return raw.strip().lower()

    df['status'] = df['status'].map(normalize)
    ones = ['default', 'charged off']
    df['status_binary'] = [1 if status in ones else 0 for status in df['status']]
    return df

def dropNanRowsColSpecific(df, col_to_drop):
    """
    Drop the rows that have a null in any of the specified columns.
    Columns that do not exist in the dataframe are skipped.
    
    Input: df (dataframe), col_to_drop (list of column names)
    Output: df (dataframe)
    """
    relevant_cols = [name for name in col_to_drop if name in df.columns]
    return df.dropna(subset=relevant_cols, how='any')

def fixGrade(df):
    """
    Upper-case the grade column (in place) and keep only the rows whose grade
    is one of the letters in grades.

    Input: df (dataframe)
    Output: dataframe
    """
    df['grade'] = df['grade'].map(lambda letter: letter.upper())
    has_valid_grade = df['grade'].isin(list(grades))
    return df[has_valid_grade]

def fixTerm(df):
    """
    Reduce the term to its leading number (dropping e.g. 'months') and cast it
    to int. The dataframe is modified in place.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    def monthsOnly(raw):
        first_token = str(raw).split()[0]
        # go through float so values like '36.0' are accepted
        return int(float(first_token))

    df['term'] = df['term'].apply(monthsOnly)
    return df

def fixFundedToApplied(df):
    """
    Remove records with inconsistent funding amounts. A row is kept only if
      - funded_amnt is positive,
      - funded_amnt does not exceed the amount applied for (loan_amnt), and
      - funded_amnt is at least funded_amnt_inv.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    funded = df['funded_amnt']
    # the positive-amount filter is what the former 'funded < 2 * funded' test
    # amounted to; the comparison against loan_amnt is the documented check
    keep = (funded > 0) & (funded <= df['loan_amnt']) & (funded >= df['funded_amnt_inv'])
    return df[keep]

def fixInitListStatus(df):
    """
    Remove rows whose initial_list_status isn't F or W (case-insensitive) and
    encode the remaining values as 0 (F) / 1 (W).
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    df['initial_list_status'] = df['initial_list_status'].apply(lambda x: str(x).upper())
    # copy the filtered rows so the assignment below writes to an independent
    # frame instead of a slice of the input (avoids SettingWithCopyWarning)
    df = df[df['initial_list_status'].isin(['F', 'W'])].copy()
    df['initial_list_status'] = df['initial_list_status'].apply(lambda x: 0 if x == 'F' else 1)
    return df

def fixEmpLength(value):
    """
    Clean employment length, should only contain a numeric integer value
    i.e. 10+ years transforms to 10
         < 1 transforms to 1
         3 years transforms to 3
         
    Input: value of cell (String)
    Output: years of employment (Int)
    Raises: ValueError if the value does not start with a number
    """
    value = value.strip()
    if value.startswith('< 1'):
        return 1
    # prefix test instead of indexing value[2], which failed for the bare string '10'
    if value.startswith('10+'):
        return 10
    
    first_token = value.split(' ')[0]
    if first_token.isdigit():
        return int(first_token)
        
    return int(value)

def fixSubGrade(df):
    """
    Upper-case the sub_grade column (in place) and keep only the rows whose
    value is listed in sub_grade (A1,A2,...,G4,G5).
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    df['sub_grade'] = df['sub_grade'].map(lambda code: code.upper())
    is_known = df['sub_grade'].isin(sub_grade)
    return df[is_known]
    
def fixIsIncV(df):
    """
    Lower-case the is_inc_v column (in place) and keep only the rows holding
    one of the enumerated verification statuses.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    allowed = ["verified", "source verified", "not verified"]
    df['is_inc_v'] = df['is_inc_v'].map(lambda raw: raw.lower())
    has_allowed_status = df['is_inc_v'].isin(allowed)
    return df[has_allowed_status]

def fixPaymentPlan(df):
    """
    Only leave boolean payment plans: rows whose pymnt_plan is not 'n' or 'y'
    (case-insensitive) are removed, the rest is encoded as 0 (n) / 1 (y).
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    df['pymnt_plan'] = df['pymnt_plan'].apply(lambda x: x.lower())
    # copy the filtered rows so the assignment below writes to an independent
    # frame instead of a slice of the input (avoids SettingWithCopyWarning)
    df = df[df['pymnt_plan'].isin(['n', 'y'])].copy()
    df['pymnt_plan'] = df['pymnt_plan'].apply(lambda x: 0 if x == 'n' else 1)
    return df

def fixPurpose(df):
    """
    Lower-case the purpose column (in place) and keep only the rows whose
    purpose is listed in purposes.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    df['purpose'] = df['purpose'].map(lambda raw: raw.lower())
    # remove rows not in the given list of purposes
    is_listed = df['purpose'].isin(purposes)
    return df[is_listed]

def fixBinary01(df, col_name):
    """
    Cast a column to float where possible (in place) and keep only the rows
    whose value in that column is 0 or 1.
    
    Input: df (dataframe), col_name (str)
    Output: df (dataframe)
    """
    as_float = df[col_name].astype('float', errors='ignore')
    df[col_name] = as_float
    is_binary = df[col_name].isin([0, 1])
    return df[is_binary]

def toInt(i):
    """
    Convert input to integer.
    
    Input: i (anything)
    Output: i (int), if error return original
    """
    try: 
        return int(i.strip())
    except (AttributeError, TypeError, ValueError):
        # AttributeError: i has no strip() (not a string)
        # ValueError: the stripped text is not an integer literal
        return i
    
def fixDeliquency(df):
    """
    Add a 'deliquency' column that is 1 if delinq_2yrs is greater than 0 or
    mths_since_last_delinq lies in (0, 24], else 0. Missing values count as
    "no delinquency". The delinq_2yrs and last_delinq_none columns are dropped
    afterwards; the dataframe is modified in place.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    months = df['mths_since_last_delinq']
    # column-wise comparisons instead of a row-by-row apply: same result,
    # no per-row Python call, and it also works on an empty frame
    recent = (months > 0) & (months <= 24)
    df['deliquency'] = ((df['delinq_2yrs'] > 0) | recent).astype(int)
    df.drop(['delinq_2yrs', 'last_delinq_none'], axis=1, inplace=True)
    return df

def getDBTables():
    """
    Collect the row counts of the tables this pipeline works with:
    Member_Information (if it has rows) and every table whose name contains
    'data' and a year ('20xx') that lies before the current year.

    Output: dictionary mapping table name -> number of rows
    """
    # executeQuery returns None when the query fails; treat that as "no tables"
    tables = [row[0] for row in executeQuery(db_creds, table_name_query) or []]
    data_tables = {}
    memb_count = executeQuery(db_creds, get_row_count_query.format('Member_Information'))
    if memb_count:
        memb_count = memb_count[0][0]
        if memb_count > 0:
            data_tables['Member_Information'] = memb_count

    current_year = datetime.now().year
    for table in tables:
        if 'data' not in table.lower():
            continue
        y_start = table.find('20')
        if y_start == -1:
            continue  # no year in the table name
        try:
            year = int(table[y_start:y_start + 4])
        except ValueError:
            continue  # the characters after '20' are not a year
        # the table of the running year is left out
        if year >= current_year:
            continue
        try:
            count = executeQuery(db_creds, get_row_count_query.format(table))
        except Exception as e:
            # best effort: a table that cannot be counted is skipped
            print(e)
            continue
        if count:
            data_tables[table] = count[0][0]
    return data_tables

def moveToDeprecated(table_name):
    """
    Move a table's csv inside bucket1 from data/ to deprecated/, adding
    today's date to the file name. Errors are printed, not raised.
    
    Input: table_name (str)
    """
    date_stamp = datetime.now().strftime('%Y_%m_%d')
    current_path = 'data/{}.csv'.format(table_name)
    deprecated_path = 'deprecated/{}_{}.csv'.format(table_name, date_stamp)
    try:
        copy_source = bucket1 + '/' + current_path
        # S3 has no move: copy to the new key, then delete the old one
        s3.copy_object(Bucket=bucket1, CopySource=copy_source, Key=deprecated_path)
        s3.delete_object(Bucket=bucket1, Key=current_path)
    except Exception as e:
        print(e)

def uploadFile(filename, obj, is_json):
    """
    Upload data obj to s3 (bucket1) via a temporary file in /tmp.
    JSON objects are stored under filename, dataframes as csv under
    data/filename. Anything else is silently ignored.
    
    Input: filename (str), obj (data source), is_json (bool, dump obj as json)
    """
    # check the object that was passed in, not a global dataframe
    if not is_json and not isinstance(obj, pd.DataFrame):
        return # what were you trying to upload anyway...
    tmp_name = '/tmp/' + filename
    try:
        if is_json:
            with open(tmp_name, 'w+') as outfile:  
                json.dump(obj, outfile)
        else:
            obj.to_csv(tmp_name)
            filename = 'data/' + filename
        s3.upload_file(tmp_name, bucket1, filename) # upload
    finally:
        # clean up, also when writing or uploading failed
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
    
def s3ToDataFrame(table_name):
    """
    Download data/<table_name>.csv from bucket1 and parse it.

    Input: table_name (str)
    Output: pandas dataframe
    """
    csv_key = 'data/{}.csv'.format(table_name)
    response = s3.get_object(Bucket=bucket1, Key=csv_key)
    raw_bytes = response['Body'].read()
    return pd.read_csv(BytesIO(raw_bytes))

In [79]:
# Sync step: compare the row counts of the MySQL tables with the counts recorded
# in table_data.json on S3, then re-clean and re-upload only what is new or changed.
data_tables = getDBTables() # get table names (and row counts) from mysql
new_json = {} # place to hold new json that will be the reference next time

# get json with table names (table name -> row count recorded by the previous run)
obj = s3.get_object(Bucket=bucket1, Key='table_data.json') 
data_from_json = json.loads(obj['Body'].read().decode("utf-8"))

to_add = [] # files to upload to s3
files_good_to_go = {} # tables whose csv on s3 is up to date (table name -> row count)
# iterate over a copy of the keys because matching tables are deleted from data_tables
for table in list(data_tables):
    if table in data_from_json: # if in both
        if data_from_json[table] == data_tables[table]: 
            # if the size of both tables matches, we're good to go and can just move on
            files_good_to_go[table] = data_tables[table]
            del data_from_json[table]
            del data_tables[table]
        else: # row count changed: add table to to_add (the old csv is moved to deprecated below)
            to_add.append(table)
            new_json[table] = data_tables[table]
    else: # just in MySQL
        to_add.append(table)
        new_json[table] = data_tables[table]
        
# data_from_json now only holds entries whose csv is outdated (row count changed)
# or whose table no longer shows up in MySQL; their csv files are moved to deprecated/.
for table in data_from_json:
    moveToDeprecated(table)

# member information is needed to clean every data table: reload it from MySQL
# when it changed, otherwise reuse the csv already stored on s3
member_info = None
if 'Member_Information' in to_add:
    member_info = getDataFrame('Member_Information')
    uploadFile('Member_Information.csv', member_info, False)
    files_good_to_go['Member_Information'] = new_json['Member_Information']
else: 
    member_info = s3ToDataFrame('Member_Information')
    
# record what is known to be up to date before the (slow) cleaning starts
uploadFile('table_data.json', files_good_to_go, True)
json_upload = []
# to_add is the tables that need to be cleaned and added 
for table in to_add:
    if not 'data' in table.lower(): # skip items that aren't data files (membership information)
        continue
    # NOTE(review): cleanData returns an empty dataframe when required columns are
    # missing; that empty frame is still uploaded and recorded as done - confirm intended
    df = cleanData(table, member_info)
    filename = '{}.csv'.format(table)
    uploadFile(filename, df, False)
    # upload json with progress to avoid redoing work if crash
    files_good_to_go[table] = new_json[table]
    uploadFile('table_data.json', files_good_to_go, True)

# join tables here into one df
# download every cleaned data table first and concatenate once; concatenating
# inside the loop would re-copy all previously loaded rows on every iteration
frames = []
for table in list(files_good_to_go):
    if 'data' not in table.lower(): # skip items that aren't data files (membership information)
        continue
    frames.append(s3ToDataFrame(table))
# df stays None when there is no data table at all
df = pd.concat(frames) if frames else None
        
## MODELING ##
# historic_df: loans that are not in one of the still-running statuses (used for training)
# open_df: loans that are not in one of the finished statuses (scored with the model)
# .copy() makes both independent frames so the later column assignment on open_df
# does not write to a slice of df (SettingWithCopyWarning).
# NOTE(review): a status that appears in neither list ends up in both frames and is
# duplicated when they are rejoined below - confirm the two lists cover all statuses
historic_df = df[~df['status'].isin(['late (31-120 days)','current','in grace period','late (16-30 days)'])].copy()
open_df = df[~df['status'].isin(['fully paid','default','charged off'])].copy()

# Define predictors and response for feature selection
logistic = LogisticRegression()

# define response variable (1 = default / charged off, see fixStatus)
y = historic_df['status_binary'].values

# Create dummy variables and combine with quantitative into predictors matrix
categories = historic_df[['is_inc_v','home_ownership','purpose','addr_state','sub_grade']]
categories = pd.get_dummies(categories, dummy_na = True)
quant = historic_df[['funded_amnt', 'term', 'int_rate','emp_length_num','dti', 'open_acc','total_acc']]
X = pd.concat([quant,categories], axis = 1)

# same predictors for the open loans
categories2 = open_df[['is_inc_v','home_ownership','purpose','addr_state','sub_grade']]
quant2 = open_df[['funded_amnt', 'term', 'int_rate','emp_length_num','dti', 'open_acc','total_acc']]
categories2 = pd.get_dummies(categories2, dummy_na = True)
X2 = pd.concat([quant2, categories2], axis = 1)

# make sure the columns of the df's match: a dummy column only exists where its
# category occurs, so give both matrices the same columns in the same order and
# fill the ones a frame did not have with 0
hist_cols = set(X.columns)
open_cols = set(X2.columns)
feature_cols = list(X.columns) + [col for col in X2.columns if col not in hist_cols]
X = X.reindex(columns=feature_cols, fill_value=0)
X2 = X2.reindex(columns=feature_cols, fill_value=0)

# Fit a logistic model (without using cross validation)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=62)
model = logistic.fit(x_train,y_train)
preds = logistic.predict(x_test)


#calculate model performance results
# rows are true labels, columns are predictions; with labels=[0, 1] the matrix is
# always 2x2 and ravel() yields tn, fp, fn, tp (positive class = 1 = default)
results = confusion_matrix(y_test, preds, labels=[0, 1])
tn, fp, fn, tp = results.ravel()

# calculate metrics (0.0 when a denominator is empty instead of dividing by zero)
total = tp + tn + fp + fn
accuracy = float(tp + tn) / total if total else 0.0
precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
recall = float(tp) / (tp + fn) if (tp + fn) else 0.0

model_metrics = [accuracy, precision, recall]
uploadFile('metrics.json', model_metrics, True)

# score the open loans once and reuse the result; predict() rejects an empty matrix
defaults = logistic.predict(X2) if len(X2) else np.array([], dtype=int)
open_df['predict'] = defaults

# rejoin everything; historic loans have no prediction and get -1
df = pd.concat([open_df, historic_df])
nan_fill = {
        'predict': -1,
    }
df.fillna(nan_fill, inplace=True)

tmp_name = '/tmp/modeled_data.csv'
try:
    df.to_csv(tmp_name)
    s3.upload_file(tmp_name, bucket1, 'modeled_data.csv') # upload
finally:
    # clean up, also when writing or uploading failed
    if os.path.exists(tmp_name):
        os.remove(tmp_name)

remove:  2007_Data
tables joined


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


modeling finished
