In [511]:
import mysql.connector
import pandas as pd
import csv
from datetime import datetime
from itertools import product
import os
import json

### GLOBAL VARS ###
# credentials for connecting to the MySQL db
# (context manager so the file handle is closed once the JSON is parsed)
with open('hidden/creds.json') as creds_file:
    db_creds = json.load(creds_file)

grades = 'ABCDEFG' # loan grades

# sub grades: every grade letter combined with 1..5 ('A1', 'A2', ..., 'G5')
sub_grade = [''.join([x[0], str(x[1])]) for x in product(grades, range(1,6))]
# position of each sub grade in the list above ('A1' -> 0, ..., 'G5' -> 34)
sub_grade_conversion = {name: position for position, name in enumerate(sub_grade)}

# map regions
# region number -> region label
regions_num = {
    1: 'NE', 
    2: 'SE', 
    3: 'NC', 
    4: 'NW', 
    5: 'SW'
}

# region number -> state abbreviations belonging to that region
# NOTE(review): 'NA' in region 1 is not a state code; it presumably stands for
# 'NH' or 'NY' (both are absent, as is 'DE') -- confirm against the data.
regions = {
    1: ['ME', 'NA', 'VT', 'MA', 'RI', 'CT', 'NJ', 'PA', 'MD', 'VA', 'WV', 'KY', 'OH', 'IN'],
    2: ['NC', 'SC', 'GA', 'FL', 'AL', 'TN', 'MS', 'AR', 'LA', 'OK', 'TX'],
    3: ['ND', 'SD', 'NE', 'KS', 'MO', 'IA', 'MN', 'WI', 'MI', 'IL'],
    # 'AK' (Alaska) replaces a second 'AL', which is already listed in region 2
    4: ['WA', 'OR', 'ID', 'MT', 'WY', 'AK'],
    5: ['CA', 'NV', 'UT', 'CO', 'NM', 'AZ', 'HI']
}


def getDBCursor(creds):
    """
    Open a MySQL connection and create a cursor on it.

    Input: creds (dictionary of login credentials, passed as keyword
           arguments to mysql.connector.connect)
    Output: tuple of (MySQL connection, MySQL cursor object)
    """
    connection = mysql.connector.connect(**creds)
    cursor = connection.cursor()
    return connection, cursor

def queryDB(cursor, query):
    """
    Run a sql query on an open cursor and collect every row.

    Input: cursor (MySQL cursor object), query (string of sql query)
    Output: whatever cursor.fetchall() returns for the executed query
    """
    cursor.execute(query)
    rows = cursor.fetchall()
    return rows

def getAllYearsData(conn, start_year, end_year):
    """
    Get a dataframe with all the years data.

    Reads one `Bootcamp.<year>_Data` table per year in the inclusive range
    and stacks the results. If the bounds are given in reverse order they
    are swapped, so the same range is read either way.
    
    Input: conn (db connection), start_year (int), end_year (int)
    Output: df (dataframe)
    """
    if start_year > end_year:
        # full swap; the earlier code only overwrote start_year, which
        # collapsed a reversed range down to a single year
        start_year, end_year = end_year, start_year
    # read every year first and concatenate once instead of growing a frame
    # inside the loop
    yearly_frames = [
        pd.read_sql('SELECT * FROM Bootcamp.{}_Data'.format(str(year)), con=conn)
        for year in range(start_year, end_year + 1)
    ]
    return pd.concat(yearly_frames)

def getMemberInfo(conn):
    """
    Load the whole Bootcamp.Member_Information table into a dataframe.
    
    Input: conn (db connection)
    Output: dataframe
    """
    member_query = 'SELECT * FROM Bootcamp.Member_Information'
    member_df = pd.read_sql(member_query, con=conn)
    return member_df
    
def getBootcampData(creds, start_year, end_year):
    """
    Load the yearly loan tables and the member table and join them.

    Opens its own database connection, reads the data for the inclusive
    year range and the member information, and always closes the cursor
    and connection afterwards, even if a read fails. Reversed year bounds
    are swapped.

    Input: creds (db credentials), start_year (int), end_year (int)
    Output: dataframe with the combined member_id and year info
    """
    if start_year > end_year:
        # full swap; the earlier code left both bounds equal to end_year
        start_year, end_year = end_year, start_year
    cnx, curs = getDBCursor(creds)
    try:
        year_df = getAllYearsData(cnx, start_year, end_year)
        mem_df = getMemberInfo(cnx)
    finally:
        # close the cursor before the connection it belongs to
        curs.close()
        cnx.close()
    # inner join: only rows whose member_id appears in both frames are kept
    # (ex. only a subset of the years is requested.)
    return pd.merge(year_df, mem_df, on='member_id', how='inner') 

def executeQuery(creds, query):
    """
    Executes a query by connecting, querying, and closing the db connection.

    Database errors are printed rather than raised; in that case the
    function returns None.

    Input: creds (db login credentials), query (string of sql query)
    Output: result of queryDB for the query, or None if a db error occurred
    """
    result = None
    # pre-bind so the cleanup below is safe even when the connect itself fails
    conn = None
    cursor = None
    try:
        conn, cursor = getDBCursor(creds)
        result = queryDB(cursor, query)
    except mysql.connector.Error as e:
        # the bare name `Error` was never imported; use the connector's
        # base exception class
        print(e)
    finally:
        # close the cursor before the connection it belongs to
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
    return result

def getDataFrame(filename):
    """
    Create a pandas dataframe from a file.
    
    Input: filename (csv file, str)
    Output: pandas dataframe, or None when the path is not an existing file
            or the file cannot be read as csv
    """
    if not os.path.isfile(filename):
        return None
    try:
        return pd.read_csv(filename)
    except (IOError, ValueError):
        # pandas reports unparsable / empty / undecodable input with
        # ValueError subclasses, not IOError
        print('Use a csv file.')
        return None
    
def cleanData(start_year=2007, end_year=2007):
    """
    Load the Bootcamp loan data from the database and clean it up.

    The data comes from getBootcampData using the module-level db_creds.
    Cleaning steps, in order: numeric conversion where possible, dropping
    rows with nulls in key columns, parsing date columns, dropping unused
    columns, filling missing terms, then the per-column fix* filters.
    
    Input: start_year (int, default 2007), end_year (int, default 2007)
    Output: pandas dataframe object
    """
    df = getBootcampData(db_creds, start_year, end_year).apply(pd.to_numeric, errors='ignore')
    # delete rows with nulls in these columns
    df = dropNanRowsColSpecific(df, ['loan_amnt', 'open_acc', 'pub_rec', 'total_acc', 'inq_last_6mths'])
    
    # fix dates: every column ending in '_d' plus earliest_cr_line
    for col in list(df.columns.values): 
        if col.endswith('_d') or col in ['earliest_cr_line']:
            df[col] = df[col].apply(lambda x: cleanDate(x.strip()))
                
    # drop unneccessary columns
    col_to_drop = ['id', 'loan_status', 'url', 'desc', 'title', 'revol_bal', 'revol_util', 
                   'policy_code', 'grade_num', 'sub_grade_num', 'mths_since_last_record',
                  'collections_12_mths_zero', 'payment_inc_ratio', 'emp_title', 'emp_length',
                  'mths_since_last_major_derog', 'delinq_2yrs_zero', 'zip_code']
    df.drop(col_to_drop, axis=1, inplace=True)
    
    # fill nan's with specified values
    nan_fill = {
        'term': 0
    }
    df.fillna(nan_fill, inplace=True)
    
    # specific fixes for certain columns
    df = fixTerm(df)
    df = fixInitListStatus(df)
    df = fixGrade(df)
    df = fixSubGrade(df)
    df = fixIsIncV(df)
    df = fixDeliquency(df)
    df = fixFundedToApplied(df)
    df = fixPaymentPlan(df)
    df = fixPurpose(df)
    df = fixStatus(df)
    df = fixIssueDate(df)
    
    # change columns to only have either a 0 or 1
    df = fixBinary01(df, 'inactive_loans')
    df = fixBinary01(df, 'bad_loans')
    df = fixBinary01(df, 'pub_rec_zero')
    
    return df

def cleanDate(in_date):
    """
    Turn str into date.

    Only the first eight characters are parsed, as YYYYMMDD; anything after
    them is ignored.
    
    Input: in_date (str)
    Output: datetime obj, or '' when the string is shorter than eight
            characters or its first eight characters are not a valid date
    """
    # a bare 'YYYYMMDD' string (exactly eight characters) is a complete date
    if len(in_date) < 8:
        return ''
    try:
        return datetime.strptime(in_date[:8], '%Y%m%d')
    except ValueError:
        # same sentinel as for too-short input
        return ''
    
def fixIssueDate(df):
    """
    Eliminate impossible dates.

    Keeps only the rows whose issue_d is a datetime strictly after
    2006-12-31 and strictly before the current time. Values that are not
    datetimes (such as the '' that cleanDate returns for unparsable input)
    are dropped instead of raising on comparison.
    
    Input: df (dataframe)
    Output: dataframe
    """
    earliest = datetime(2006, 12, 31)
    # evaluate "now" once so every row is judged against the same instant
    latest = datetime.now()

    def _is_plausible(value):
        return isinstance(value, datetime) and earliest < value < latest

    # astype(bool) keeps the mask boolean even for an empty frame
    keep = df['issue_d'].apply(_is_plausible).astype(bool)
    return df[keep]
    
def fixStatus(df):
    """
    Trim leading and trailing whitespace from every value in 'status'.

    The column is overwritten on the dataframe that was passed in.
    
    Input: df (dataframe)
    Output: dataframe
    """
    def _trim(text):
        return text.strip()

    trimmed = df['status'].apply(_trim)
    df['status'] = trimmed
    return df

def dropNanRowsColSpecific(df, col_to_drop):
    """
    Remove every row that has a null in at least one of the given columns.

    The result of df.dropna is returned; the input frame is not modified.
    
    Input: df (dataframe), col_to_drop (list of column names to check)
    Output: df (dataframe)
    """
    without_nulls = df.dropna(how='any', subset=col_to_drop)
    return without_nulls

def fixGrade(df):
    """
    Upper-case the 'grade' column and keep only rows whose grade is one of
    the letters in the module-level `grades` string.

    The upper-cased column is written back onto the dataframe passed in.

    Input: df (dataframe)
    Output: dataframe
    """
    def _to_upper(letter):
        return letter.upper()

    df['grade'] = df['grade'].apply(_to_upper)
    allowed_letters = list(grades)
    has_valid_grade = df['grade'].isin(allowed_letters)
    return df[has_valid_grade]

def fixTerm(df):
    """
    Drop the month part of the term and cast as int.

    String terms keep their first whitespace-separated token; values that
    are already numeric (for example the 0 that cleanData fills in for a
    missing term) are cast directly instead of failing on .split().
    The column is overwritten on the dataframe that was passed in.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    def _months(term):
        if hasattr(term, 'split'):
            return int(term.split()[0])
        return int(term)

    df['term'] = df['term'].apply(_months)
    return df

def fixFundedToApplied(df):
    """
    Remove records whose funded amounts are implausible.

    A row is kept only when funded_amnt is below twice loan_amnt and
    funded_amnt is at least funded_amnt_inv.
    NOTE(review): the factor of 2 is looser than "funded <= applied";
    presumably a deliberate tolerance -- confirm.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    under_twice_applied = df['funded_amnt'] < 2 * df['loan_amnt']
    covers_investor_share = df['funded_amnt'] >= df['funded_amnt_inv']
    return df[under_twice_applied & covers_investor_share]

def fixInitListStatus(df):
    """
    Upper-case 'initial_list_status' and keep only the rows where it is
    'F' or 'W'.

    The upper-cased column is written back onto the dataframe passed in.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    def _to_upper(code):
        return code.upper()

    df['initial_list_status'] = df['initial_list_status'].apply(_to_upper)
    allowed_codes = ['F', 'W']
    is_allowed = df['initial_list_status'].isin(allowed_codes)
    return df[is_allowed]

def fixEmpLength(value):
    """
    Clean employment length, should only contain a numeric integer value
    i.e. 10+ years transforms to 10
         < 1 transforms to 1
         3 years transforms to 3
         
    Input: value of cell (String)
    Output: years of employment (Int)
    Raises: ValueError when no integer can be read from the value
    """
    value = value.strip()
    if value.startswith('< 1'):
        return 1
    # prefix test instead of indexing value[2], which raised IndexError
    # for the bare string '10'
    if value.startswith('10+'):
        return 10

    first_token = value.split(' ')[0]
    if first_token.isdigit():
        return int(first_token)

    return int(value)

def fixSubGrade(df):
    """
    Upper-case 'sub_grade' and keep only rows whose value is a grade letter
    from the module-level `grades` followed by a digit 1-5
    (A1, A2, ..., G4, G5).

    The upper-cased column is written back onto the dataframe passed in.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    # every letter/number combination, letters varying slowest
    valid_sub_grades = [letter + str(number)
                        for letter in grades
                        for number in range(1, 6)]

    def _to_upper(code):
        return code.upper()

    df['sub_grade'] = df['sub_grade'].apply(_to_upper)
    is_valid = df['sub_grade'].isin(valid_sub_grades)
    return df[is_valid]
    
def fixIsIncV(df):
    """
    Lower-case 'is_inc_v' and keep only rows holding one of the three
    enumerated statuses: verified, source verified, not verified.

    The lower-cased column is written back onto the dataframe passed in.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    def _to_lower(text):
        return text.lower()

    df['is_inc_v'] = df['is_inc_v'].apply(_to_lower)
    known_statuses = ["verified", "source verified", "not verified"]
    is_known = df['is_inc_v'].isin(known_statuses)
    return df[is_known]

def fixPaymentPlan(df):
    """
    Lower-case 'pymnt_plan' and keep only the rows where it is 'n' or 'y'.

    The lower-cased column is written back onto the dataframe passed in.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    def _to_lower(flag):
        return flag.lower()

    df['pymnt_plan'] = df['pymnt_plan'].apply(_to_lower)
    yes_no = ['n', 'y']
    is_yes_no = df['pymnt_plan'].isin(yes_no)
    return df[is_yes_no]

def fixPurpose(df):
    """
    Lower-case 'purpose' and keep only rows whose purpose is one of the
    enumerated values listed in the function body.

    The lower-cased column is written back onto the dataframe passed in.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    def _to_lower(text):
        return text.lower()

    df['purpose'] = df['purpose'].apply(_to_lower)
    known_purposes = [
        "car", "credit_card", "other", "house", "debt_consolidation",
        "home_improvement", "small_business", "medical", "vacation",
        "moving", "wedding", "major_purchase",
    ]
    is_known = df['purpose'].isin(known_purposes)
    return df[is_known]

def fixBinary01(df, col_name):
    """
    Keep only the rows where the given column is 0 or 1.

    The column is first cast to float (left as-is when the cast fails) and
    written back onto the dataframe passed in.
    
    Input: df (dataframe), col_name (name of the column to check)
    Output: df (dataframe)
    """
    as_float = df[col_name].astype('float', errors='ignore')
    df[col_name] = as_float
    binary_values = range(2)
    is_binary = df[col_name].isin(binary_values)
    return df[is_binary]

def toInt(i):
    """
    Convert input to integer.

    The input is stripped of surrounding whitespace and parsed with int().
    Anything that cannot go through that path is handed back untouched.
    
    Input: i (anything)
    Output: i (int), if error return original
    """
    try: 
        return int(i.strip())
    except (AttributeError, TypeError, ValueError):
        # AttributeError: i has no .strip() (not a string)
        # ValueError: the stripped text is not an integer literal
        return i
    
def fixDeliquency(df):
    """
    Add a 0/1 'deliquency' column and drop the columns it supersedes.

    A row gets 1 when delinq_2yrs is greater than 0, or when
    mths_since_last_delinq is greater than 0 and at most 24; otherwise 0.
    Missing values compare as False, so they never set the flag by
    themselves. Afterwards 'delinq_2yrs' and 'last_delinq_none' are dropped.
    The dataframe passed in is modified in place and returned.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    months_since = df['mths_since_last_delinq']
    had_recent_count = df['delinq_2yrs'] > 0
    within_two_years = (months_since <= 24) & (months_since > 0)
    # column-wise comparison instead of a row-wise apply: same flags, and it
    # also works when the frame has no rows
    df['deliquency'] = (had_recent_count | within_two_years).astype(int)
    df.drop(['delinq_2yrs', 'last_delinq_none'], axis=1, inplace=True)
    return df

# build the cleaned frame and report its (rows, columns) with a timestamp
df = cleanData()
# format() call instead of a print statement: same output line, and valid
# syntax on both Python 2 and Python 3
print('{} {}'.format(df.shape, datetime.now()))

(597, 68) 2018-02-07 15:27:54.883994


In [130]:
# merged (uncleaned) yearly + member data for the year range 2007 to 2009
full_df = getBootcampData(db_creds, 2007, 2009)

In [131]:
# (rows, columns) of the merged frame
full_df.shape

(8277, 68)

In [474]:
# show the last rows of df
# NOTE(review): the 'Unnamed: 0' column in the output below suggests this df
# was re-read from a csv that had been saved with its index -- confirm
df.tail()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,total_acc,mths_since_last_major_derog
598,191006,191003,7500,7500,1167,36,10.59,244.09,C,C2,...,12.1,2.0,1996-12-01 00:00:00,3.0,19.0,0.0,9.0,0.0,11.0,
599,191186,191169,14400,14400,725,36,11.22,472.94,C,C4,...,15.0,0.0,2002-08-01 00:00:00,4.0,0.0,0.0,6.0,0.0,17.0,
600,192193,191767,6000,6000,1250,36,13.12,202.52,D,D5,...,6.95,0.0,1998-05-01 00:00:00,0.0,47.0,0.0,3.0,0.0,5.0,
601,192239,191813,20000,20000000,225,36,11.22,656.86,C,C4,...,14.04,0.0,2003-01-01 00:00:00,3.0,0.0,0.0,8.0,0.0,14.0,
602,200073,192537,700,700,675,36,17.09,24.99,G,G1,...,28.42,0.0,2004-09-01 00:00:00,3.0,0.0,0.0,14.0,0.0,25.0,


In [493]:
# number of rows flagged by fixDeliquency (the column holds 0/1)
sum(df['deliquency'])

105

In [508]:
# sanity check of the lower-bound comparison used in fixIssueDate
datetime(2006,12,31) <datetime.now()

True

In [None]:
# write the cleaned frame to disk
# NOTE(review): to_csv also writes the row index as an unnamed first column
# by default; pass index=False if that column is not wanted -- confirm
df.to_csv('2007_clean.csv')