In [411]:
import mysql.connector
import pandas as pd
import csv
from datetime import datetime
from pprint import pprint
from itertools import product

# credentials for connecting to the MySQL db
import json
# Load the MySQL login credentials. The context manager closes the file
# handle as soon as the JSON has been parsed instead of leaving it open.
with open('hidden/creds.json') as creds_file:
    db_creds = json.load(creds_file)

# Letters accepted as a loan grade; also the prefixes of valid sub-grades.
grades = 'ABCDEFG'
# Placeholder; not referenced by any function in the visible part of the file.
tmp_df = None

def getDBCursor(creds):
    """
    Open a MySQL connection and create a cursor on it.

    Input: creds (dictionary of keyword arguments for mysql.connector.connect)
    Output: tuple of (MySQL connection, MySQL cursor object)
    """
    connection = mysql.connector.connect(**creds)
    cursor = connection.cursor()
    return connection, cursor

def queryDB(cursor, query):
    """
    Run a sql query on the cursor and collect every row it produced.

    Input: cursor (MySQL cursor object), query (string of sql query)
    Output: whatever cursor.fetchall() returns for the executed query
    """
    cursor.execute(query)
    rows = cursor.fetchall()
    return rows

def getAllYearsData(conn, start_year, end_year):
    """
    Get a dataframe with all the years data.

    One table named Bootcamp.<year>_Data is read for every year in the
    inclusive range, and the per-year frames are stacked in year order.
    The row index of each yearly frame is kept as-is, so index labels may
    repeat across years.

    Input: conn (db connection), start_year (int), end_year (int, inclusive)
    Output: df (dataframe); an empty dataframe when start_year > end_year
    """
    # Read every year first and concatenate once; concatenating inside the
    # loop re-copies all previously loaded rows on each iteration.
    yearly_frames = [
        pd.read_sql('SELECT * FROM Bootcamp.{}_Data'.format(year), con=conn)
        for year in range(start_year, end_year + 1)
    ]
    if not yearly_frames:
        # pd.concat refuses an empty list, so keep the empty-frame result explicit.
        return pd.DataFrame()
    return pd.concat(yearly_frames)

def getMemberInfo(conn):
    """
    Read the whole Bootcamp.Member_Information table into a dataframe.

    Input: conn (db connection)
    Output: dataframe with every row and column of the table
    """
    member_query = 'SELECT * FROM Bootcamp.Member_Information'
    members = pd.read_sql(member_query, con=conn)
    return members
    
def getBootcampData(creds, start_year, end_year):
    """
    Load the yearly loan tables and join them with the member information.

    Input: creds (db credentials), start_year (int), end_year (int, inclusive)
    Output: dataframe with the combined member_id and year info
    """
    cnx, curs = getDBCursor(creds)
    try:
        year_df = getAllYearsData(cnx, start_year, end_year)
        mem_df = getMemberInfo(cnx)
    finally:
        # Always release the db resources, even when a read fails, and close
        # the cursor before the connection it belongs to.
        curs.close()
        cnx.close()
    # The inner join keeps only member_ids present in both frames, so members
    # without a loan row in the requested years (e.g. when only a subset of
    # the years is requested) are dropped.
    return pd.merge(year_df, mem_df, on='member_id', how='inner')

def executeQuery(creds, query):
    """
    Executes a query by connecting, querying, and closing the db connection.

    A MySQL error is printed rather than raised; in that case None is
    returned.

    Input: creds (db login credentials), query (string of sql query)
    Output: the rows returned by queryDB, or None if a MySQL error occurred
    """
    result = None
    # Pre-bind both handles so the cleanup below is safe when connecting fails.
    conn = None
    cursor = None
    try:
        conn, cursor = getDBCursor(creds)
        result = queryDB(cursor, query)
    except mysql.connector.Error as e:
        # The bare name `Error` was never imported; the connector's base
        # exception class is what this handler is meant to catch.
        print(e)
    finally:
        # Close the cursor before the connection it belongs to.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
    return result

def getDataFrame(filename):
    """
    Create a pandas dataframe from a file.

    Input: filename (csv file, str)
    Output: pandas dataframe
    Raises: IOError if the file cannot be read (after printing a hint)
    """
    try:
        return pd.read_csv(filename)
    except IOError:
        # print() with a single argument behaves the same on Python 2 and 3.
        print('Use a csv file.')
        # Re-raise the real error: there is no dataframe to return, and
        # falling through used to fail with an unrelated UnboundLocalError.
        raise
    
def cleanData():
    """
    Load the 2007-2009 bootcamp data from the database and clean it.

    The steps run in this order: numeric conversion of the columns, removal
    of rows with nulls in key columns, the funded-amount filter, date parsing,
    removal of unused columns, NaN filling for 'term', and finally the
    per-column fix* helpers (several of which drop rows).

    Input: none (uses the module-level db_creds and a hard-coded year range)
    Output: pandas dataframe object
    """
    # pd.to_numeric with errors='ignore' is applied column by column.
    # Original author's note: joining is probably wrong
    df = getBootcampData(db_creds, 2007, 2009).apply(pd.to_numeric, errors='ignore') # joining is probably wrong

    # delete rows with nulls in these columns
    df = dropNanRowsColSpecific(df, ['loan_amnt', 'open_acc', 'pub_rec', 'total_acc', 'inq_last_6mths'])
    
    # clean the rows that have incorrect amounts
    df = fixFundedToApplied(df)
    
    # fix dates: every column whose name ends in '_d', plus 'earliest_cr_line'
    # NOTE(review): x.strip() assumes every value in these columns is a string - confirm
    for col in list(df.columns.values): 
        if col.endswith('_d') or col in ['earliest_cr_line']:
            df[col] = df[col].apply(lambda x: cleanDate(x.strip()))
                
    # drop unnecessary columns ('mths_since_last_record' is listed twice)
    col_to_drop = ['id', 'loan_status', 'url', 'desc', 'title', 'revol_bal', 'revol_util', 
                   'policy_code', 'grade_num', 'sub_grade_num', 'mths_since_last_record',
                  'collections_12_mths_zero', 'payment_inc_ratio', 'emp_title', 'emp_length',
                  'mths_since_last_major_derog', 'mths_since_last_record', 'delinq_2yrs_zero']
    df.drop(col_to_drop, axis=1, inplace=True)
    
#     # fix employment length # removed bc update in spreadsheet
#     df['emp_length'] = df['emp_length'].apply(lambda x: fixEmpLength(x))
    
    # fill nan's with specified values
    # NaN terms become the int 0 here, before fixTerm parses the column.
    # NOTE(review): confirm fixTerm accepts non-string values.
    nan_fill = {
        'term': 0
    }
    df.fillna(nan_fill, inplace=True)
    
    # per-column clean-up; each helper returns the (possibly filtered) frame
    df = fixTerm(df)
#     df = fixInitListStatus(df)
    df = fixGrade(df)
    df = fixIsIncV(df)
    df = fixDeliquency(df)
    
    # keep only rows whose flag columns hold 0 or 1
    df = fixBinary01(df, 'inactive_loans')
    df = fixBinary01(df, 'bad_loans')
#     df = fixBinary01(df, 'delinq_2yrs_zero')
    df = fixBinary01(df, 'pub_rec_zero')
    
    return df

def cleanDate(in_date):
    """
    Parse the leading YYYYMMDD part of a string into a datetime.

    Only strings longer than 8 characters are parsed; anything of length
    8 or less yields an empty string.

    Input: in_date (str)
    Output: datetime obj, or '' when the string is too short
    """
    if len(in_date) <= 8:
        return ''
    date_part = in_date[:8]
    return datetime.strptime(date_part, '%Y%m%d')
    
def dropNanRowsColSpecific(df, col_to_drop):
    """
    Return a copy of the df without the rows that have a null in any of
    the given columns. The passed dataframe itself is not modified.

    Input: df (dataframe), col_to_drop (list of column names to check)
    Output: df (dataframe)
    """
    # a row is removed as soon as one of the checked columns is null
    without_nulls = df.dropna(how='any', subset=col_to_drop)
    return without_nulls

def fixGrade(df):
    """
    Upper-case the grade column and remove grades outside of range a-g.

    The 'grade' column of the passed dataframe is overwritten with its
    upper-cased values. Missing or non-string grades become NaN under the
    .str accessor, so they are filtered out instead of raising an
    AttributeError from calling .upper() on them.

    Input: df (dataframe with a string 'grade' column)
    Output: dataframe holding only the rows whose grade is in `grades`
    """
    df['grade'] = df['grade'].str.upper()
    return df[df['grade'].isin(list(grades))]

def fixTerm(df):
    """
    Drop the month part of the term and cast as int.

    String terms such as ' 36 months' are reduced to their leading number.
    Values that are already numeric (for example a 0 used to fill a missing
    term) are simply cast to int instead of failing on .split().
    The 'term' column of the passed dataframe is overwritten.

    Input: df (dataframe)
    Output: df (dataframe)
    """
    def _months(value):
        # Duck-typed so str and unicode both work; non-strings have no split().
        try:
            value = value.split()[0]
        except AttributeError:
            pass
        return int(value)

    df['term'] = df['term'].apply(_months)
    return df

def fixFundedToApplied(df):
    """
    Remove records whose funded amount is inconsistent with the other amounts.

    A row is kept only when both conditions hold:
      * funded_amnt is less than twice loan_amnt
      * funded_amnt is strictly greater than funded_amnt_inv

    Input: df (dataframe)
    Output: df (dataframe)
    """
    # first filter: funded amount must stay below double the applied amount
    below_double = df['funded_amnt'] < df['loan_amnt'] * 2
    plausible = df[below_double]
    # second filter: funded amount must exceed the investor-funded amount
    above_investor = plausible['funded_amnt'] > plausible['funded_amnt_inv']
    return plausible[above_investor]

def fixInitListStatus(df):
    """
    Upper-case initial_list_status and remove rows that aren't F or W.

    The 'initial_list_status' column of the passed dataframe is overwritten
    with its upper-cased values. Missing or non-string values become NaN
    under the .str accessor and are filtered out rather than raising an
    AttributeError.

    Input: df (dataframe)
    Output: df (dataframe)
    """
    df['initial_list_status'] = df['initial_list_status'].str.upper()
    return df[df['initial_list_status'].isin(['F', 'W'])]

def fixEmpLength(value):
    """
    Clean employment length, should only contain a numeric integer value
    i.e. 10+ years transforms to 10
         < 1 transforms to 1

    Input: value of cell (String)
    Output: years of employment (Int)
    Raises: ValueError if no integer can be read from the value
    """
    value = value.strip()
    if value.startswith('< 1'):
        return 1
    # A single prefix test; indexing value[2] failed on the bare string '10'.
    if value.startswith('10+'):
        return 10

    # Otherwise use the first space-separated token when it is a plain number.
    leading = value.split(' ')[0]
    if leading.isdigit():
        value = leading

    return int(value)

def fixSubGrade(df):
    """
    Remove rows that don't have a subgrade A1,A2,A3,A4,A5,....,G1...G4,G5

    The 'sub_grade' column of the passed dataframe is overwritten with its
    upper-cased values. Missing or non-string values become NaN under the
    .str accessor and are filtered out.

    Input: df (dataframe)
    Output: df (dataframe)
    """
    # every grade letter combined with the digits 1 to 5
    sub_grades = [letter + str(number) for letter, number in product(grades, range(1, 6))]
    df['sub_grade'] = df['sub_grade'].str.upper() # make everything uppercase
    # the filter previously referenced the undefined name `sub_grade`
    return df[df['sub_grade'].isin(sub_grades)]
    
def fixIsIncV(df):
    """
    Keep only the rows whose is_inc_v value is one of the enumerated
    verification statuses.

    Input: df (dataframe)
    Output: df (dataframe)
    """
    allowed_statuses = ["Verified", "Source Verified", "Not Verified"]
    has_known_status = df['is_inc_v'].isin(allowed_statuses)
    return df[has_known_status]

def fixPaymentPlan(df):
    """
    Lower-case pymnt_plan and only leave boolean rows (val 'n', 'y').

    The 'pymnt_plan' column of the passed dataframe is overwritten with its
    lower-cased values. Missing or non-string values become NaN under the
    .str accessor and are filtered out rather than raising an
    AttributeError.

    Input: df (dataframe)
    Output: df (dataframe)
    """
    df['pymnt_plan'] = df['pymnt_plan'].str.lower()
    return df[df['pymnt_plan'].isin(['n', 'y'])]

def fixPurpose(df):
    """
    Keep only the rows whose purpose is one of the enumerated values.

    Input: df (dataframe)
    Output: df (dataframe)
    """
    known_purposes = [
        "car",
        "credit_card",
        "other",
        "house",
        "debt_consolidation",
        "home_improvement",
        "small_business",
        "medical",
        "vacation",
        "moving",
        "wedding",
        "major_purchase",
    ]
    is_known = df['purpose'].isin(known_purposes)
    return df[is_known]

def fixBinary01(df, col_name):
    """
    Only leave the rows where the given column holds a boolean (val 0, 1).

    The column of the passed dataframe is first replaced by its int cast;
    a failing cast is ignored and leaves the values as they were.

    Input: df (dataframe), col_name (name of the column to check)
    Output: df (dataframe)
    """
    as_int = df[col_name].astype(int, errors='ignore')
    df[col_name] = as_int
    is_zero_or_one = df[col_name].isin(range(2))
    return df[is_zero_or_one]

def fixDeliquency(df):
    """
    Add a 0/1 'deliquency' column and drop the two columns it is built from.

    The flag is 1 when delinq_2yrs + mths_since_last_delinq is greater
    than 1, else 0. A NaN in either column makes the sum NaN, which compares
    as not greater than 1, so the flag is 0 for such rows.
    The passed dataframe is modified in place and also returned.

    Input: df (dataframe)
    Output: df (dataframe)
    """
    # Column arithmetic gives the same flag as a row-by-row apply without
    # building a Series per row, and is well-defined for an empty frame.
    combined = df['delinq_2yrs'] + df['mths_since_last_delinq']
    df['deliquency'] = (combined > 1).astype(int)
    df.drop(['delinq_2yrs', 'mths_since_last_delinq'], axis=1, inplace=True)
    return df

In [130]:
# Merged 2007-2009 frame as returned by getBootcampData, before cleanData runs.
full_df = getBootcampData(db_creds, 2007, 2009)

In [131]:
full_df.shape

(8277, 68)

In [None]:
# Frame produced by the cleanData() pipeline.
df = cleanData()

In [405]:
df.shape

(7012, 51)