In [339]:
import mysql.connector
import pandas as pd
import csv
from datetime import datetime
from pprint import pprint
from itertools import product
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.rc("font", size = 14)
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
import seaborn as sns
sns.set(style = "white")
sns.set(style = "whitegrid", color_codes = True)

# credentials for connecting to the MySQL db
import json
db_creds = json.load(open('hidden/creds.json'))

grades = 'ABCDEFG'
tmp_df = None

def getDBCursor(creds):
    """
    Get the cursor for the db to perform transaction.

    Input: creds (dictionary of login credentials)
    Output: MySQL Connection, MySQl cursor object
    """
    cnx = mysql.connector.connect(**creds)
    return cnx, cnx.cursor()

def queryDB(cursor, query):
    """
    Execute a sql query.

    Input: cursor (MySQL cursor object), query (string of sql query)
    Output: dictionary containing the lines of the query
    """
    cursor.execute(query)
    return cursor.fetchall()

def getAllYearsData(conn, start_year, end_year):
    """
    Get a dataframe with all the years data.
    
    Input: conn (db connection), start_year (int), end_year int)
    Output: df (dataframe)
    """
    df = pd.DataFrame()
    for year in range(start_year, end_year + 1):
        df = pd.concat([df, pd.read_sql('SELECT * FROM Bootcamp.{}_Data'.format(str(year)), con=conn)])
    return df

def getMemberInfo(conn):
    """
    Get the membership information table in a dataframe
    
    Input: conn (db connection)
    Output: dataframe
    """
    return pd.read_sql('SELECT * FROM Bootcamp.Member_Information', con=conn)
    
def getBootcampData(creds, start_year, end_year):
    """
    Input: creds (db credentials)
    Output: dataframe with the combined member_id and year info
    """
    cnx, curs = getDBCursor(creds)
    year_df = getAllYearsData(cnx, start_year, end_year)
    mem_df = getMemberInfo(cnx)
    cnx.close()
    curs.close()
    # drop rows without a loan amount (ex. only a subset of the years is requested.)
    return pd.merge(year_df, mem_df, on='member_id', how='outer') 

def executeQuery(creds, query):
    """
    Executes a query by connecting, querying, and closes the db connection

    Input: creds (db login credentials), query (string of sql query)
    Output: dictionary of query results
    """
    result = None
    try:
        conn, cursor = getDBCursor(creds)
        result = queryDB(cursor, query)
    except Error as e:
        print(e)
    finally:
        conn.close()
        cursor.close()
    return result

def getDataFrame(filename):
    """
    Create a pandas dataframe from a file.
    
    Input: filename (csv file, str)
    Output: pandas dataframe
    """
    try:
        df = pd.read_csv(filename)
    except IOError:
        print 'Use a csv file.'
        
    return df
    
def cleanData():
    """
    Create a pandas dataframe from a file. Clean up the dates.
    
    Input: csv filename (str)
    Output: pandas dataframe object
    """
    df = getBootcampData(db_creds, 2007, 2009).apply(pd.to_numeric, errors='ignore')

    # delete rows with nulls in these columns
    df = dropNanRowsColSpecific(df, ['loan_amnt', 'open_acc', 'pub_rec', 'total_acc', 'inq_last_6mths'])
    
    # clean the rows that have incorrect amounts
    df = fixFundedToApplied(df)
    
    # fix dates
    for col in list(df.columns.values): 
        if col.endswith('_d') or col in ['earliest_cr_line']:
            df[col] = df[col].apply(lambda x: cleanDate(x.strip()))
                
    # drop unneccessary columns
    col_to_drop = ['id', 'loan_status', 'url', 'desc', 'title', 'revol_bal', 'revol_util', 
                   'policy_code', 'grade_num', 'sub_grade_num', 'mths_since_last_record',
                  'collections_12_mths_zero', 'payment_inc_ratio', 'emp_title', 'emp_length',
                  'mths_since_last_major_derog', 'mths_since_last_record', 'delinq_2yrs_zero']
    df.drop(col_to_drop, axis=1, inplace=True)
    
#     # fix employment length # removed bc update in spreadsheet
#     df['emp_length'] = df['emp_length'].apply(lambda x: fixEmpLength(x))
    
    # fill nan's with specified values
    nan_fill = {
        'term': 0
    }
    df.fillna(nan_fill, inplace=True)
    
    df = fixTerm(df)
    df = fixInitListStatus(df)
    df = fixGrade(df)
    df = fixIsIncV(df)
#     df = fixBinary01(df, 'inactive_loans')
#     df = fixBinary01(df, 'bad_loans')
# #     df = fixBinary01(df, 'delinq_2yrs_zero')
#     df = fixBinary01(df, 'pub_rec_zero')
    
    return df

def cleanDate(in_date):
    """
    Turn str into date.
    
    Input: in_date (str)
    Output: datetime obj 
    """
    if len(in_date) > 8:
        return datetime.strptime(in_date[:8], '%Y%m%d')
    else:
        return ''
    
def dropNanRowsColSpecific(df, col_to_drop):
    """
    Drop the rows from the df that have nulls in the specified columns.
    
    Input: df (dataframe), cols_to_drop (list of strings)
    Output: df (dataframe)
    """
    return df.dropna(subset=col_to_drop, how='any') # add inplace if needed

def fixGrade(df):
    """
    Remove grades outside of range a-g
    Input: df (dataframe)
    Output: dataframe
    """
    df['grade'] = df['grade'].apply(lambda x: x.upper())
    return df[df['grade'].isin(list(grades))]

def fixTerm(df):
    """
    Drop the month part of the term and cast as int.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    df['term'] = df['term'].apply(lambda x: int(x.split()[0]))
    return df

def fixFundedToApplied(df):
    """
    If funded > applied amount remove the record.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    # only keep the records where they were funded less that they applied for
    df = df[df['funded_amnt'] < 2* df['loan_amnt']]
    return df[df['funded_amnt'] <= df['funded_amnt_inv']]

def fixInitListStatus(df):
    """
    Remove rows that aren't F or W
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    df['initial_list_status'] = df['initial_list_status'].apply(lambda x: x.upper())
    return df[df['initial_list_status'].isin(['F', 'W'])]

def fixEmpLength(value):
    """
    Clean employment length, should only contain a numeric integer value
    i.e. 10+ years transforms to 10
         < 1 transforms to 1
         
    Input: value of cell (String)
    Output: years of employment (Int)
    """
    value = value.strip()
    if value.startswith('< 1'):
        return int(1)
    if value.startswith('10') and value[2] == '+':
        return int(10)
    
    splitVal = value.split(' ')
    if (splitVal[0].isdigit()):
        value = splitVal[0]
        
    return int(value)

def fixSubGrade(df):
    """
    Remove rows that don't have a subgrade A1,A2,A3,A4,A5,....,G1...G4,G5
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    sub_grades = [''.join([x[0], str(x[1])]) for x in product(grades, range(1,6))]
    df['sub_grade'] = df['sub_grade'].apply(lambda x: x.upper()) # make everything uppercase
    return df[df['sub_grade'].isin(sub_grade)]
    
def fixIsIncV(df):
    """
    Remove if row doesn't have enumerated status.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    return df[df['is_inc_v'].isin(["Verified", "Source Verified", "Not Verified"])]

def fixPaymentPlan(df):
    """
    Only leave boolean (val 'n', 'y').
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    df['pymnt_plan'] = df['pymnt_plan'].apply(lambda x: x.lower())
    return df[df['pymnt_plan'].isin(['n', 'y'])]

def fixPurpose(df):
    """
    Only leave enumerated values. 
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    purposes = ["car", "credit_card", "other", "house", "debt_consolidation",
                "home_improvement", "small_business", "medical", "vacation",
                "moving", "wedding", "major_purchase"]
    return df[df['purpose'].isin(purposes)]

def fixBinary01(df, col_name):
    """
    Only leave boolean (val 0, 1).
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    df[col_name] = df[col_name].astype(int, errors='ignore')
    return df[df[col_name].isin(range(2))]

In [2]:
col_names = executeQuery(db_creds, "describe Bootcamp.Postal_Codes_Tbl")
data = executeQuery(db_creds, "select * from Bootcamp.Postal_Codes_Tbl limit 1")

In [3]:
x = []
for i in col_names:
    x.append(str(i[0]))
y = []
for i in col_names:
    y.append(str(i[1]))
z = []
for i in data[0]:
    z.append(str(i))

In [4]:
data

[(u'210',
  u'Portsmouth',
  u'New Hampshire',
  u'NH',
  u'Rockingham',
  u'43.0059',
  u'-71.0132')]

In [5]:
f = open('data_info.csv', 'w')

In [6]:
f.write(','.join(x))
f.write('\n')
f.write(','.join(y))
f.write('\n')
f.write(','.join(z))

In [7]:
f.close()

In [8]:
i = 0
row = None
with open('YYYY_Data_datatype.csv') as f:
    for line in f:
        if i == 3:
            row = line
            break
        i += 1
row = row.strip().split(',')

In [35]:
dtypes = ['double', 'int', 'date']
type_array = []
to_add = None
for i in range(len(row)):
    if i == 0: 
        continue
    to_add = row[i] if row[i] in dtypes else ''
    type_array.append(to_add)

In [66]:
df2 = pd.read_csv('2008_data.csv')

In [53]:
membs = {}
for x in list(df.columns.values):
    membs[x] = df[x].unique()

In [57]:
with open('2008_data_unique_vals.csv', 'w') as f:
    w = csv.writer(f)
    w.writerows(membs.items())

In [18]:
df = pd.read_csv('2008_data.csv')
for col in list(df.columns.values):
    if col.endswith('_d'):
        try: # exception thrown when nan's occure
            df[col] = df[col].apply(lambda x: datetime.strptime(x[:8], '%Y%d%m'))
        except Exception as e:
            print col

last_pymnt_d
next_pymnt_d


In [72]:
df.tail()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,sub_grade_num,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none
2388,369630,385659,2000,2000,1800,36 months,8.0,62.68,A,A3,...,0.6,1,1,1,1,1.50432,2011-01-12,1,1,1
2389,369673,385732,15000,15000,5522,36 months,12.84,504.27,C,C2,...,0.4,1,1,1,0,3.31575,2011-01-12,1,1,1
2390,369701,385797,10000,10000,3808,36 months,12.53,334.67,C,C1,...,0.2,1,1,1,0,3.52284,2011-01-12,1,1,1
2391,369713,385363,8600,8600,1932,36 months,11.26,282.63,B,B2,...,0.4,1,1,1,0,6.78312,2011-01-12,0,1,1
2392,369725,385844,15000,15000,6440,36 months,13.79,511.14,C,C5,...,1.0,1,1,1,0,1.90482,2011-01-12,0,1,1


In [130]:
full_df = getBootcampData(db_creds, 2007, 2009)

In [131]:
full_df.shape

(8277, 68)

In [35]:
col_to_drop = ['Latitude']
mi_df.dropna(subset=col_to_drop, how='any', inplace=True)

In [40]:
mi_df.fillna({'Latitude':0}, inplace=True)

In [89]:
df = getBootcampData(db_creds)

In [340]:
df = cleanData()

In [311]:
df.head()

Unnamed: 0,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,is_inc_v,...,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,open_acc,pub_rec,total_acc


In [342]:
df['emp_length']

KeyError: 'emp_length'

In [341]:
df.shape

(1232, 51)

In [164]:
tmp_df.shape

(3130, 61)

In [159]:
tdf = fixPaymentPlan(df)

In [160]:
tdf.shape

(3130, 61)

In [165]:
tmp_df['inactive_loans'].unique()

array([u'1'], dtype=object)

In [234]:
tmp_df['inactive_loans'] = tmp_df['inactive_loans'].astype('float')
tmp_df['inactive_loans'] = tmp_df['inactive_loans'].astype('int')
x = tmp_df[tmp_df['inactive_loans'].isin(range(2))]

In [235]:
int(tmp_df['inactive_loans'][2])

1

In [182]:
def fixDeliquency(df):
    df['delinq_2yrs'] = df['delinq_2yrs'].astype(int, errors='ignore')
    df['mths_since_last_delinq'] = df['mths_since_last_delinq'].astype(int, errors='ignore')
    df['deliquency'] = df.apply(lambda r: r['delinq_2yrs'] and r['mths_since_last_delinq'], axis = 1)
    return df

In [279]:
# x = tdf['delinq_2yrs'].apply(lambda x: bool(x or 1))
tmp_df['new'] = pd.to_numeric(tmp_df['delinq_2yrs'], downcast='integer',errors= 'coerce')
#x = x.astype('float', errors='ignore')

In [316]:
full_df.shape

(8277, 68)

In [335]:
tdf = full_df
x = tdf.apply(pd.to_numeric, errors='ignore')

In [336]:
x.dtypes

id                             float64
member_id                        int64
loan_amnt                        int64
funded_amnt                      int64
funded_amnt_inv                  int64
term                            object
int_rate                       float64
installment                    float64
grade                           object
sub_grade                       object
is_inc_v                        object
issue_d                         object
loan_status                     object
pymnt_plan                      object
url                             object
desc                            object
purpose                         object
title                           object
revol_bal                        int64
revol_util                     float64
initial_list_status             object
out_prncp                      float64
out_prncp_inv                  float64
total_pymnt                    float64
total_pymnt_inv                float64
total_rec_prncp          

In [322]:
tmp_df['new'] = (pd.notnull(tmp_df.iloc[53])).astype(float)

AttributeError: 'NoneType' object has no attribute 'iloc'

In [323]:
tmp_df['delinq_2yrs']

TypeError: 'NoneType' object has no attribute '__getitem__'

In [318]:
full_df.columns.get_loc('delinq_2yrs')

59

In [297]:
type(df.iloc[:,53])

pandas.core.series.Series

In [300]:
df['new']

2      NaN
12     NaN
17     NaN
20     NaN
21     NaN
22     NaN
27     NaN
28     NaN
30     NaN
31     NaN
33     NaN
36     NaN
37     NaN
39     NaN
41     NaN
42     NaN
45     NaN
48     NaN
50     NaN
54     NaN
55     NaN
57     NaN
58     NaN
59     NaN
60     NaN
63     NaN
64     NaN
65     NaN
70     NaN
71     NaN
        ..
8168   NaN
8172   NaN
8176   NaN
8178   NaN
8194   NaN
8199   NaN
8208   NaN
8212   NaN
8213   NaN
8216   NaN
8217   NaN
8218   NaN
8219   NaN
8221   NaN
8223   NaN
8224   NaN
8228   NaN
8229   NaN
8230   NaN
8231   NaN
8238   NaN
8239   NaN
8243   NaN
8251   NaN
8260   NaN
8262   NaN
8264   NaN
8267   NaN
8268   NaN
8276   NaN
Name: new, Length: 3130, dtype: float64

In [None]:
##JEN's attempt at modeling

import pandas as pd
import numpy as np
%matplotlib inline
# matplotlib inline plots the things in the consol
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image
from __future__ import print_function

## UDPATE: connect to dataset
data = pd.read_csv('Shoe_data.csv')

## 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
regressor = LinearRegression(fit_intercept=True)

## UDPATE: convert fields numpy array values
type(data['shoe_size'].values)

## UPDATE: convert x to input fields, convert y to binary response variable
x = data['region',''].values[:, np.newaxis] 
y = data['status_response'].values

## 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=40) #test size is the percentage of data you want in the testing dataset
