In [52]:
import mysql.connector
import pandas as pd
import csv
from datetime import datetime
from uszipcode import ZipcodeSearchEngine
from pprint import pprint

# credentials for connecting to the MySQL db
import json
db_creds = json.load(open('hidden/creds.json'))

def getDBCursor(creds):
    """
    Get the cursor for the db to perform transaction.

    Input: creds (dictionary of login credentials)
    Output: MySQL Connection, MySQl cursor object
    """
    cnx = mysql.connector.connect(**creds)
    return cnx, cnx.cursor()

def queryDB(cursor, query):
    """
    Execute a sql query.

    Input: cursor (MySQL cursor object), query (string of sql query)
    Output: dictionary containing the lines of the query
    """
    cursor.execute(query)
    return cursor.fetchall()

def executeQuery(creds, query):
    """
    Executes a query by connecting, querying, and closes the db connection

    Input: creds (db login credentials), query (string of sql query)
    Output: dictionary of query results
    """
    result = None
    try:
        conn, cursor = getDBCursor(creds)
        result = queryDB(cursor, query)
    except Error as e:
        print(e)
    finally:
        conn.close()
        cursor.close()
    return result

def getDataFrame(filename):
    """
    Create a pandas dataframe from a file.
    
    Input: filename (csv file, str)
    Output: pandas dataframe
    """
    try:
        df = pd.read_csv(filename)
    except IOError:
        print 'Use a csv file.'
        
    return df
    
def cleanData(filename):
    """
    Create a pandas dataframe from a file. Clean up the dates.
    
    Input: csv filename (str)
    Output: pandas dataframe object
    """
    df = getDataFrame(filename)
    df = fixFundedToApplied(df)
    
    # fix dates
    for col in list(df.columns.values): 
        if col.endswith('_d'):
            try: # exception thrown when nan's occure
                df[col] = df[col].apply(lambda x: datetime.strptime(x[:8], '%Y%d%m'))
            except Exception as e:
                continue
                
    # drop unneccessary columns
    col_to_drop = ['id', 'url', 'desc', 'title', 'policy_code', 'grade_num']
    df.drop(col_to_drop, axis=1, inplace=True)
    
    # fill nan's with specified values
    nan_fill = {
        'term': 0
    }
    df.fillna(nan_fill, inplace=True)
    
    # fix term col
    df = fixTerm(df)
    return df

def dropNanRowsColSpecific(df, cols_to_drop):
    """
    Drop the rows from the df that have nulls in the specified columns.
    
    Input: df (dataframe), cols_to_drop (list of strings)
    Output: df (dataframe)
    """
    return df.dropna(subset=col_to_drop, how='any') # add inplace if needed

def fixTerm(df):
    """
    Drop the month part of the term and cast as int.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    df['term'] = df['term'].apply(lambda x: int(x.split()[0]))
    return df

def fixFundedToApplied(df):
    """
    If funded > applied amount remove the record.
    
    Input: df (dataframe)
    Output: df (dataframe)
    """
    # only keep the records where they were funded less that they applied for
    return df[df['funded_amnt'] <= df['loan_amnt']]

In [2]:
col_names = executeQuery(db_creds, "describe Bootcamp.Postal_Codes_Tbl")
data = executeQuery(db_creds, "select * from Bootcamp.Postal_Codes_Tbl limit 1")

In [3]:
x = []
for i in col_names:
    x.append(str(i[0]))
y = []
for i in col_names:
    y.append(str(i[1]))
z = []
for i in data[0]:
    z.append(str(i))

In [4]:
data

[(u'210',
  u'Portsmouth',
  u'New Hampshire',
  u'NH',
  u'Rockingham',
  u'43.0059',
  u'-71.0132')]

In [5]:
f = open('data_info.csv', 'w')

In [6]:
f.write(','.join(x))
f.write('\n')
f.write(','.join(y))
f.write('\n')
f.write(','.join(z))

In [7]:
f.close()

In [8]:
i = 0
row = None
with open('YYYY_Data_datatype.csv') as f:
    for line in f:
        if i == 3:
            row = line
            break
        i += 1
row = row.strip().split(',')

In [35]:
dtypes = ['double', 'int', 'date']
type_array = []
to_add = None
for i in range(len(row)):
    if i == 0: 
        continue
    to_add = row[i] if row[i] in dtypes else ''
    type_array.append(to_add)

In [54]:
df = pd.read_csv('2008_data.csv')

In [53]:
membs = {}
for x in list(df.columns.values):
    membs[x] = df[x].unique()

In [57]:
with open('2008_data_unique_vals.csv', 'w') as f:
    w = csv.writer(f)
    w.writerows(membs.items())

In [18]:
df = pd.read_csv('2008_data.csv')
for col in list(df.columns.values):
    if col.endswith('_d'):
        try: # exception thrown when nan's occure
            df[col] = df[col].apply(lambda x: datetime.strptime(x[:8], '%Y%d%m'))
        except Exception as e:
            print col

last_pymnt_d
next_pymnt_d


In [72]:
df.tail()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,sub_grade_num,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none
2388,369630,385659,2000,2000,1800,36 months,8.0,62.68,A,A3,...,0.6,1,1,1,1,1.50432,2011-01-12,1,1,1
2389,369673,385732,15000,15000,5522,36 months,12.84,504.27,C,C2,...,0.4,1,1,1,0,3.31575,2011-01-12,1,1,1
2390,369701,385797,10000,10000,3808,36 months,12.53,334.67,C,C1,...,0.2,1,1,1,0,3.52284,2011-01-12,1,1,1
2391,369713,385363,8600,8600,1932,36 months,11.26,282.63,B,B2,...,0.4,1,1,1,0,6.78312,2011-01-12,0,1,1
2392,369725,385844,15000,15000,6440,36 months,13.79,511.14,C,C5,...,1.0,1,1,1,0,1.90482,2011-01-12,0,1,1


In [98]:

search = ZipcodeSearchEngine()
zipcode = search.by_coordinate(43.0059,-71.0132, returns=1)

In [99]:
pprint(zipcode[0])

{"City": "Exeter", "Density": 431.7058096415328, "HouseOfUnits": 8668, "LandArea": 48.54, "Latitude": 42.9996568, "Longitude": -70.9784562, "NEBoundLatitude": 43.033860100000005, "NEBoundLongitude": -70.88238259999999, "Population": 20955, "SWBoundLatitude": 42.89838579999999, "SWBoungLongitude": -71.0894541, "State": "NH", "TotalWages": 571105228.0, "WaterArea": 0.61, "Wealthy": 27253.88823669769, "Zipcode": "03833", "ZipcodeType": "Standard"}


In [100]:
for i in range(len(zipcode)):
    print zipcode[i]['Zipcode']
    

03833


In [37]:
mi_df = getDataFrame('Postal_Codes_Tbl.csv')
mi_df.tail()

Unnamed: 0,Postal Code,Place Name,State,State Abbreviation,County,Latitude,Longitude
37711,84646,Moroni,Utah,UT,Sanpete,39.5108,-111.5603
37712,84647,Mount Pleasant,Utah,UT,Sanpete,39.5232,-111.5039
37713,84648,Nephi,Utah,UT,Juab,39.6923,-111.8359
37714,84649,Oak City,Utah,UT,Millard,39.3729,-112.3288
37715,84650,Oasis,Utah,UT,Milla,,


In [None]:
df['zipcode'] = mi_df.apply(lambda row: search.by_coordinate(mi_df['Latitude'], mi_df['Longitude'])[0]['Zipcode'], axis = 1)

In [16]:
len(df)

2393

In [36]:
mi_df.shape

(37715, 7)

In [20]:
col_to_drop = ['url']
df.drop(col_to_drop, axis=1, inplace=True)

In [35]:
col_to_drop = ['Latitude']
mi_df.dropna(subset=col_to_drop, how='any', inplace=True)

In [40]:
mi_df.fillna({'Latitude':0}, inplace=True)

In [41]:
mi_df.tail()

Unnamed: 0,Postal Code,Place Name,State,State Abbreviation,County,Latitude,Longitude
37711,84646,Moroni,Utah,UT,Sanpete,39.5108,-111.5603
37712,84647,Mount Pleasant,Utah,UT,Sanpete,39.5232,-111.5039
37713,84648,Nephi,Utah,UT,Juab,39.6923,-111.8359
37714,84649,Oak City,Utah,UT,Millard,39.3729,-112.3288
37715,84650,Oasis,Utah,UT,Milla,0.0,


In [48]:
mi_df.columns.values

array(['Postal Code', 'Place Name', 'State', 'State Abbreviation',
       'County', 'Latitude', 'Longitude'], dtype=object)

In [50]:
x = '36 months'
int(x.split()[0])

36

In [56]:
col_to_drop = ['id', 'url', 'desc', 'title', 'policy_code', 'grade_num']
df.drop(col_to_drop, axis=1, inplace=True)

ValueError: labels ['id' 'url' 'desc' 'title' 'policy_code' 'grade_num'] not contained in axis