In [52]:
import pandas as pd
# Show up to 99 columns when a DataFrame is displayed, so wide frames are not elided.
pd.set_option('display.max_columns', 99)

# Explore the dataset

In [53]:
# Read only the first three rows: enough to inspect the column names and
# a few sample values without loading the whole file.
df = pd.read_csv(filepath_or_buffer='loans_2007.csv', nrows=3)
df

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,last_credit_pull_d,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,,10+ years,RENT,24000.0,Verified,Dec-2011,Fully Paid,n,credit_card,Computer,860xx,AZ,27.65,0.0,Jan-1985,1.0,3.0,0.0,13648.0,83.7%,9.0,f,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,Jan-2015,171.62,Jun-2016,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,Dec-2011,Charged Off,n,car,bike,309xx,GA,1.0,0.0,Apr-1999,5.0,3.0,0.0,1687.0,9.4%,4.0,f,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,Apr-2013,119.66,Sep-2013,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
2,1077175,1313524.0,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,,10+ years,RENT,12252.0,Not Verified,Dec-2011,Fully Paid,n,small_business,real estate business,606xx,IL,8.72,0.0,Nov-2001,2.0,2.0,0.0,2956.0,98.5%,10.0,f,0.0,0.0,3005.666844,3005.67,2400.0,605.67,0.0,0.0,0.0,Jun-2014,649.91,Jun-2016,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0


# Objectives
- For each chunk:   
    - How many columns have a numeric type? 
    - How many columns have a string type?
    - How many unique values are there in each string column? 
    - How many of the string columns contain values that are less than 50% unique?
    - Which float columns have no missing values and could be candidates for conversion to the integer type?
- Calculate the total memory usage across all of the chunks

In [54]:
# Profile the CSV in chunks of 3000 rows: for every chunk report column-type
# counts, string-column cardinality, and float columns with no missing values,
# while accumulating the total in-memory size across all chunks.
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)

total_memory = 0.0
for chunk in chunk_iter:
    # Memory usage in MB; deep=True also measures the contents of object columns.
    memory = chunk.memory_usage(deep=True).sum()/(1024*1024)
    total_memory += memory

    # How many columns have a numeric type?
    ntype_col_count = chunk.select_dtypes(include='number').columns.size
    print('Num. of numeric type columns:', ntype_col_count, '\n')

    # Select the string (object) columns once and reuse them for the next
    # three questions instead of re-running select_dtypes each time.
    string_type_cols = chunk.select_dtypes(include='object')

    # How many columns have a string type?
    stype_col_count = string_type_cols.columns.size
    print('Num. of string type columns:', stype_col_count, '\n')

    # How many unique values are there in each string column?
    stype_unq_counts = string_type_cols.nunique()
    print('Unique values in each string type column:\n')
    print(stype_unq_counts)

    # How many of the string columns contain values that are less than 50% unique?
    # The ratio is taken against the non-null count, so missing values are ignored.
    print('\nString type columns containing values that are less than 50% unique:\n')
    # .items() replaces .iteritems(), which was removed in pandas 2.0.
    for colname, colvalues in string_type_cols.items():
        if colvalues.nunique() < colvalues.count()/2:
            print(colname)

    # Which float columns have no missing values?
    print('\nFloating type columns with no missing values:\n')
    col_na_count = chunk.select_dtypes(include='floating').isna().sum()
    # Boolean-mask indexing actually drops the columns that have missing values.
    # Series.where() would keep every label (only masking the values with NaN),
    # so iterating its index would list every float column.
    for col in col_na_count[col_na_count == 0].index:
        print(col)

    print('\n' + '='*50 + '\n')


print('Total memory usage (MB) =', total_memory)


Num. of numeric type columns: 31 

Num. of string type columns: 21 

Unique values in each string type column:

term                      2
int_rate                 36
grade                     7
sub_grade                35
emp_title              2653
emp_length               11
home_ownership            3
verification_status       3
issue_d                   2
loan_status               6
pymnt_plan                1
purpose                  13
title                  1406
zip_code                568
addr_state               43
earliest_cr_line        366
revol_util              884
initial_list_status       1
last_pymnt_d             54
last_credit_pull_d       55
application_type          1
dtype: int64

String type columns containing values that are less than 50% unique:

term
int_rate
grade
sub_grade
emp_length
home_ownership
verification_status
issue_d
loan_status
pymnt_plan
purpose
title
zip_code
addr_state
earliest_cr_line
revol_util
initial_list_status
last_pymnt_d
last_credit_pull_d

Num. of numeric type columns: 31 

Num. of string type columns: 21 

Unique values in each string type column:

term                      2
int_rate                 45
grade                     7
sub_grade                35
emp_title              2600
emp_length               11
home_ownership            3
verification_status       3
issue_d                   4
loan_status               4
pymnt_plan                1
purpose                  13
title                  1958
zip_code                551
addr_state               43
earliest_cr_line        385
revol_util              948
initial_list_status       1
last_pymnt_d             64
last_credit_pull_d       65
application_type          1
dtype: int64

String type columns containing values that are less than 50% unique:

term
int_rate
grade
sub_grade
emp_length
home_ownership
verification_status
issue_d
loan_status
pymnt_plan
purpose
zip_code
addr_state
earliest_cr_line
revol_util
initial_list_status
last_pymnt_d
last_credit_pull_d
application_type

Num. of numeric type columns: 31 

Num. of string type columns: 21 

Unique values in each string type column:

term                      1
int_rate                107
grade                     7
sub_grade                35
emp_title              2598
emp_length               11
home_ownership            4
verification_status       3
issue_d                  17
loan_status               2
pymnt_plan                1
purpose                  14
title                  2297
zip_code                574
addr_state               47
earliest_cr_line        366
revol_util              929
initial_list_status       1
last_pymnt_d             59
last_credit_pull_d       97
application_type          1
dtype: int64

String type columns containing values that are less than 50% unique:

term
int_rate
grade
sub_grade
emp_length
home_ownership
verification_status
issue_d
loan_status
pymnt_plan
purpose
zip_code
addr_state
earliest_cr_line
revol_util
initial_list_status
last_pymnt_d
last_credit_pull_d
application_type