## Final Project of Big Data

## Import Libraries

In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext
sc = SparkContext('local', 'pyspark') 
sqlContext = SQLContext(sc)

## Load Data Set

In [114]:
loadPath = "../dataset/lending/loan.csv"
dataload = sqlContext.read.csv(loadPath, header='true')
dataload.createOrReplaceTempView("loancredit")
#dataload.cache()

## Show Schema

In [3]:
dataload.printSchema()

root
 |-- id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amnt: string (nullable = true)
 |-- funded_amnt: string (nullable = true)
 |-- funded_amnt_inv: string (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: string (nullable = true)
 |-- installment: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- issue_d: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- pymnt_plan: string (nullable = true)
 |-- url: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- title: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- dti: string 

In [11]:
dataload.count()

887379

# Clean data

## Check for duplications

In [8]:
print('Count of rows: {0}'.format(dataload.count()))
print('Count of distinct rows: {0}'.format(dataload.distinct().count()))

Count of rows: 887379
Count of distinct rows: 887379


The amount of rows is same, so not exist duplicate data

## Missing observations

In [115]:
datafilterloan = sqlContext.sql("select loan_status,loan_amnt,funded_amnt,term,grade,sub_grade,home_ownership,verification_status,purpose,title,emp_length,int_rate,annual_inc,dti,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int from loancredit")
#datafilterloan_1=sqlContext.sql("select loan_status,loan_amnt as lo_mnt,funded_amnt as fun_mnt,term,grade,sub_grade as s_grad,home_ownership as hom_own,verification_status as verfi_sta,purpose from loancredit")
#sqlContext.sql("select loan_status,loan_amnt as lo_mnt,funded_amnt as fun_mnt,term,grade,sub_grade as s_grad,home_ownership as hom_own,verification_status as verfi_sta,purpose from loancredit").show(5)
#datafilterloan_2=sqlContext.sql("select loan_status,title,emp_length,int_rate,annual_inc,dti,inq_last_6mths,open_acc,pub_rec,revol_bal from loancredit")
#sqlContext.sql("select loan_status,title,emp_length,int_rate,annual_inc,dti,inq_last_6mths,open_acc,pub_rec,revol_bal from loancredit").show(5)
#datafilterloan_3=sqlContext.sql("select loan_status,revol_util,total_acc,total_pymnt,total_pymnt_inv,total_rec_prncp as TotRecPy,total_rec_int as TotRecInt,last_pymnt_amnt,last_pymnt_d from loancredit")
#sqlContext.sql("select loan_status,revol_util,total_acc,total_pymnt,total_pymnt_inv,total_rec_prncp as TotRecPy,total_rec_int as TotRecInt,last_pymnt_amnt,last_pymnt_d from loancredit").show(5)
datafilterloan.show(5)

+-----------+---------+-----------+----------+-----+---------+--------------+-------------------+--------------+--------------------+----------+--------+----------+-----+--------------+--------+-------+---------+----------+---------+-------------+---------------+---------------+-------------+
|loan_status|loan_amnt|funded_amnt|      term|grade|sub_grade|home_ownership|verification_status|       purpose|               title|emp_length|int_rate|annual_inc|  dti|inq_last_6mths|open_acc|pub_rec|revol_bal|revol_util|total_acc|  total_pymnt|total_pymnt_inv|total_rec_prncp|total_rec_int|
+-----------+---------+-----------+----------+-----+---------+--------------+-------------------+--------------+--------------------+----------+--------+----------+-----+--------------+--------+-------+---------+----------+---------+-------------+---------------+---------------+-------------+
| Fully Paid|   5000.0|     5000.0| 36 months|    B|       B2|          RENT|           Verified|   credit_card|      

### Discretization: Convert the categorical variable in numeric variable.
### Conversion of the following variable:
* loan_status
* term
* grade
* sub_grade
* home_ownership
* verification_status
* purpose
* emp_length

### Encoding function of the following variable (Loan Status, term, grade, sub_grade, home_ownership, verification_status, purpose, title)

In [139]:
def loan_status_transformer(cat):
    if(cat is None):
        return 0
    else:
        if(cat == 'Fully Paid'): 
            return 1
        elif(cat == 'Charged Off'): 
            return 2
        elif(cat == 'Current'): 
            return 3
        elif(cat == 'Default'): 
            return 4
        elif(cat == 'In Grace Period'): 
            return 5
        elif(cat == 'Late (31-120 days)'): 
            return 6
        else:
            return 0
def term_transformer(cat):
    if(cat is None):
        return 0
    else:
        if(cat.strip() == '60 months'): 
            return 2
        elif(cat.strip() == '36 months'): 
            return 1
        else:
            return 0
def grade_transformer(cat):
    if(cat is None):
        return 0
    else:
        alfa = "ABCDEFG"
        for ind,x in enumerate(alfa):
            if(x==cat):
                return ind+1  
        return 0
def subgrade_transformer(cat):
    if(cat is None):
        return 0
    else:
        alfa = "ABCDEFG"
        numer = "12345"
        count=1
        for indx,x in enumerate(alfa):
            for indy,y in enumerate(numer):
                if(x+y == cat.strip()):
                    return count
                count+=1
        return count
def emplen_transformer(cat):
    output=0
    if(cat == "n/a" or cat is None):
        output=0
    if(cat == "< 1 year"):
        output=1
    for item in range(10):
        if((str(item)+" years") == cat.strip()):
            output=item
    if(cat =='10+ years'):
        output=10
    return output
def hoow_transformer(cat):
    if(cat is None):
            return 0
    else:
        if(cat.strip() == 'MORTAGE'): 
            return 1
        elif(cat.strip() == 'OWN'): 
            return 2
        elif(cat.strip() == 'RENT'): 
            return 3
        else:
            return 0
def verst_transformer(cat):
    if(cat is None):
        return 0
    else:
        if(cat.strip() == 'Not Verified'): 
            return 1
        elif(cat.strip() == 'Verified'): 
            return 2
        elif(cat.strip() == 'Source Verified'): 
            return 3
        else:
            return 0
def purp_transformer(cat):
    if(cat is None):
        return 0
    else:
        if(cat.strip() == 'car'): 
            return 13
        elif(cat.strip() == 'credit_card'): 
            return 1
        elif(cat.strip() == 'debt_consolidation'): 
            return 2
        elif(cat.strip() == 'home_improvement'): 
            return 3
        elif(cat.strip() == 'house'): 
            return 4
        elif(cat.strip() == 'major_purchase'): 
            return 5
        elif(cat.strip() == 'medical'): 
            return 6
        elif(cat.strip() == 'moving'): 
            return 7
        elif(cat.strip() == 'other'): 
            return 8
        elif(cat.strip() == 'renewable_energi'): 
            return 9
        elif(cat.strip() == 'small_business'): 
            return 10
        elif(cat.strip() == 'vacation'): 
            return 11
        elif(cat.strip() == 'wedding'): 
            return 12
        else:
            return 0
print(emplen_transformer('10+ years'))

10


In [6]:
import pyspark.ml.feature as ft
import pyspark.sql.types as typ
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import *

### Convert from categorical to numerical features through UDF

In [140]:
# loan_status
udfloan_status_transformer = udf(loan_status_transformer, IntegerType())
datafilterloan = datafilterloan.withColumn("loan_status_int", udfloan_status_transformer("loan_status"))
# term
udfterm_transformer = udf(term_transformer, IntegerType())
datafilterloan = datafilterloan.withColumn("term_int", udfterm_transformer("term"))
# grade
udfgrade_transformer = udf(grade_transformer, IntegerType())
datafilterloan = datafilterloan.withColumn("grade_int", udfgrade_transformer("grade"))
# sub_grade
udfsubgrade_transformer = udf(subgrade_transformer, IntegerType())
datafilterloan = datafilterloan.withColumn("su_grad_int", udfsubgrade_transformer("sub_grade"))
# home_ownership
udfhoow_transformer = udf(hoow_transformer, IntegerType())
datafilterloan = datafilterloan.withColumn("homeowner_int", udfhoow_transformer("home_ownership"))
# verification_status
udfverst_transformer = udf(verst_transformer, IntegerType())
datafilterloan = datafilterloan.withColumn("veri_sta_int", udfverst_transformer("verification_status"))
# purpose
udfpurp_transformer = udf(purp_transformer, IntegerType())
datafilterloan = datafilterloan.withColumn("purpose_int", udfpurp_transformer("purpose"))
# purpose
udfemplen_transformer = udf(emplen_transformer, IntegerType())
datafilterloan = datafilterloan.withColumn("emp_length_int", udfemplen_transformer("emp_length"))

### Test of Discretizaties values

In [135]:
datafilterloan.select("loan_status_int","term_int","grade_int","su_grad_int","homeowner_int","veri_sta_int","purpose_int","emp_length_int").show(10)

+---------------+--------+---------+-----------+-------------+------------+-----------+--------------+
|loan_status_int|term_int|grade_int|su_grad_int|homeowner_int|veri_sta_int|purpose_int|emp_length_int|
+---------------+--------+---------+-----------+-------------+------------+-----------+--------------+
|              1|       1|        1|          6|            3|           2|          1|            10|
|              2|       2|        2|         13|            3|           3|         13|             1|
|              1|       1|        2|         14|            3|           1|         10|            10|
|              1|       1|        2|         10|            3|           3|          8|            10|
|              3|       2|        1|          9|            3|           3|          8|             0|
|              1|       1|        0|          3|            3|           3|         12|             3|
|              3|       2|        2|         14|            3|           

## Stratification: Put values in range.
   ### Strat the following variable:

* loan_amnt
* funded_amnt
* annual_inc
* revol_bal

### Stratification function of the following variable (loan_amnt, funded_amnt, annual_inc, revol_bal)

In [136]:
import numbers
def loan_amnt_strat(recei):    
    try:
        if(recei is None):
            return 0
        else:
            val_str=float(recei)
            val_comp=0.00
            for itera in range(7):
                if(val_str >= val_comp and val_str < (val_comp+5001.00)): 
                    return itera+1
                val_comp+=5000.00
            return 0
    except ValueError: 
        return 0
def funded_amnt_strat(recei):
    try:
        if(recei is None):
            return 0
        else:
            val_str=float(recei)
            val_comp=0.00
            for itera in range(7):
                if(val_str >= val_comp and val_str < (val_comp+5001.00)): 
                    return itera+1
                val_comp+=5000.00
            return 0
    except ValueError: 
        return 0
def annual_inc_strat(recei):
    try:
        if(recei is None):
            return 0
        else:
            val_str=float(recei)
            val_comp=0.00
            for itera in range(105):
                if(val_str >= val_comp and val_str < (val_comp+5001.00)): 
                    return itera+1
                val_comp+=5000.00
            return 0
    except ValueError: 
        return 0
def revol_bal_strat(receiy):
    try:
        if(receiy is None):
            return 0
        else:
            val_str=float(receiy)
            val_comp=0.00
            for itera in range(66):
                if(val_str >= val_comp and val_str < (val_comp+2001.00)): 
                    return itera+1
                val_comp+=2000.00
            return 0
    except ValueError: 
        return 0
def revol_util_strat(receiy):
    try:
        if(receiy is None):
            return 0
        else:
            val_str=float(receiy)
            val_comp=0.00
            for itera in range(66):
                if(val_str >= val_comp and val_str < (val_comp+1001.00)): 
                    return itera+1
                val_comp+=1000.00
            return 0
    except ValueError: 
        return 0
def total_pymnt_strat(receiy):
    try:
        if(receiy is None):
            return 0
        else:
            val_str=float(receiy)
            val_comp=0.00
            for itera in range(30):
                if(val_str >= val_comp and val_str < (val_comp+2001.00)): 
                    return itera+1
                val_comp+=2000.00
            return 0
    except ValueError: 
        return 0
def total_pymnt_inv_strat(receiy):
    try:
        if(receiy is None):
            return 0
        else:
            val_str=float(receiy)
            val_comp=0.00
            for itera in range(30):
                if(val_str >= val_comp and val_str < (val_comp+2001.00)): 
                    return itera+1
                val_comp+=2000.00
            return 0
    except ValueError: 
        return 0
def total_rec_prncp_strat(receiy):
    try:
        if(receiy is None):
            return 0
        else:
            val_str=float(receiy)
            val_comp=0.00
            for itera in range(14):
                if(val_str >= val_comp and val_str < (val_comp+2501.00)): 
                    return itera+1
                val_comp+=2500.00
            return 0
    except ValueError: 
        return 0
def total_rec_int_strat(receiy):
    try:
        if(receiy is None):
            return 0
        else:
            val_str=float(receiy)
            val_comp=0.00
            for itera in range(14):
                if(val_str >= val_comp and val_str < (val_comp+2501.00)): 
                    return itera+1
                val_comp+=2500.00
            return 0
    except ValueError: 
        return 0
def int_rate_strat(receiy):
    try:
        if(receiy is None):
            return 0
        else:
            val_str=float(receiy)
            val_comp=0.00
            for itera in range(16):
                if(val_str >= val_comp and val_str < (val_comp+1.00)): 
                    return itera+1
                val_comp+=1.00
            return 0
    except ValueError: 
        return 0
def dti_strat(receiy):
    try:
        if(receiy is None):
            return 0
        else:
            val_str=float(receiy)
            val_comp=0.00
            for itera in range(20):
                if(val_str >= val_comp and val_str < (val_comp+5.00)): 
                    return itera+1
                val_comp+=5.00
            return 0
    except ValueError: 
        return 0
def inq_last_6mths_strat(receiy):
    try:
        if(receiy is None):
            return 0
        else:
            val_str=float(receiy)
            val_comp=0.00
            for itera in range(17):
                if(val_str >= val_comp and val_str < (val_comp+2.00)): 
                    return itera+1
                val_comp+=2.00
            return 0
    except ValueError: 
        return 0
def pub_rec_strat(receiy):
    try:
        if(receiy is None):
            return 0
        else:
            val_str=float(receiy)
            val_comp=0.00
            for itera in range(25):
                if(val_str >= val_comp and val_str < (val_comp+4.00)): 
                    return itera+1
                val_comp+=4.00
            return 0
    except ValueError: 
        return 0
def total_acc_strat(receiy):
    try:
        if(receiy is None):
            return 0
        else:
            val_str=float(receiy)
            val_comp=0.00
            for itera in range(40):
                if(val_str >= val_comp and val_str < (val_comp+500.00)): 
                    return itera+1
                val_comp+=500.00
            return 0
    except ValueError: 
        return 0
print(total_acc_strat('2134.5'))

5


### Stratific Big numerical Features

In [137]:
# loan_amnt
udfloan_amnt_strat = udf(loan_amnt_strat, IntegerType())
datafilterloan = datafilterloan.withColumn("loan_amnt_strat", udfloan_amnt_strat("loan_amnt"))
# funded_amnt
udffunded_amnt_strat = udf(funded_amnt_strat, IntegerType())
datafilterloan = datafilterloan.withColumn("funded_amnt_strat", udffunded_amnt_strat("funded_amnt"))
# annual_inc
udfannual_inc_strat = udf(annual_inc_strat, IntegerType())
datafilterloan = datafilterloan.withColumn("annual_inc_strat", udfannual_inc_strat("annual_inc"))
# revol_bal
udfrevol_bal_strat = udf(revol_bal_strat, IntegerType())
datafilterloan = datafilterloan.withColumn("revol_bal_strat", udfrevol_bal_strat("revol_bal"))
# revol_util
udfrevol_util_strat = udf(revol_util_strat, IntegerType())
datafilterloan = datafilterloan.withColumn("revol_util_strat", udfrevol_util_strat("revol_util"))
# total_pymnt 60000, cada 5000
udftotal_pymnt_strat = udf(total_pymnt_strat, IntegerType())
datafilterloan = datafilterloan.withColumn("total_pymnt_strat", udftotal_pymnt_strat("total_pymnt"))
# total_pymnt_inv 60000, cada 5000
udftotal_pymnt_inv_strat = udf(total_pymnt_inv_strat, IntegerType())
datafilterloan = datafilterloan.withColumn("total_pymnt_inv_strat", udftotal_pymnt_inv_strat("total_pymnt_inv"))
# total_rec_prncp 35000, cada 2500
udftotal_rec_prncp_strat = udf(total_rec_prncp_strat, IntegerType())
datafilterloan = datafilterloan.withColumn("total_rec_prncp_strat", udftotal_rec_prncp_strat("total_rec_prncp"))
# total_rec_int 35000, cada 2500
udftotal_rec_int_strat = udf(total_rec_int_strat, IntegerType())
datafilterloan = datafilterloan.withColumn("total_rec_int_strat", udftotal_rec_int_strat("total_rec_int"))
# int_rate 16, cada 1
udfint_rate_strat = udf(int_rate_strat, IntegerType())
datafilterloan = datafilterloan.withColumn("int_rate_strat", udfint_rate_strat("int_rate"))
# dti 20, cada 5
udfdti_strat = udf(dti_strat, IntegerType())
datafilterloan = datafilterloan.withColumn("dti_strat", udfdti_strat("dti"))
# inq_last_6mths 34, cada 2
udfinq_last_6mths_strat = udf(inq_last_6mths_strat, IntegerType())
datafilterloan = datafilterloan.withColumn("inq_last_6mths_strat", udfinq_last_6mths_strat("inq_last_6mths"))
# pub_rec 100, cada 4
udfpub_rec_strat = udf(pub_rec_strat, IntegerType())
datafilterloan = datafilterloan.withColumn("pub_rec_strat", udfpub_rec_strat("pub_rec"))
# total_acc 40, cada 500
udftotal_acc_strat = udf(total_acc_strat, IntegerType())
datafilterloan = datafilterloan.withColumn("total_acc_strat", udftotal_acc_strat("total_acc"))

### Test of Stratificaties values

In [124]:
datafilterloan.select("loan_amnt_strat","funded_amnt_strat","annual_inc_strat","revol_bal_strat","revol_util_strat","total_pymnt_strat","total_pymnt_inv_strat","total_rec_prncp_strat","total_rec_int_strat","int_rate_strat","dti_strat","inq_last_6mths_strat","pub_rec_strat","total_acc_strat").show(5)

+---------------+-----------------+----------------+---------------+----------------+-----------------+---------------------+---------------------+-------------------+--------------+---------+--------------------+-------------+---------------+
|loan_amnt_strat|funded_amnt_strat|annual_inc_strat|revol_bal_strat|revol_util_strat|total_pymnt_strat|total_pymnt_inv_strat|total_rec_prncp_strat|total_rec_int_strat|int_rate_strat|dti_strat|inq_last_6mths_strat|pub_rec_strat|total_acc_strat|
+---------------+-----------------+----------------+---------------+----------------+-----------------+---------------------+---------------------+-------------------+--------------+---------+--------------------+-------------+---------------+
|              1|                1|               5|              7|               1|                3|                    3|                    2|                  1|            11|        6|                   1|            1|              1|
|              1|       

### Clean Data Loan

In [141]:
cleandataloan=datafilterloan.select("loan_status_int","term_int","grade_int","su_grad_int","homeowner_int","veri_sta_int","purpose_int","emp_length_int","loan_amnt_strat","funded_amnt_strat","annual_inc_strat","revol_bal_strat","revol_util_strat","total_pymnt_strat","total_pymnt_inv_strat","total_rec_prncp_strat","total_rec_int_strat","int_rate_strat","dti_strat","inq_last_6mths_strat","pub_rec_strat","total_acc_strat")
cleandataloan.show(10)

+---------------+--------+---------+-----------+-------------+------------+-----------+--------------+---------------+-----------------+----------------+---------------+----------------+-----------------+---------------------+---------------------+-------------------+--------------+---------+--------------------+-------------+---------------+
|loan_status_int|term_int|grade_int|su_grad_int|homeowner_int|veri_sta_int|purpose_int|emp_length_int|loan_amnt_strat|funded_amnt_strat|annual_inc_strat|revol_bal_strat|revol_util_strat|total_pymnt_strat|total_pymnt_inv_strat|total_rec_prncp_strat|total_rec_int_strat|int_rate_strat|dti_strat|inq_last_6mths_strat|pub_rec_strat|total_acc_strat|
+---------------+--------+---------+-----------+-------------+------------+-----------+--------------+---------------+-----------------+----------------+---------------+----------------+-----------------+---------------------+---------------------+-------------------+--------------+---------+-----------------

# Analyze data

## Descriptive Statistics

## Here data set, I am going to split in Xtrain, Y train(loan_status)
### Y train:
* loan_status: 
### Xtrain:
* loan_amnt: The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.
* funded_amnt: The total amount funded by investors for that loan at that point in time.
* term: The Number of payments on the loan. Values are in months and can be either 36 or 60.
* grade: LC assigned loan grade
* sub_grade: LC assigned loan subgrade
* home_ownership: The home ownership status provided by the borrower during registration or obtained from the credit report. Our values are: RENT, OWN, MORTGAGE, OTHER
* verification_status: The status of the loan during the listing period. Values: APPROVED, NOT_APPROVED.
* purpose: A category provided by the borrower for the loan request. Values are: debt_consolidation, medical, home_improvement, renewable_energy, small_business, wedding, vacation, moving, house, car, major_purchase, credit_card, other
* emp_length: Employment length in months. Possible values are whole numbers from 0 and higher. Null indicates not employed.
* int_rate: Interest Rate on the loan
* annual_inc: The annual income provided by the borrower during registration.
* dti: The borrower's debt to income ratio, calculated using the monthly payments on the total debt obligations, excluding mortgage, divided by self-reported monthly income.
* inq_last_6mths: information from the bureau last 6 months
* pub_rec: it indicates risk and hurts your ability to qualify for credit, obligations that were not paid 
* revol_bal: sometimes a full payment doesn’t (or can’t) happen, The amount of credit you have versus the amount of credit you use is a factor in your credit score
* revol_util: the amount of credit the borrower is using relative to all available revolving credit.
* total_acc: The total Number of credit lines currently in the borrower's credit file
* total_pymnt: The total payment currently
* total_pymnt_inv: The total payment currently of investment
* total_rec_prncp: 
* total_rec_int:  interest of recollect total.

## Descriptive Statistic

### Test Balance of Class Loan Status

In [130]:
cleandataloan.groupby('loan_status_int').count().show()

+---------------+------+
|loan_status_int| count|
+---------------+------+
|              1|207723|
|              6| 11591|
|              3|601778|
|              5|  6253|
|              4|  1219|
|              2| 45248|
|              0| 13567|
+---------------+------+



Here I can determine that data is concentrate in two labels in loan status:
* 1: Fully Paid
* 3: Current
* 2: Charged Off

In [146]:
#columns=["loan_status_int","term_int","grade_int","su_grad_int","homeowner_int","veri_sta_int","purpose_int","emp_length_int","loan_amnt_strat","funded_amnt_strat","annual_inc_strat","revol_bal_strat","revol_util_strat","total_pymnt_strat","total_pymnt_inv_strat","total_rec_prncp_strat","total_rec_int_strat","int_rate_strat","dti_strat","inq_last_6mths_strat","pub_rec_strat","total_acc_strat"]
columns1=["loan_status_int","term_int","grade_int","su_grad_int"]
columns2=["homeowner_int","veri_sta_int","purpose_int","emp_length_int"]
cleandataloan.describe(columns1).show()
cleandataloan.describe(columns2).show()

+-------+------------------+------------------+------------------+------------------+
|summary|   loan_status_int|          term_int|         grade_int|       su_grad_int|
+-------+------------------+------------------+------------------+------------------+
|  count|            887379|            887379|            887379|            887379|
|   mean|2.4896239374607694| 1.300045414642447| 2.798402937189183|11.959941580767632|
| stddev| 1.005768502459213|0.4582776456114865|1.3125993531916076| 6.493018957756487|
|    min|                 0|                 1|                 1|                 1|
|    max|                 6|                 2|                 7|                35|
+-------+------------------+------------------+------------------+------------------+

+-------+------------------+------------------+------------------+------------------+
|summary|     homeowner_int|      veri_sta_int|       purpose_int|    emp_length_int|
+-------+------------------+------------------+------

Here, We can to test that not exist a stddev very high, so we can work with this data

### Correlations between features

 Calculate pairwise correlations of:

In [147]:
print(cleandataloan.corr('loan_status_int','purpose_int'))

-0.07012200137237125


#### Correlations Matrix

In [149]:
#"loan_status_int","term_int","grade_int","su_grad_int",
#"homeowner_int","veri_sta_int","purpose_int","emp_length_int","loan_amnt_strat",
#"funded_amnt_strat","annual_inc_strat","revol_bal_strat","revol_util_strat",
#"total_pymnt_strat","total_pymnt_inv_strat","total_rec_prncp_strat",
#"total_rec_int_strat","int_rate_strat","dti_strat","inq_last_6mths_strat",
#"pub_rec_strat","total_acc_strat"]
numerical=['loan_status_int','purpose_int','homeowner_int','int_rate_strat','annual_inc_strat']
n_numerical = len(numerical)
corr = []
for i in range(0, n_numerical):
    temp = [None] * i
    for j in range(i, n_numerical):
        temp.append(round(cleandataloan.corr(numerical[i], numerical[j]),2))
        corr.append(temp)
print(corr)

[[1.0, -0.07, 0.0, -0.02, 0.01], [1.0, -0.07, 0.0, -0.02, 0.01], [1.0, -0.07, 0.0, -0.02, 0.01], [1.0, -0.07, 0.0, -0.02, 0.01], [1.0, -0.07, 0.0, -0.02, 0.01], [None, 1.0, 0.05, -0.07, -0.01], [None, 1.0, 0.05, -0.07, -0.01], [None, 1.0, 0.05, -0.07, -0.01], [None, 1.0, 0.05, -0.07, -0.01], [None, None, 1.0, -0.0, -0.22], [None, None, 1.0, -0.0, -0.22], [None, None, 1.0, -0.0, -0.22], [None, None, None, 1.0, -0.01], [None, None, None, 1.0, -0.01], [None, None, None, None, 1.0]]


### Visualization

In [162]:
#import bokeh.charts as chrt
from bokeh.plotting import figure, show
numerical=['loan_status_int','int_rate_strat']
data_multi = dict([(elem, cleandataloan.select(elem).rdd.flatMap(lambda row: row).collect()) for elem in numerical])
p = figure(title="Loan Status X Interest Rate")
p.scatter(data_multi['int_rate_strat'], data_multi['loan_status_int'], marker='x',line_color="#6666ee", fill_color="#ee6666", fill_alpha=0.5, size=12)
show(p)