### Linear Regression and Correlation

In [1]:
# Linear regression models the dependent variable as a linear function of the independent variable (y = mx + b); the two are directly proportional only when b = 0

In [3]:
import pandas as pd

# Location of the raw loans CSV; kept in one named place so it is easy to swap.
LOANS_CSV_URL = 'https://github.com/Thinkful-Ed/curric-data-001-data-sets/raw/master/loans/loansData.csv'
loansData = pd.read_csv(LOANS_CSV_URL)

# Preview the first 5 raw 'Interest.Rate' values (still percent strings here).
loansData['Interest.Rate'].head(5)

81174     8.90%
99592    12.12%
80059    21.98%
15825     9.99%
33182    11.71%
Name: Interest.Rate, dtype: object

### Replacing Raw Data with Cleaned Data in the DataFrame

In [21]:
# Convert 'Interest.Rate' from percent strings (e.g. '8.90%') to decimal
# fractions (e.g. 0.0890). The column is overwritten in place, so the
# conversion only runs while the column is still non-numeric: re-running this
# cell after cleaning is then a no-op instead of a TypeError on float values.
if not pd.api.types.is_numeric_dtype(loansData['Interest.Rate']):
    loansData['Interest.Rate'] = [
        # rstrip('%') drops only a trailing percent sign, never a digit
        float(rate.rstrip('%')) / 100 for rate in loansData['Interest.Rate']
    ]

In [24]:
# Convert 'Loan.Length' from text containing the word 'months' to an integer
# number of months. The column is overwritten in place, so the conversion only
# runs while the column is still non-numeric: re-running this cell after
# cleaning is then a no-op instead of an AttributeError on int values.
if not pd.api.types.is_numeric_dtype(loansData['Loan.Length']):
    loansData['Loan.Length'] = [
        # int() tolerates the whitespace left behind once 'months' is removed
        int(length.replace('months', '')) for length in loansData['Loan.Length']
    ]

In [6]:
# Derive a numeric 'FICO.Score' column from the lower bound of each
# 'FICO.Range' string (e.g. '735-739' -> 735).
loansData['FICO.Score'] = loansData['FICO.Range'].map(
    lambda fico_range: int(fico_range.partition('-')[0])
)

In [25]:
loansData # updated DataFrame: 'Interest.Rate' and 'Loan.Length' are now numeric (cleaned in place), and 'FICO.Score' is a newly added column

Unnamed: 0,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length,FICO.Score
81174,20000,20000.00,0.0890,36,debt_consolidation,14.90%,SC,MORTGAGE,6541.67,735-739,14.0,14272.0,2.0,< 1 year,735
99592,19200,19200.00,0.1212,36,debt_consolidation,28.36%,TX,MORTGAGE,4583.33,715-719,12.0,11140.0,1.0,2 years,715
80059,35000,35000.00,0.2198,60,debt_consolidation,23.81%,CA,MORTGAGE,11500.00,690-694,14.0,21977.0,1.0,2 years,690
15825,10000,9975.00,0.0999,36,debt_consolidation,14.30%,KS,MORTGAGE,3833.33,695-699,10.0,9346.0,0.0,5 years,695
33182,12000,12000.00,0.1171,36,credit_card,18.78%,NJ,RENT,3195.00,695-699,11.0,14469.0,0.0,9 years,695
62403,6000,6000.00,0.1531,36,other,20.05%,CT,OWN,4891.67,670-674,17.0,10391.0,2.0,3 years,670
48808,10000,10000.00,0.0790,36,debt_consolidation,26.09%,MA,RENT,2916.67,720-724,10.0,15957.0,0.0,10+ years,720
22090,33500,33450.00,0.1714,60,credit_card,14.70%,LA,MORTGAGE,13863.42,705-709,12.0,27874.0,0.0,10+ years,705
76404,14675,14675.00,0.1433,36,credit_card,26.92%,CA,RENT,3150.00,685-689,9.0,7246.0,1.0,8 years,685
15867,7000,7000.00,0.0691,36,credit_card,7.10%,CA,RENT,5000.00,715-719,8.0,7612.0,0.0,3 years,715


### Plotting with Cleaned Data

In [7]:
# Histogram of the cleaned FICO scores

import matplotlib.pyplot as plt

# Explicit figure/axes pair instead of the implicit pyplot "current figure".
fig, ax = plt.subplots()
ax.hist(loansData['FICO.Score'])
plt.show()

In [12]:
# Scatterplot matrix: pairwise scatterplots of loansData's columns, with a
# histogram of each column on the diagonal

import pandas as pd

# `pd.scatter_matrix` is no longer available at the pandas top level; the
# supported entry point is `pd.plotting.scatter_matrix`. It creates its own
# figure (sized by `figsize`), so the earlier `plt.figure()` call is dropped:
# it only produced an extra, empty figure.
p = pd.plotting.scatter_matrix(loansData, alpha=0.05, figsize=(10,10), diagonal='hist')
plt.show()

### Linear Regression Analysis

In [14]:
# linear model will look something like this:
# InterestRate = b + a1(FICOScore) + a2(LoanAmount)

In [15]:
# everything you need going forward

import numpy as np
import pandas as pd
import statsmodels.api as sm

In [26]:
# Pull out the three columns used in the regression equation:
# interest rate (the response), requested loan amount and FICO score (the predictors).

intrate, loanamt, fico = (
    loansData[column]
    for column in ('Interest.Rate', 'Amount.Requested', 'FICO.Score')
)

In [27]:
# creating our variables
# Each series is reshaped into an (n, 1) column vector. Plain ndarrays are used
# instead of np.matrix, which NumPy no longer recommends; the stacking and OLS
# steps that follow accept 2-D ndarrays just the same.

# the dependent variable
y = np.asarray(intrate).reshape(-1, 1)
# The independent variables shaped as columns
x1 = np.asarray(fico).reshape(-1, 1)
x2 = np.asarray(loanamt).reshape(-1, 1)

In [28]:
# Put the two predictor columns side by side to create the "input matrix"
# (x1 = FICO score first, then x2 = loan amount).

x = np.hstack((x1, x2))

In [29]:
# Fit an ordinary least squares model of y on the input matrix.

# add_constant supplies the intercept column (reported as `const` in the summary)
X = sm.add_constant(x)
model = sm.OLS(endog=y, exog=X)
f = model.fit()

In [30]:
# View the results summary: overall fit statistics, the coefficient table
# (const = intercept, x1 = FICO score, x2 = loan amount), and residual diagnostics.

f.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.657
Model:,OLS,Adj. R-squared:,0.656
Method:,Least Squares,F-statistic:,2388.0
Date:,"Thu, 22 Dec 2016",Prob (F-statistic):,0.0
Time:,08:43:11,Log-Likelihood:,5727.6
No. Observations:,2500,AIC:,-11450.0
Df Residuals:,2497,BIC:,-11430.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.7288,0.010,73.734,0.000,0.709 0.748
x1,-0.0009,1.4e-05,-63.022,0.000,-0.001 -0.001
x2,2.107e-06,6.3e-08,33.443,0.000,1.98e-06 2.23e-06

0,1,2,3
Omnibus:,69.496,Durbin-Watson:,1.979
Prob(Omnibus):,0.0,Jarque-Bera (JB):,77.811
Skew:,0.379,Prob(JB):,1.27e-17
Kurtosis:,3.414,Cond. No.,296000.0
