In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
url = "https://raw.githubusercontent.com/ga-students/SF-DAT-20/master/Data/Credit.csv"
CreditData = pd.read_csv(url)
CreditData.head(10)

Unnamed: 0.1,Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
0,1,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
1,2,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
2,3,104.593,7075,514,4,71,11,Male,No,No,Asian,580
3,4,148.924,9504,681,3,36,11,Female,No,No,Asian,964
4,5,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331
5,6,80.18,8047,569,4,77,10,Male,No,No,Caucasian,1151
6,7,20.996,3388,259,2,37,12,Female,No,No,African American,203
7,8,71.408,7114,512,2,87,9,Male,No,No,Asian,872
8,9,15.125,3300,266,5,66,13,Female,No,No,Caucasian,279
9,10,71.061,6819,491,3,41,19,Female,Yes,Yes,African American,1350


In [2]:
del CreditData['Unnamed: 0']

#### Let's look at correlation matrix. This time, we only explore the quantitative variables that affect Credit Balance. From your preliminary analysis, which 3 variables seem to affect Balance the most? In our goal is interpretation; can we use these 3 variables simultaneously? Why?

In [3]:
CreditData.corr()


Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Balance
Income,1.0,0.792088,0.791378,-0.018273,0.175338,-0.027692,0.463656
Limit,0.792088,1.0,0.99688,0.010231,0.100888,-0.023549,0.861697
Rating,0.791378,0.99688,1.0,0.053239,0.103165,-0.030136,0.863625
Cards,-0.018273,0.010231,0.053239,1.0,0.042948,-0.051084,0.086456
Age,0.175338,0.100888,0.103165,0.042948,1.0,0.003619,0.001835
Education,-0.027692,-0.023549,-0.030136,-0.051084,0.003619,1.0,-0.008062
Balance,0.463656,0.861697,0.863625,0.086456,0.001835,-0.008062,1.0


Answer: Income, Limit, Rating -- they are highly correlated with each other

#### There are few categorical variables, let's first create dummy variables for them


In [4]:

RaceDummy = pd.get_dummies(CreditData.Ethnicity, prefix = 'Race')
del RaceDummy['Race_African American']

GenderDummy = pd.get_dummies(CreditData.Gender, prefix = 'Gender')
del GenderDummy['Gender_ Male']  

MarriedDummy = pd.get_dummies(CreditData.Married, prefix = 'Married')
del MarriedDummy['Married_No']

StudentDummy = pd.get_dummies(CreditData.Student, prefix = 'Student')
del StudentDummy['Student_No']

CreditData = pd.concat([CreditData, RaceDummy,GenderDummy,MarriedDummy,StudentDummy], axis=1)

CreditData.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance,Race_Asian,Race_Caucasian,Gender_Female,Married_Yes,Student_Yes
0,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333,0,1,0,1,0
1,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903,1,0,1,1,1
2,104.593,7075,514,4,71,11,Male,No,No,Asian,580,1,0,0,0,0
3,148.924,9504,681,3,36,11,Female,No,No,Asian,964,1,0,1,0,0
4,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331,0,1,0,1,0


# Now it's time for some fun!

#### By a regression line, use Education, Ethnicity, Gender, Age, Cards, and Income to predict Balance. 

First Step, find the coefficients of your regression line

In [5]:
from sklearn.linear_model import LinearRegression  #import
linreg = LinearRegression() #Instantiate


X = CreditData[[ 'Education', 'Race_Asian', 'Race_Caucasian', 'Gender_Female', 'Age', 'Cards', 'Income' ]]
y= CreditData['Balance']
linreg.fit(X,y)
print(linreg.intercept_)
print(linreg.coef_)

230.042354393
[  1.64553607  -6.54603078   3.47497641  27.12543123  -2.32970547
  33.62953508   6.27995894]


Second Step, find the p-values of your estimates. You have a few variables try to show your p-values along side the names of the variables.

In [7]:
from sklearn import feature_selection
pvals = feature_selection.f_regression(X,y)[1] #That's how we extract p-values
print(pvals)

zip([ 'Education', 'Race_Asian', 'Race_Caucasian', 'Gender_Female', 'Age', 'Cards', 'Income' ],pvals)

[  8.72306402e-01   8.44895644e-01   9.47727511e-01   6.68516106e-01
   9.70813872e-01   8.41765556e-02   1.03088580e-22]


[('Education', 0.87230640156710226),
 ('Race_Asian', 0.84489564436221742),
 ('Race_Caucasian', 0.94772751139663791),
 ('Gender_Female', 0.66851610550260099),
 ('Age', 0.97081387233013317),
 ('Cards', 0.084176555599370956),
 ('Income', 1.0308858025893513e-22)]

Which of your coefficients are significant at significance level 5%?

Answer:  Income

#### What is the R-Squared of your model?

In [8]:
linreg.score(X,y)

0.23231260833540443

#### How do we interpret this value?

Answer: 23% of our variance is explained by income

#### Now focus on two of the most significant variables from your previous model and re-run your regression model. This time, try to use a function to return all the values you are interested in. You may borrow from what covered in Lecture 5 ipython lecture.

In [21]:
X = CreditData[[ 'Rating','Limit' ]]
y= CreditData['Balance']
linreg.fit(X,y)
print(linreg.intercept_)
print(linreg.coef_)
pvals = feature_selection.f_regression(X,y)[1] #That's how we extract p-values

print zip([ 'Rating','Limit' ],pvals)
R2 = linreg.score(X,y)
print 'Rsquared', R2
Radj2 = 1-(((1-R2)*(X.Rating.count()-1))/(X.Rating.count()-2-1))
print 'Adjusted R2', Radj2
print ''
X = CreditData[[ 'Rating','Limit', 'Income' ]]
y= CreditData['Balance']
linreg.fit(X,y)
print(linreg.intercept_)
print(linreg.coef_)
pvals = feature_selection.f_regression(X,y)[1] #That's how we extract p-values

print zip([ 'Rating','Limit', 'Income' ],pvals)
R2 = linreg.score(X,y)
print 'Rsquared', R2
Radj2 = 1-(((1-R2)*(X.Rating.count()-1))/(X.Rating.count()-3-1))
print 'Adjusted R2', Radj2

-377.536795356
[ 2.20167217  0.02451438]
[('Rating', 1.8988990970396967e-120), ('Limit', 2.5305807117137046e-119)]
Rsquared 0.745942796101
Adjusted R2 0.744662910943
400

-489.727478494
[ 2.69858214  0.08467064 -7.7193089 ]
[('Rating', 1.8988990970396967e-120), ('Limit', 2.5305807117137046e-119), ('Income', 1.0308858025893513e-22)]
Rsquared 0.876238945626
Adjusted R2 0.875301361881


#### In comparison to the previous model, did our R-Squared increase or decrease? Why?

Answer: It increased because of A) different variables -- ones that had significant correlation. B) multiple variables

#### Now let's regress Balance on Gender alone. After running your regression lines, do you have enough evidence to claim that females having more balance than males? (Hint: Look at the p-value of the Gender coefficient. If it is significant then you will have evidence to support that claim, otherwise you cannot support the statement.

In [23]:
X = CreditData[[ 'Gender_Female' ]]
y= CreditData['Balance']
linreg.fit(X,y)
print(linreg.intercept_)
print(linreg.coef_)
print feature_selection.f_regression(X,y)[1] #That's how we extract p-values

R2 = linreg.score(X,y)
print 'Rsquared', R2



509.803108808
[ 19.73312308]
[ 0.66851611]
Rsquared 0.000461132964496


Answer:  No there is no evidence to support this claim

#### Now let's regress Balance on Ethnicity. After running your regression lines, do you have enough evidence to claim that some ethnic groups carry more balance than others? (Hint: Look at the p-value of  your dummy variables. If it is significant then you will have evidence to support that claim, otherwise you cannot support that statement.

In [24]:
X = CreditData[[ 'Race_Asian', 'Race_Caucasian' ]]
y= CreditData['Balance']
linreg.fit(X,y)
print(linreg.intercept_)
print(linreg.coef_)
print feature_selection.f_regression(X,y)[1] #That's how we extract p-values

R2 = linreg.score(X,y)
print 'Rsquared', R2


531.0
[-18.68627451 -12.50251256]
[ 0.84489564  0.94772751]
Rsquared 0.000218807443049


Answer: Again, no correlation.

#### I know you get tired of this but for the last time regress Balance on Studentship status. After running your regression lines, do you have enough evidence to claim that students  carry more balance than others? (Hint: Look at the p-value of the your dummy variables. If it is significant then you will have evidence to support that claim, otherwise you cannot support the statement.


In [25]:
X = CreditData[[ 'Student_Yes' ]]
y= CreditData['Balance']
linreg.fit(X,y)
print(linreg.intercept_)
print(linreg.coef_)
print feature_selection.f_regression(X,y)[1] #That's how we extract p-values

R2 = linreg.score(X,y)
print 'Rsquared', R2


480.369444444
[ 396.45555556]
[  1.48773411e-07]
Rsquared 0.0670900898871


Answer: The Student value has a correlation with the balance outcome.

#### No let's consider effect of students and income on balance simoltanously. Let's start with a regression line.

In [26]:
X = CreditData[[ 'Income', 'Student_Yes' ]]
y= CreditData['Balance']
linreg.fit(X,y)
print(linreg.intercept_)
print(linreg.coef_)
print feature_selection.f_regression(X,y)[1] #That's how we extract p-values

R2 = linreg.score(X,y)
print 'Rsquared', R2


211.142964398
[   5.98433557  382.67053884]
[  1.03088580e-22   1.48773411e-07]
Rsquared 0.277458888967


#### Are all of our regression coefficients significant? If yes, interpret them.

Answer: Yes but the Rsquared is low

#### Now let's explore interaction between income and studentship. Let's start with a regression line

In [28]:
# First generate a column for interaction term
CreditData['Interact'] = CreditData['Income']*CreditData['Student_Yes']
X = CreditData[[ 'Student_Yes' , 'Income', 'Interact']]
y= CreditData['Balance' ]
linreg.fit(X,y)
print(linreg.intercept_)
print(linreg.coef_)
print feature_selection.f_regression(X,y)[1] #That's how we extract p-values

R2 = linreg.score(X,y)
print 'Rsquared', R2


200.62315295
[ 476.67584321    6.21816874   -1.99915087]
[  1.48773411e-07   1.03088580e-22   4.61768368e-08]
Rsquared 0.279883703062


#### Are our coefficients signifincant? It they are write down your regression line below:

Answer: Balance = 200.62 + 476.67*Student_Yes + 6.2*Income -1.99*Interact

#### Is there any income level at which students and non-students on average carry same level of balance?

Answer: 



In [30]:
# not student = 200.62 + 6.2* Income
# student = 677 + 4.2*Income 
# When these are equal. = 477 - 2*Income = 0 , Income = 477/2

print 200.62 + 476.67 + 6.2*477/2 -1.99*477/2
print 200.62 + 0 + 6.2*477/2
print 477/2
#238,000 !!!!!

1681.375
1679.32
238
