# Imports 

## Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#the inline magic line is needed to make some data visualizations come out right when you are using jupyter notebooks
%matplotlib inline 
import seaborn as sns

from math import sqrt

In [2]:
#ML algorithms (estimators)
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn import linear_model

#model metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

#cross validation
from sklearn.model_selection import train_test_split

"""sklearn imports from classification problem in course 1"""
# from sklearn.model_selection import train_test_split
# from sklearn import metrics
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import accuracy_score, classification_report

'sklearn imports from classification problem in course 1'

## Import & Verify Data

In [3]:
#import numerical only data
df= pd.read_csv('num_only_credit_one_data.csv')

df.head()

Unnamed: 0,ID,SEX,AGE,LIMIT_BAL,bill_ap,bill_m,bill_ju,bill_jy,bill_ag,bill_s,...,pay_s,DEFAULT,edu_graduate school,edu_high school,edu_other,edu_university,mar_0,mar_1,mar_2,mar_3
0,1,0,24,20000,0,0,0,689,3102,3913,...,2,0,0,0,0,1,0,1,0,0
1,2,0,26,120000,3261,3455,3272,2682,1725,2682,...,-1,0,0,0,0,1,0,0,1,0
2,3,0,34,90000,15549,14948,14331,13559,14027,29239,...,0,1,0,0,0,1,0,0,1,0
3,4,0,37,50000,29547,28959,28314,49291,48233,46990,...,0,1,0,0,0,1,0,1,0,0
4,5,1,57,50000,19131,19146,20940,35835,5670,8617,...,-1,1,0,0,0,1,0,1,0,0


In [4]:
df.shape

(30000, 31)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 31 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   ID                   30000 non-null  int64
 1   SEX                  30000 non-null  int64
 2   AGE                  30000 non-null  int64
 3   LIMIT_BAL            30000 non-null  int64
 4   bill_ap              30000 non-null  int64
 5   bill_m               30000 non-null  int64
 6   bill_ju              30000 non-null  int64
 7   bill_jy              30000 non-null  int64
 8   bill_ag              30000 non-null  int64
 9   bill_s               30000 non-null  int64
 10  pmt_ap               30000 non-null  int64
 11  pmt_m                30000 non-null  int64
 12  pmt_ju               30000 non-null  int64
 13  pmt_jy               30000 non-null  int64
 14  pmt_ag               30000 non-null  int64
 15  pmt_s                30000 non-null  int64
 16  pay_ap               3

Everything with my dataset looks as I expect. So I will move on to specifying the variables I was to use as features (IVs) and the one I want to use as target for my model predicting credit limit. 

# Predicting Customer Credit Limits
In this notebook I am only going to create and test models that attempt to predict customer credit limits. Which means the target variable in all models will be 'limit' (i.e. limit_bal).

#### Re-ordering df Columns
To make things easier, before doing anything I'm going to move LIMIT_BAL to the last column in my df and rename it. 

In [6]:
#step 1: create a variabled to hold the LIMIT_BAL var
lim=df['LIMIT_BAL']

#step 2: drop the LIMIT_BAL column
df.drop(labels='LIMIT_BAL', axis=1, inplace=True)

#step 3: insert 'lim' as last column of df
df.insert(loc=30, column='limit', value=lim)

df.head()

Unnamed: 0,ID,SEX,AGE,bill_ap,bill_m,bill_ju,bill_jy,bill_ag,bill_s,pmt_ap,...,DEFAULT,edu_graduate school,edu_high school,edu_other,edu_university,mar_0,mar_1,mar_2,mar_3,limit
0,1,0,24,0,0,0,689,3102,3913,0,...,0,0,0,0,1,0,1,0,0,20000
1,2,0,26,3261,3455,3272,2682,1725,2682,2000,...,0,0,0,0,1,0,0,1,0,120000
2,3,0,34,15549,14948,14331,13559,14027,29239,5000,...,1,0,0,0,1,0,0,1,0,90000
3,4,0,37,29547,28959,28314,49291,48233,46990,1000,...,1,0,0,0,1,0,1,0,0,50000
4,5,1,57,19131,19146,20940,35835,5670,8617,679,...,1,0,0,0,1,0,1,0,0,50000


### Specify the Target Variable
Now I will specify the dependent or target variable for models attempting to predict customer credit limits. 

In [28]:
#filter the data to return my dependent var (the limit column)
y = df.loc[:,'limit']

#verify that the first 5 values in y match the values in the limit column above
y.head()

0     20000
1    120000
2     90000
3     50000
4     50000
Name: limit, dtype: int64

In [8]:
#verify that y has 30,000 rows
y.shape

(30000,)

# Model 1: Predicting Credit Limit Using All Variables
None of the variables stood out as ones that should be included or excluded from the model, so in this first model I'm going to include them all.

### Specify the Feature Variables

In [9]:
#filter the data to return only columns for my feature vars
X = df.iloc[:, 1:30]

#verify that all but the ID and limit columns were selected
print('Summary of Feature Variables') 
X.head()

Summary of Feature Variables


Unnamed: 0,SEX,AGE,bill_ap,bill_m,bill_ju,bill_jy,bill_ag,bill_s,pmt_ap,pmt_m,...,pay_s,DEFAULT,edu_graduate school,edu_high school,edu_other,edu_university,mar_0,mar_1,mar_2,mar_3
0,0,24,0,0,0,689,3102,3913,0,0,...,2,0,0,0,0,1,0,1,0,0
1,0,26,3261,3455,3272,2682,1725,2682,2000,0,...,-1,0,0,0,0,1,0,0,1,0
2,0,34,15549,14948,14331,13559,14027,29239,5000,1000,...,0,1,0,0,0,1,0,0,1,0
3,0,37,29547,28959,28314,49291,48233,46990,1000,1069,...,0,1,0,0,0,1,0,1,0,0
4,1,57,19131,19146,20940,35835,5670,8617,679,689,...,-1,1,0,0,0,1,0,1,0,0


## Train & Test Model 1 Using 3 Different Regression Algorithms
Because credit limit is a continuous variable, I know I will have to use a regression model to predict it. But I don't have a principled reason to choose one regression algorithm over others, so I'll compare the predictions of models using Random Forest p9idifferent regression algorithms to find the one that works best on this task. 
* I will run each model through a 3-fold cross validation test. Using R squared to score the models. 
* The model with the highest mean CV sore will be labeled best.

### Setup

In [10]:
#create a dictionary to hold the algos
algosClass = [ ]

#add the randomn forest regressor to the dictionary
algosClass.append(('Random Forest Regressor',RandomForestRegressor()))

#add the liner regression algo to the dictionary
algosClass.append(('Linear Regression',LinearRegression()))

#add the SV regressor to the dictionary
algosClass.append(('Support Vector Regression',SVR()))

#verify every algo was added to the list
print(algosClass)

[('Random Forest Regressor', RandomForestRegressor()), ('Linear Regression', LinearRegression()), ('Support Vector Regression', SVR())]


### Train & Test Models

In [11]:
#create two empty lists to hold the names and cross validation results from each model
results = []
names = []

#create a for-loop that will
for name, model in algosClass:
    result = cross_val_score(model, X,y, cv=3, scoring='r2')
    names.append(name)
    results.append(result)

### Performance Results

In [12]:
#run the for loop and print the mean cross validation score for each model
for i in range(len(names)):
    print(names[i],results[i].mean())

Random Forest Regressor 0.46700202592747336
Linear Regression 0.35871695390197167
Support Vector Regression -0.05039128320492573


The model built using the random forest regressor performed somewhat less badly than the other two models. However, even the randomn forest model does little better than chance at predicting customer's credit limits (i.e.limit_bal). 

# Credit Limit Model 2: Excluding the Demographic Variables 

## Select the Feature Variables

In [20]:
#filter the data to return only columns for bill amount, payment amount, payment history, and default var
X2 = df.iloc[:, 3:22]

print('Summary of Model 2 Features')
X2.head()

Summary of Features for Model 2


Unnamed: 0,bill_ap,bill_m,bill_ju,bill_jy,bill_ag,bill_s,pmt_ap,pmt_m,pmt_ju,pmt_jy,pmt_ag,pmt_s,pay_ap,pay_m,pay_ju,pay_jy,pay_ag,pay_s,DEFAULT
0,0,0,0,689,3102,3913,0,0,0,0,689,0,-2,-2,-1,-1,2,2,0
1,3261,3455,3272,2682,1725,2682,2000,0,1000,1000,1000,0,2,0,0,0,2,-1,0
2,15549,14948,14331,13559,14027,29239,5000,1000,1000,1000,1500,1518,0,0,0,0,0,0,1
3,29547,28959,28314,49291,48233,46990,1000,1069,1100,1200,2019,2000,0,0,0,0,0,0,1
4,19131,19146,20940,35835,5670,8617,679,689,9000,10000,36681,2000,0,0,0,-1,0,-1,1


In [23]:
#create two empty lists to hold the names and cross validation results from each model
results2 = []
names2 = []

#get cross validation scores for model 2 
for name, model in algosClass:
    result = cross_val_score(model, X2,y, cv=3, scoring='r2')
    names2.append(name)
    results2.append(result)

In [27]:
#run the for loop and print the mean cross validation score for each model
print("Summary of Model2 Cross Validation Scores")

for i in range(len(names2)):
    print(names[i],results[i].mean())

Summary of Model2 Cross Validation Scores
Random Forest Regressor 0.46700202592747336
Linear Regression 0.35871695390197167
Support Vector Regression -0.05039128320492573


## Train & Test the Model

In [14]:
#instantiate the random forest regression algo
# rfr = RandomForestRegressor()

In [15]:
#train the rfr model on X_train and y_train
# model = rfr.fit(X_train, y_train)

#use trained model to predict credit limit amounts for the X_test data
# y_predict = model_1.predict(X_test)

NameError: name 'X_train' is not defined

## Split Data into Training and Testing Sets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=123)

>*Coding Notes*
>* The ```test_size =``` parameter specifies what percent of you dataset will be used for training and training. If you don't specify an amount, the default is 0.25.
>* The ```random_state =``` parameter can be set to any number. The purpose of it is to make sure that every time the model is run,  the same observations are used in the training and testing sets. If this parameter is not included, every time you run the model it may select a slightly different set of observations.

In [None]:
cross_val_score(rfr, X, y, cv=3)

In [None]:
r2_score(y_test, y_predict)

In [None]:
mean_squared_error(y_test, y_predict)