In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# cross_val_score lives in sklearn.model_selection from scikit-learn 0.18 on;
# sklearn.cross_validation was removed in 0.20, so it is only a fallback for old installs.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

%matplotlib inline

In [2]:
# Read the bank-marketing sample straight from the course repository.
url = "https://raw.githubusercontent.com/ga-students/SF-DAT-20/master/Data/bank.csv"
BankData = pd.read_csv(url)
# Preview the first five rows to check that the columns parsed as expected.
BankData.head(n=5)

Unnamed: 0,age,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome,y
0,30,married,primary,no,1787,no,no,cellular,79,1,-1,0,unknown,no
1,33,married,secondary,no,4789,yes,yes,cellular,220,1,339,4,failure,no
2,35,single,tertiary,no,1350,yes,no,cellular,185,1,330,1,failure,no
3,30,married,tertiary,no,1476,yes,yes,unknown,199,4,-1,0,unknown,no
4,59,married,secondary,no,0,yes,no,unknown,226,1,-1,0,unknown,no


In [3]:
# Listing the distinct values of a column shows which categories that variable can take.
pd.unique(BankData['marital'])

array(['married', 'single', 'divorced'], dtype=object)

For the data dictionary, please refer to https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

#### Our goal is to define a model that best predicts outcome y - success of the marketing campaign

First let's create dummy variables for default, marital, housing, and loan

In [4]:
# One-hot encode each categorical column, then delete the reference level(s) so that a
# single 0/1 indicator per variable remains. The .astype(int) keeps the indicators as
# 0/1 integers on pandas versions where get_dummies returns booleans.
Default_dummy = pd.get_dummies(BankData['default'], prefix='default').astype(int)
del Default_dummy['default_no']

# Deleting both 'married' and 'divorced' makes them one combined reference group,
# leaving marital_single as the only marital indicator.
marital_dummy = pd.get_dummies(BankData['marital'], prefix='marital').astype(int)
del marital_dummy['marital_married']
del marital_dummy['marital_divorced']

housing_dummy = pd.get_dummies(BankData['housing'], prefix='housing').astype(int)
del housing_dummy['housing_no']

loan_dummy = pd.get_dummies(BankData['loan'], prefix='loan').astype(int)
del loan_dummy['loan_no']

dummy_frames = pd.concat([marital_dummy, Default_dummy, housing_dummy, loan_dummy], axis=1)

# Drop indicator columns left behind by an earlier run of this cell. Without this,
# re-running the cell appends the same columns again and a later selection such as
# BankData[['marital_single', ...]] returns every indicator twice.
BankData = BankData.drop(list(dummy_frames.columns), axis=1, errors='ignore')
BankData = pd.concat([BankData, dummy_frames], axis=1)
BankData.head()



Unnamed: 0,age,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome,y,marital_single,default_yes,housing_yes,loan_yes
0,30,married,primary,no,1787,no,no,cellular,79,1,-1,0,unknown,no,0,0,0,0
1,33,married,secondary,no,4789,yes,yes,cellular,220,1,339,4,failure,no,0,0,1,1
2,35,single,tertiary,no,1350,yes,no,cellular,185,1,330,1,failure,no,1,0,1,0
3,30,married,tertiary,no,1476,yes,yes,unknown,199,4,-1,0,unknown,no,0,0,1,1
4,59,married,secondary,no,0,yes,no,unknown,226,1,-1,0,unknown,no,0,0,1,0


In [17]:
# Predictors: the numeric columns plus the 0/1 indicators; target: the campaign outcome.
X = BankData.loc[:, ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous',
                     'marital_single', 'default_yes', 'housing_yes', 'loan_yes']]
y = BankData['y']
# Class balance of the target.
y.value_counts()

no     4000
yes     521
Name: y, dtype: int64

#### Run a Logistic Regression Line on your inputs and output

In [11]:
# Fit a logistic regression of y on every column of X, using scikit-learn's defaults.
lm = LogisticRegression()
lm.fit(X, y)
# Iterating a DataFrame yields its column labels, which line up with the entries of coef_.
cols = list(X)
# print() with a single argument behaves identically on Python 2 and Python 3,
# whereas the bare `print cols` statement is a SyntaxError on Python 3.
print(cols)
print(lm.coef_)

['age', 'balance', 'duration', 'campaign', 'pdays', 'previous', 'marital_single', 'default_yes', 'housing_yes', 'loan_yes']
[[ -6.85307805e-05   1.03684080e-05   3.73310607e-03  -9.70503294e-02
    2.32454766e-03   9.52908286e-02   1.71313754e-01   1.23013337e-01
   -9.11169344e-01  -8.71159555e-01]]


#### What is your 10-fold cross-validation error?

In [12]:
# cross_val_score is already imported in the notebook's first cell, so it is not
# re-imported here (the old sklearn.cross_validation module no longer exists in
# scikit-learn >= 0.20).
# With no `scoring` argument the estimator's own score() is used, which for
# LogisticRegression is mean accuracy; `a` holds one score per fold.
a = cross_val_score(lm, X, y, cv=10)
#cross_val_score(model,inputs,output,cv = k-fold).mean()
print('')
# This is the mean accuracy over the 10 folds; the cross-validation error is 1 - a.mean().
print(a.mean())



0.889848893317


#### Construct a confusion matrix.

In [8]:
from sklearn.metrics import confusion_matrix

# In-sample predictions: the model is scored on the same rows it was fitted on.
y_hat = lm.predict(X)
# Rows of the matrix are the actual classes, columns the predicted classes.
confusion_matrix(y_true=y, y_pred=y_hat)


array([[3929,   71],
       [ 427,   94]])

#### Interpret your coefficients. (At least interpret campaign, marital_single, and default_yes.) Do your interpretations make sense?

In [None]:
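# Coefficients of interest (taken from lm.coef_ above):
#   campaign        -9.70503294e-02
#   marital_single   1.71313754e-01
#   default_yes      1.23013337e-01
#
# The labels are 'no'/'yes', so each coefficient is the change in the log-odds of y == 'yes';
# exponentiating a coefficient gives the multiplicative change in those odds.
# Holding the other variables fixed: every additional campaign call lowers the odds of a 'yes'
# by about 9% (exp(-0.097) = 0.91), being single raises them by about 19% (exp(0.171) = 1.19),
# and having been in default raises them by about 13% (exp(0.123) = 1.13).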

Answer: 

#### What is your prediction for a person who is 30 years old, 1000 dollars balance, with duration = 210 , has been contacted 3 times for this campaign (campaign = 3), with pdays = 100, who has previously been contacted 4 times, who is single, never defaulted, home owner and doesn't have any loan?

In [19]:
# One row of feature values, in the same order as the columns of X:
# age, balance, duration, campaign, pdays, previous,
# marital_single, default_yes, housing_yes, loan_yes
XP = [[30, 1000, 210, 3, 100, 4, 1, 0, 1, 0]]
y_hat = lm.predict(XP)
# print() with a single argument works on both Python 2 and Python 3.
print(y_hat)



['no']


#### Now standardize your data - you can use standardization method used for KNN algorithms.

In [None]:
def Standardize(X):
    """Rescale X to the [0, 1] range with min-max scaling: (X - min) / (max - min).

    The minimum and maximum come from ``X.min()`` / ``X.max()``, so a DataFrame is
    scaled column by column while a Series or NumPy array is scaled by its overall
    minimum and maximum.

    A constant column (max == min) would otherwise be divided by zero and turn into
    NaN; its range is treated as 1 instead, so every value in it maps to 0.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series or numpy.ndarray
        Numeric data to rescale.

    Returns
    -------
    Same type as X
        The rescaled data.
    """
    X_Min = X.min()
    X_Range = X.max() - X_Min
    if isinstance(X_Range, pd.Series):
        # Per-column ranges (DataFrame input): swap zero ranges for 1 to avoid 0/0.
        X_Range = X_Range.where(X_Range != 0, 1)
    elif X_Range == 0:
        # Single overall range (Series / ndarray input) that happens to be zero.
        X_Range = 1
    X_Standardized = (X - X_Min) / X_Range
    return X_Standardized



#### Use 10-fold cross validation to find the best tuning parameter - C.

#### Now use the best C you found above and repeat your analysis and look over your coefficients

In [None]:
# It will be easier to read the results if you zip the variable names with your coefficients


#### If you would like to drop 3 variables from your analysis, which variables are you going to choose?

Answer: 