In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import math
from patsy import dmatrices
import statsmodels.formula.api as sm

# Loading some example data
titanic = pd.read_csv('../CSV Files/titanic-data.csv')

formula = 'Survived ~ C(Pclass) + C(Sex) + Age + SibSp + C(Embarked) + Parch'
y,X = dmatrices(formula, data=titanic, return_type='dataframe')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Predicting Using the Statsmodel library

In [3]:
model = sm.Logit(y,X)
res = model.fit()
res.summary()

Optimization terminated successfully.
         Current function value: 0.444291
         Iterations 6


0,1,2,3
Dep. Variable:,Survived,No. Observations:,712.0
Model:,Logit,Df Residuals:,703.0
Method:,MLE,Df Model:,8.0
Date:,"Thu, 26 Oct 2017",Pseudo R-squ.:,0.3416
Time:,15:41:42,Log-Likelihood:,-316.34
converged:,True,LL-Null:,-480.45
,,LLR p-value:,3.985e-66

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,4.5703,0.480,9.516,0.000,3.629,5.512
C(Pclass)[T.2],-1.2684,0.298,-4.250,0.000,-1.853,-0.683
C(Pclass)[T.3],-2.4938,0.296,-8.419,0.000,-3.074,-1.913
C(Sex)[T.male],-2.6411,0.223,-11.854,0.000,-3.078,-2.204
C(Embarked)[T.Q],-0.8445,0.599,-1.410,0.159,-2.019,0.330
C(Embarked)[T.S],-0.4277,0.271,-1.579,0.114,-0.958,0.103
Age,-0.0437,0.008,-5.272,0.000,-0.060,-0.027
SibSp,-0.3558,0.129,-2.768,0.006,-0.608,-0.104
Parch,-0.0446,0.121,-0.370,0.712,-0.281,0.192


## Predicting Using SKLearn

In [4]:
model = LogisticRegression()

model = model.fit(X, y.values.ravel())
pd.DataFrame(list(zip(X.columns, np.transpose(model.coef_))))

Unnamed: 0,0,1
0,Intercept,[1.89669665438]
1,C(Pclass)[T.2],[-0.95200839777]
2,C(Pclass)[T.3],[-2.11432508088]
3,C(Sex)[T.male],[-2.39987386479]
4,C(Embarked)[T.Q],[-0.57958400265]
5,C(Embarked)[T.S],[-0.364858062139]
6,Age,[-0.0351247870318]
7,SibSp,[-0.295542169807]
8,Parch,[-0.0149016182569]


## Computing Precision

In [5]:
from sklearn.metrics import classification_report
y_pred = model.predict_proba(X)
y_pred_flag = (y_pred[:,1] > 0.5).astype(int)
print(classification_report(y, y_pred_flag))

             precision    recall  f1-score   support

        0.0       0.82      0.86      0.84       424
        1.0       0.78      0.71      0.74       288

avg / total       0.80      0.80      0.80       712



Can you compare it to the precision using decision trees?

**Question 1: ** Obtain more than 80% by adding new features

**Question 2:** What is the precision of the linear regression function in the statsmodels library?

In [6]:
model = sm.Logit(y,X)
res = model.fit()
y_pred = res.predict(X)
y_pred_flag = (y_pred > 0.5).astype(int)
print(classification_report(y, y_pred_flag))

Optimization terminated successfully.
         Current function value: 0.444291
         Iterations 6
             precision    recall  f1-score   support

        0.0       0.82      0.86      0.84       424
        1.0       0.78      0.72      0.75       288

avg / total       0.80      0.80      0.80       712



## Support Vector Machines (SVM)

We did not learn about them but they one fo the most popular classification modeling technique. Follow [this notebook to learn about SVM using SKLearn](https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/master/scikit-learn/scikit-learn-svm.ipynb).

In [7]:
from sklearn.svm import SVC, LinearSVC

svc = LinearSVC()
svc.fit(X, y.Survived)
y_pred = svc.predict(X).astype(int)
print(classification_report(y, y_pred))

             precision    recall  f1-score   support

        0.0       0.78      0.92      0.85       424
        1.0       0.84      0.63      0.72       288

avg / total       0.81      0.80      0.79       712



In [7]:
svc = SVC()
svc.fit(X, y.Survived)
y_pred = svc.predict(X).astype(int)
print(classification_report(y, y_pred))

             precision    recall  f1-score   support

        0.0       0.84      0.90      0.87       424
        1.0       0.84      0.74      0.78       288

avg / total       0.84      0.84      0.83       712



**Question 3:** Can you get closer to this level of precision?
(Check out https://www.kaggle.com/sinakhorami/titanic-best-working-classifier)

In [10]:
from sklearn import model_selection

X_train, X_test, y_train, y_test = \
    model_selection.train_test_split(X, y, train_size=0.01)



In [11]:
svc = SVC()
svc.fit(X_train, y_train.Survived)
y_pred = svc.predict(X_train).astype(int)
print(classification_report(y_train, y_pred))

             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00         4
        1.0       1.00      1.00      1.00         3

avg / total       1.00      1.00      1.00         7



In [12]:
svc = SVC()
svc.fit(X_train, y_train.Survived)
y_pred = svc.predict(X_test).astype(int)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

        0.0       0.61      0.93      0.74       420
        1.0       0.56      0.13      0.21       285

avg / total       0.59      0.61      0.52       705

