In [1]:
import numpy as np
import pandas as pd
import statsmodels .api as sm
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme so any matplotlib figures drawn later use its styling.
sns.set()

# Compatibility shim for statsmodels: provide `stats.chisqprob` on top of
# the chi-square survival function that scipy exposes as `stats.chi2.sf`.
from scipy import stats


def _chisqprob(chisq, df):
    """Return the upper-tail chi-square probability of `chisq` with `df` degrees of freedom."""
    return stats.chi2.sf(chisq, df)


stats.chisqprob = _chisqprob

In [2]:
# Load the data used to fit the models (columns: SAT, Admitted, Gender) and preview it.
df = pd.read_csv('udemy_data/2.02. Binary predictors.csv')
df.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,No,Male
1,1792,Yes,Female
2,1954,Yes,Female
3,1653,No,Male
4,1593,No,Male


In [3]:
# Encode the categorical columns as 0/1 on a fresh copy, leaving `df` untouched.
data = df.assign(
    Admitted=df['Admitted'].map({'No': 0, 'Yes': 1}),
    Gender=df['Gender'].map({'Male': 0, 'Female': 1}),
)
data.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,0,0
1,1792,1,1
2,1954,1,1
3,1653,0,0
4,1593,0,0


### Logistic Regression Using a Single Variable

In [4]:
# Single-predictor setup: model admission (0 = No, 1 = Yes) from gender (0 = Male, 1 = Female).
y = data['Admitted'] # dependent variable
x1single = data['Gender'] # independent variable

In [5]:
# Fit a logistic regression of admission on gender alone and show the fit summary.
xsingle = sm.add_constant(x1single) # add the intercept (`const`) column
reg_log = sm.Logit(y,xsingle)
results_log = reg_log.fit()
results_log.summary()

Optimization terminated successfully.
         Current function value: 0.572260
         Iterations 5


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,166.0
Method:,MLE,Df Model:,1.0
Date:,"Sun, 27 Sep 2020",Pseudo R-squ.:,0.1659
Time:,15:04:16,Log-Likelihood:,-96.14
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,6.283e-10

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.6436,0.222,-2.901,0.004,-1.078,-0.209
Gender,2.0786,0.363,5.727,0.000,1.367,2.790


### Logistic Regression Using 2 Variables

In [6]:
# Two-predictor setup: same target, but use every remaining column as a feature.
y = data['Admitted'] # dependent variable
x1 = data.drop('Admitted', axis=1) # independent variables: SAT and Gender

In [7]:
# Refit the logistic regression with both predictors.
# NOTE: this rebinds `reg_log` and `results_log`, replacing the single-variable fit.
x = sm.add_constant(x1) # add the intercept (`const`) column
reg_log = sm.Logit(y,x)
results_log = reg_log.fit()
results_log.summary()

Optimization terminated successfully.
         Current function value: 0.120117
         Iterations 10


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,165.0
Method:,MLE,Df Model:,2.0
Date:,"Sun, 27 Sep 2020",Pseudo R-squ.:,0.8249
Time:,15:04:17,Log-Likelihood:,-20.18
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,5.1180000000000006e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-68.3489,16.454,-4.154,0.000,-100.598,-36.100
SAT,0.0406,0.010,4.129,0.000,0.021,0.060
Gender,1.9449,0.846,2.299,0.022,0.287,3.603


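- The log-likelihood of model 1 (-96.14) is lower than that of model 2 (-20.18), which means model 2 fits the data better than model 1.
- In model 2 the Gender variable is still significant (P>|z| = 0.022), but its p-value no longer rounds to 0.000 as it did in model 1.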

In [8]:
# Odds ratio for Gender: exponentiate the fitted coefficient taken straight
# from the model rather than a rounded value copied from the summary table,
# so the number stays correct if the model is refit.
# Interpretation: holding SAT fixed, a female applicant has about 7 times
# higher odds of being admitted than a male applicant.
np.exp(results_log.params['Gender'])

6.992932526814459

### Accuracy

In [9]:
# NOTE: set_printoptions changes numpy's global print settings, so arrays displayed
# in later cells are also shown with two decimals.
np.set_printoptions(formatter={'float': lambda x: "{0:0.2f}".format(x)}) # show floats in arrays with 2 digits after the decimal point
results_log.predict() # predicted probabilities from the fitted model (no new data passed)

array([0.00, 1.00, 1.00, 0.23, 0.02, 0.99, 1.00, 1.00, 1.00, 0.01, 1.00,
       1.00, 0.76, 0.00, 0.60, 1.00, 0.11, 0.12, 0.51, 1.00, 1.00, 1.00,
       0.00, 0.01, 0.97, 1.00, 0.48, 0.99, 1.00, 0.99, 0.00, 0.83, 0.25,
       1.00, 1.00, 1.00, 0.31, 1.00, 0.23, 0.00, 0.02, 0.45, 1.00, 0.00,
       0.99, 0.00, 0.99, 0.00, 0.00, 0.01, 0.00, 1.00, 0.92, 0.02, 1.00,
       0.00, 0.37, 0.98, 0.12, 1.00, 0.00, 0.78, 1.00, 1.00, 0.98, 0.00,
       0.00, 0.00, 1.00, 0.00, 0.78, 0.12, 0.00, 0.99, 1.00, 1.00, 0.00,
       0.30, 1.00, 1.00, 0.00, 1.00, 1.00, 0.85, 1.00, 1.00, 0.00, 1.00,
       1.00, 0.89, 0.83, 0.00, 0.98, 0.97, 0.00, 1.00, 1.00, 0.03, 0.99,
       0.96, 1.00, 0.00, 1.00, 0.01, 0.01, 1.00, 1.00, 1.00, 0.00, 0.00,
       0.02, 0.33, 0.00, 1.00, 0.09, 0.00, 0.97, 0.00, 0.75, 1.00, 1.00,
       0.01, 0.01, 0.00, 1.00, 0.00, 0.99, 0.57, 0.54, 0.87, 0.83, 0.00,
       1.00, 0.00, 0.00, 0.00, 1.00, 0.04, 0.00, 0.01, 1.00, 0.99, 0.52,
       1.00, 1.00, 0.05, 0.00, 0.00, 0.00, 0.68, 1.

In [10]:
# Actual 0/1 admission outcomes as a plain array, for comparison with the predictions above.
data['Admitted'].to_numpy()

array([0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0], dtype=int64)

In [11]:
# Confusion matrix of the fitted model's predictions vs. the actual outcomes
# (rows = actual class, columns = predicted class; statsmodels' default cut-off is 0.5).
results_log.pred_table()

array([[69.00, 5.00],
       [4.00, 90.00]])

In [12]:
# Confusion matrix as a labelled table: rows are actual classes, columns are predicted classes.
cm_df = pd.DataFrame(
    results_log.pred_table(),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,69.0,5.0
Actual 1,4.0,90.0


In [13]:
# Accuracy = correctly classified cases (main diagonal: 69 + 90) / all cases.
cm = np.array(cm_df)
accuracy = np.trace(cm) / cm.sum()
accuracy

0.9464285714285714

### Testing The Model
Overfitting occurs when the model fits the training data too closely: it achieves high accuracy on the training data but predicts poorly on new data. The solution is to split the data into a training set and a test set, and to evaluate the model on the test set.

In [14]:
# Load the held-out test set (same columns as the training data) and preview it.
df_test = pd.read_csv('udemy_data/2.03. Test dataset.csv')
df_test.head()

Unnamed: 0,SAT,Admitted,Gender
0,1323,No,Male
1,1725,Yes,Female
2,1762,Yes,Female
3,1777,Yes,Male
4,1665,No,Male


In [15]:
# Apply the same 0/1 encoding as the training data, on a fresh copy of the test set.
test = df_test.assign(
    Admitted=df_test['Admitted'].map({'No': 0, 'Yes': 1}),
    Gender=df_test['Gender'].map({'Male': 0, 'Female': 1}),
)
test.head()

Unnamed: 0,SAT,Admitted,Gender
0,1323,0,0
1,1725,1,1
2,1762,1,1
3,1777,1,0
4,1665,0,0


In [16]:
# Split the test set into the true outcomes and the features to score.
test_actual = test['Admitted'] # actual 0/1 outcomes to compare predictions against
test_data = test.drop('Admitted', axis=1) # the features that will be tested (SAT and Gender)

In [17]:
# Add the `const` column so the test features line up with the model's terms (const, SAT, Gender).
test_data = sm.add_constant(test_data)
test_data # data that will be tested (19 rows)

Unnamed: 0,const,SAT,Gender
0,1.0,1323,0
1,1.0,1725,1
2,1.0,1762,1
3,1.0,1777,0
4,1.0,1665,0
5,1.0,1556,1
6,1.0,1731,1
7,1.0,1809,1
8,1.0,1930,1
9,1.0,1708,0


In [18]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Return the 2x2 confusion matrix and accuracy of a binary classifier.

    Parameters
    ----------
    data : array-like or DataFrame
        Feature rows to score. They are passed unchanged to ``model.predict``,
        so they must be laid out the way the model expects (including the
        constant column if the model was fit with one).
    actual_values : array-like
        True class labels (0 or 1), one per row of ``data``.
    model : object
        Any object whose ``predict(data)`` returns probabilities in [0, 1].
    threshold : float, default 0.5
        Classification cut-off: a probability >= ``threshold`` counts as
        class 1, anything below counts as class 0.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        Counts with rows = actual class and columns = predicted class.
    accuracy : float
        Share of observations on the main diagonal of ``cm``.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    pred_values = np.asarray(model.predict(data), dtype=float)
    actual = np.asarray(actual_values, dtype=float)

    # Two bins per axis: [0, edge) -> class 0 and [edge, 1] -> class 1.
    # The actual labels are already 0/1, so their edge stays at 0.5; only
    # the predicted probabilities are cut at the caller's threshold.
    actual_edges = np.array([0, 0.5, 1])
    pred_edges = np.array([0, threshold, 1])
    cm = np.histogram2d(actual, pred_values, bins=[actual_edges, pred_edges])[0]

    accuracy = (cm[0, 0] + cm[1, 1]) / cm.sum()
    return cm, accuracy

In [19]:
# Evaluate the fitted model on the test set.
# NOTE: `cm` is a tuple here — cm[0] is the confusion matrix, cm[1] is the accuracy.
cm = confusion_matrix(test_data, test_actual, results_log)
cm

(array([[5.00, 1.00],
        [1.00, 12.00]]),
 0.8947368421052632)

In [20]:
# Label the test-set confusion matrix (cm[0]): rows are actual classes, columns are predicted classes.
cm_df = pd.DataFrame(
    cm[0],
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5.0,1.0
Actual 1,1.0,12.0


In [21]:
# Misclassification rate = off-diagonal counts / all counts of the test confusion matrix (cm[0]).
print(f"Missclassification : {(cm[0][0, 1] + cm[0][1, 0]) / cm[0].sum()!s}")

Missclassification : 0.10526315789473684
