# Logistic Regression process

## Import relevant libraries

In [3]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default styling to matplotlib figures.
sns.set()

## Load data

In [4]:
# Read the admissions dataset; the path is relative to the current working directory.
raw_data = pd.read_csv(filepath_or_buffer="Admittance Logical Regression.csv")
# Preview the first five rows to confirm the file was parsed as expected.
raw_data.head(n=5)

Unnamed: 0,SAT,Admitted
0,1363,No
1,1792,Yes
2,1954,Yes
3,1653,No
4,1593,No


In [5]:
# Encode the categorical target numerically so it can be used as the dependent
# variable of the regression: "Yes" -> 1, "No" -> 0.
# Work on a copy so raw_data keeps the original labels and this cell can be re-run safely.
data = raw_data.copy()
data["Admitted"] = data["Admitted"].map({"Yes": 1, "No": 0})

# Series.map() silently turns every label that is not a key of the dict
# (e.g. "yes", " No", a blank cell) into NaN. Fail loudly here rather than
# letting missing values reach the model fit further down.
if data["Admitted"].isna().any():
    raise ValueError('Column "Admitted" contains values other than "Yes" and "No"')

data

Unnamed: 0,SAT,Admitted
0,1363,0
1,1792,1
2,1954,1
3,1653,0
4,1593,0
...,...,...
163,1722,1
164,1750,1
165,1555,0
166,1524,0


## Variables

In [6]:
# x1: the single predictor (SAT score); y: the binary outcome (Admitted, encoded as 0/1).
x1, y = data["SAT"], data["Admitted"]

## Regression

In [7]:
# Add a constant column to the predictor so the model can estimate an intercept ("const").
x = sm.add_constant(x1)
# Define a logistic (Logit) regression of y on the constant and SAT ...
reg_log = sm.Logit(endog=y, exog=x)
# ... and fit it; fitting prints the optimizer's termination message.
results_log = reg_log.fit()

Optimization terminated successfully.
         Current function value: 0.137766
         Iterations 10


  x = pd.concat(x[::order], 1)


In [8]:
## The optimization message means the regression was fitted successfully after 10 iterations; "Current function value" is the value of the objective function at that point

## statsmodels uses a machine learning (iterative optimisation) algorithm to fit the regression, like most modern libraries
## We need to know about the function value because it is possible that, after a certain number of iterations,
## the model still has not learned the relationship, i.e. it cannot optimise the optimisation function
## The default maximum number of iterations in statsmodels is 35; if it is reached without converging, a convergence warning is shown

## Regression summary

In [9]:
# Display the summary table of the fitted Logit model (fit statistics and coefficient estimates).
results_log.summary()

0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,166.0
Method:,MLE,Df Model:,1.0
Date:,"Thu, 07 Apr 2022",Pseudo R-squ.:,0.7992
Time:,12:44:26,Log-Likelihood:,-23.145
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,5.805000000000001e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-69.9128,15.737,-4.443,0.000,-100.756,-39.070
SAT,0.0420,0.009,4.454,0.000,0.024,0.060


In [10]:
## MLE: maximum likelihood estimation
## It uses the likelihood function - a measure of how likely it is that the model describes the underlying relationship of the variables
## The bigger the likelihood, the higher the probability that the model is correct
## MLE tries to maximise the likelihood function
## MLE searches for the best model and stops the optimization when it finds it; this is also how a typical machine learning process works

In [11]:
## Log-likelihood is a good measure. It is almost always negative, and the bigger (closer to 0) the better
## LL-Null: the log-likelihood of a model with no independent variables
## It uses the same y as the other models, and its sole independent variable is an array of 1's, i.e. y = b0 * 1
## The 1 multiplying b0 is the same constant that is added with sm.add_constant()

In [12]:
## It is good to compare the log-likelihood with LL-Null to see whether the model has any explanatory power, like the F-statistic for linear regression
## The measure to look at for this is the LLR p-value: it tests whether our model is statistically different from the LL-Null model, a.k.a. a useless model
## Here the LLR p-value is about 5.8e-42 (effectively 0.000), so the model is significant

In [13]:
## Pseudo R-squared is like R-squared for linear regression, but not exactly the same
## AIC, BIC and McFadden's R-squared are examples of such fit measures (strictly, AIC and BIC are information criteria); the "Pseudo R-squ." in the summary is McFadden's
## According to McFadden, a good pseudo R-squared is between 0.2 and 0.4
## This measure is mostly used for comparing variations of the same model

In [14]:
# Print floats inside NumPy arrays with two decimals so the probabilities are easy to scan.
# Note: set_printoptions changes NumPy's print settings for the rest of the session.
np.set_printoptions(formatter={"float": '{0:0.2f}'.format})
# Predicted values of the fitted model for the observations it was fitted on.
results_log.predict()

array([0.00, 1.00, 1.00, 0.38, 0.05, 0.98, 0.99, 1.00, 1.00, 0.03, 1.00,
       1.00, 0.50, 0.00, 0.77, 1.00, 0.21, 0.22, 0.23, 1.00, 1.00, 1.00,
       0.00, 0.00, 0.91, 1.00, 0.22, 1.00, 1.00, 0.98, 0.00, 0.61, 0.41,
       1.00, 1.00, 1.00, 0.50, 1.00, 0.39, 0.00, 0.04, 0.20, 1.00, 0.00,
       1.00, 0.00, 0.98, 0.00, 0.00, 0.01, 0.01, 1.00, 0.80, 0.01, 1.00,
       0.00, 0.56, 0.95, 0.22, 0.99, 0.01, 0.89, 1.00, 1.00, 0.99, 0.00,
       0.00, 0.00, 1.00, 0.00, 0.89, 0.22, 0.00, 1.00, 1.00, 1.00, 0.00,
       0.49, 1.00, 1.00, 0.01, 1.00, 1.00, 0.65, 1.00, 1.00, 0.00, 1.00,
       0.99, 0.72, 0.60, 0.00, 0.95, 0.91, 0.00, 1.00, 1.00, 0.01, 0.98,
       0.88, 0.99, 0.00, 0.99, 0.03, 0.00, 0.99, 0.99, 1.00, 0.00, 0.00,
       0.04, 0.52, 0.00, 1.00, 0.16, 0.00, 0.92, 0.00, 0.47, 1.00, 1.00,
       0.02, 0.00, 0.00, 1.00, 0.01, 0.96, 0.75, 0.72, 0.94, 0.60, 0.00,
       1.00, 0.00, 0.00, 0.00, 1.00, 0.08, 0.00, 0.01, 1.00, 0.98, 0.70,
       0.99, 1.00, 0.02, 0.00, 0.00, 0.00, 0.83, 1.

In [15]:
# Actual outcomes (0/1) as a NumPy array, for comparison with the predicted values in the previous cell.
np.array(data["Admitted"])

array([0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0], dtype=int64)

In [16]:
# Confusion matrix of the fitted model as a labelled table:
# rows are the actual classes, columns are the predicted classes.
cm_df = pd.DataFrame(
    results_log.pred_table(),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,67.0,7.0
Actual 1,7.0,87.0


In [17]:
# Accuracy = correctly classified observations (the diagonal of the
# confusion matrix) divided by the total number of observations.
cm = np.array(cm_df)
accuracy_train = np.trace(cm) / cm.sum()
accuracy_train

0.9166666666666666

In [18]:
# Express the training accuracy as a percentage.
accurate = 100 * accuracy_train
accurate

91.66666666666666