## Logistic Regression Concept
#### Probability of passing an exam versus hours of study

example: https://en.wikipedia.org/wiki/Logistic_regression#Probability_of_passing_an_exam_versus_hours_of_study

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Download the study-hours CSV from GitHub (needs network access) and load it
# into a DataFrame; later cells read its 'Hours' and 'Pass' columns.
url='https://github.com/mathawanup/master_dataset/raw/master/study_hours.csv'
df=pd.read_csv(url)
# Bare last expression: the notebook renders the whole table.
df

In [None]:
# Transposed view: each original row becomes a column, giving a wide, short table.
df.T

In [None]:
# Raw data: hours studied against the outcome (Pass: 1 = pass, 0 = fail).
sns.scatterplot(
    x='Hours',
    y='Pass',
    data=df,
)

In [None]:
# Straight-line (least-squares) fit through the 0/1 Pass labels:
# logistic=False selects the linear model, ci=None turns the confidence band off.
sns.lmplot(
    data=df,
    x='Hours',
    y='Pass',
    logistic=False,
    ci=None,
    height=4,
    aspect=1.5,
    line_kws=dict(color='orange'),
)
# Reference lines used to read the fit as a classifier:
#   red   y = 0.5   (1 / 2, midway between fail = 0 and pass = 1)
#   green x = 2.75  (5.5 / 2 hours)
# The point of interest is where these two lines and the regression line cross.
plt.axhline(0.5, color='red', linestyle='--')
plt.axvline(2.75, color='green', linestyle='--');

In [None]:
# Inject an artificial outlier: overwrite Hours at row label 19 with 200.
# NOTE: this mutates df in place, so every later cell sees the outlier until the
# original value is assigned back.
df.at[19, 'Hours']=200
# To undo: df.at[19, 'Hours']=5.5  (original data)

In [None]:
# Same straight-line fit as before, now on the data containing the artificial
# outlier (Hours = 200 at row label 19).
sns.lmplot(
    data=df,
    x='Hours',
    y='Pass',
    logistic=False,
    ci=None,
    height=4,
    aspect=1.5,
    line_kws=dict(color='orange'),
)
# Reference lines: red y = 0.5, green x = 100 (200 / 2 hours).
# Where the regression line meets them shows the effect of the outlier:
# the linear model now gives wrong predictions.
plt.axhline(0.5, color='red', linestyle='--')
plt.axvline(100, color='green', linestyle='--');

In [None]:
# Put the original Hours value (5.5) back at row label 19, undoing the outlier.
# Alternative outlier to try instead: df.at[19, 'Hours']=100
df.at[19, 'Hours']=5.5

In [None]:
# Logistic (sigmoid) fit: logistic=True makes seaborn treat Pass as a binary
# outcome (seaborn relies on statsmodels for this fit). ci is left at its
# default, so a confidence band is drawn around the curve.
sns.lmplot(
    data=df,
    x='Hours',
    y='Pass',
    logistic=True,
    height=4,
    aspect=1.5,
    line_kws=dict(color='orange'),
)
plt.ylabel('Probability of passing exam')
# Reference lines: green x = 2.71 hours, red y = 0.5 probability.
# NOTE(review): 2.71 is presumably where the fitted curve reaches 0.5 — confirm on the plot.
plt.axvline(2.71, color='green', linestyle='--')
plt.axhline(0.5, color='red', linestyle='--');

In [None]:
# Inject an artificial outlier again, this time Hours = 100 at row label 19.
# NOTE: this mutates df in place; later cells see the outlier until it is undone.
df.at[19, 'Hours']=100
# To undo: df.at[19, 'Hours']=5.5  (original data)

In [None]:
# Logistic fit on the data containing the artificial outlier (Hours = 100 at
# row label 19); the reference lines are kept where they were for the clean data.
sns.lmplot(
    data=df,
    x='Hours',
    y='Pass',
    logistic=True,
    height=4,
    aspect=1.5,
    line_kws=dict(color='orange'),
)
plt.ylabel('Probability of passing exam')
# Green x = 2.71 hours, red y = 0.5 probability.
# Takeaway: the logistic model can still predict sensibly despite the outlier.
plt.axvline(2.71, color='green', linestyle='--')
plt.axhline(0.5, color='red', linestyle='--');

In [None]:
# Put the original Hours value (5.5) back at row label 19, then display the
# transposed frame so the restored value can be checked.
df.at[19, 'Hours']=5.5
df.T

In [None]:
# Logistic (sigmoid) fit once more, on the restored original data
# (logistic=True; seaborn relies on statsmodels for this fit).
sns.lmplot(
    data=df,
    x='Hours',
    y='Pass',
    logistic=True,
    height=4,
    aspect=1.5,
    line_kws=dict(color='orange'),
)
plt.ylabel('Probability of passing exam')
# Reference lines: green x = 2.71 hours, red y = 0.5 probability.
plt.axvline(2.71, color='green', linestyle='--')
plt.axhline(0.5, color='red', linestyle='--');

## Scikit-learn: LogisticRegression
ศึกษาเพิ่มเติม: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [None]:
from sklearn.model_selection import train_test_split # split data set
from sklearn.linear_model import LogisticRegression # Logistic Regression model

In [None]:
# Show the data (transposed) once more before it is split for model training.
df.T

In [None]:
# Separate the feature matrix from the target and hold out part of the rows for testing.
X = df[['Hours']]   # double brackets -> DataFrame (2-D), the shape used for features
y = df['Pass']      # single brackets -> Series (1-D) of labels
test_size = 0.3     # fraction of rows reserved for the test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=7,  # fixed seed so the split (and the results) are reproducible
)

In [None]:
# TODO: show X_train, X_test, y_train, y_test (this cell currently displays nothing)

In [None]:
# Create the (not yet fitted) classifier. Only the solver is set explicitly;
# every other hyperparameter keeps its library default.
# model = LogisticRegression()  # alternative: no hyperparameters specified at all
model = LogisticRegression(solver='lbfgs')

# Bare expression: display the estimator's repr.
model

In [None]:
# Train the classifier on the training split only; the test rows stay unseen.
model.fit(X_train, y_train)

In [None]:
# Held-out feature rows (Hours) that the model is evaluated on.
X_test

In [None]:
# Predicted class label (Pass: 1 = pass, 0 = fail) for each held-out row.
predicted = model.predict(X_test)
predicted

In [None]:
# True labels for the same held-out rows, as a plain array, for side-by-side
# comparison with `predicted`.
y_test.values

In [None]:
# Accuracy of the model on the test split (share of rows predicted correctly).
# NOTE(review): the original note records 5/6 correct — re-run to confirm.
model.score(X_test, y_test)

In [None]:
# Class probabilities for each test row. Columns follow model.classes_, so with
# labels 0/1 the order is [P(fail), P(pass)] — not [pass, fail].
# Each row sums to 1, i.e. P(fail) = 1 - P(pass).
model.predict_proba(X_test)

## scikit-learn: confusion matrix

<table>
<tr>
    <td></td>
    <td>predicted false</td>
    <td>predicted true</td>
</tr>
<tr>
    <td>actual false</td>
    <td>tn</td>
    <td>fp</td>
</tr>
<tr>
    <td>actual true</td>
    <td>fn</td>
    <td>tp</td>
</tr>
</table>

$$
\begin{aligned}
Accuracy &= \frac{TP + TN}{TP + TN + FP + FN} \\
Precision &= \frac{TP}{TP + FP} \\
Recall &= \frac{TP}{TP + FN} \\
F1 &= 2 \times \frac{Precision \times Recall}{Precision + Recall}
\end{aligned}
$$

* https://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
* https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
* https://en.wikipedia.org/wiki/Confusion_matrix

In [None]:
from sklearn import metrics

In [None]:
# Confusion matrix: rows are actual classes, columns are predicted classes.
# With labels 0/1 the layout is [[tn, fp], [fn, tp]].
metrics.confusion_matrix(y_test, predicted)

In [None]:
# Accuracy = (TP + TN) / (TP + TN + FP + FN), computed from true vs. predicted labels.
metrics.accuracy_score(y_test, predicted)

In [None]:
# Precision = TP / (TP + FP): of the rows predicted as pass, the share that really passed.
metrics.precision_score(y_test, predicted)

In [None]:
# Recall = TP / (TP + FN): of the rows that really passed, the share the model found.
metrics.recall_score(y_test, predicted)

In [None]:
# F1 = 2 * precision * recall / (precision + recall).
metrics.f1_score(y_test, predicted)

In [None]:
# Text summary of precision / recall / F1 / support per class.
# print() is used because the report is a multi-line string; a bare expression
# would show its repr with literal "\n" characters.
print(metrics.classification_report(y_test, predicted))