In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from patsy import dmatrices
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
pip install gunicorn flask

In [None]:
# Working directory for relative file I/O (e.g. the pickled model).
# The original hard-coded path is kept as the default, but it can be overridden
# with the LOGISTIC_WORK_DIR environment variable so the notebook also runs on
# machines where that Windows path does not exist.
WORK_DIR = os.environ.get(
    "LOGISTIC_WORK_DIR",
    "C:\\Users\\TBRC-lap35\\Desktop\\Assignments\\logistic",
)
if os.path.isdir(WORK_DIR):
    os.chdir(WORK_DIR)
else:
    # Stay where we are instead of aborting the whole run with FileNotFoundError.
    print(f"Working directory {WORK_DIR!r} not found; staying in {os.getcwd()!r}")

In [None]:
# Load the "fair" dataset bundled with statsmodels and keep its DataFrame.
fair_dataset = sm.datasets.fair.load_pandas()
data = fair_dataset.data

In [None]:
# Preview the first rows of the raw dataset.
data.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
data.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Binary target: 1 when the 'affairs' measure is above zero, otherwise 0.
data["affair"] = data["affairs"].gt(0).astype(int)

In [None]:
# Preview again to confirm the new 'affair' column is present.
data.head(n=5)

In [None]:
# Column means for each occupation code.
data.groupby(by=['occupation']).mean()

In [None]:
# Column means for each marriage rating.
data.groupby(by=['rate_marriage']).mean()

In [None]:
# Column means split by the binary 'affair' target.
data.groupby(by=['affair']).mean()

In [None]:
# Column means for each value of years married.
data.groupby(by=['yrs_married']).mean()

# Data Visualization

In [None]:
# Histogram of the 'children' column, labelled through the returned Axes.
children_ax = data["children"].hist()
children_ax.set_xlabel("Children")
children_ax.set_ylabel("Frequency")
children_ax.set_title("No of Children")

In [None]:
# Bar chart of row counts per occupation, split by affair status.
occupation_counts = pd.crosstab(data['occupation'], data['affair'])
occupation_ax = occupation_counts.plot(kind="bar")
occupation_ax.set_title("Occupation Distribution by Affair Status")

In [None]:
# Bar chart of row counts per age value, split by affair status.
age_counts = pd.crosstab(data['age'], data['affair'])
age_ax = age_counts.plot(kind="bar")
age_ax.set_title("Age Distribution by Affair Status")

In [None]:
# Bar chart of row counts per marriage rating, split by affair status.
rating_counts = pd.crosstab(data['rate_marriage'], data['affair'])
rating_ax = rating_counts.plot(kind="bar")
rating_ax.set_title("Marriage Rating Distribution by Affair Status")

# Prepare Data for Logistic Regression

In [None]:
# Build the target (y) and feature (x) DataFrames from a patsy formula.
# Both occupation columns are wrapped in C(...) so they are treated as categorical.
model_formula = 'affair ~ rate_marriage + age + yrs_married + children + religious + educ + C(occupation) + C(occupation_husb)'
y, x = dmatrices(
    model_formula,
    data,
    return_type="dataframe",
)

In [None]:
# Shorten the patsy-generated dummy column names, e.g.
# 'C(occupation)[T.2.0]' -> 'occ_2' and 'C(occupation_husb)[T.6.0]' -> 'occ_husb_6'.
dummy_column_names = {
    f"C({source_col})[T.{level}.0]": f"{short_prefix}_{level}"
    for source_col, short_prefix in (("occupation", "occ"), ("occupation_husb", "occ_husb"))
    for level in range(2, 7)
}
x = x.rename(columns=dummy_column_names)

In [None]:
# Flatten y into a 1-D array.
y = np.asanyarray(y).ravel()

In [None]:
# Preview the design matrix after renaming the dummy columns.
x.head(n=5)

# Logistic Regression

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=355,
)

In [None]:
# Fit a logistic regression with scikit-learn's default settings on the training split.
log_reg = LogisticRegression()
log_reg.fit(X=x_train, y=y_train)

In [None]:
# Predicted class labels for the held-out test rows.
y_pred = log_reg.predict(X=x_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix of true test labels against predicted labels.
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
conf_mat

In [None]:
# scikit-learn's confusion_matrix puts true labels on the rows and predicted
# labels on the columns, ordered by label value. For the binary 0/1 target the
# layout is therefore
#     [[TN, FP],
#      [FN, TP]]
# so the positive class (affair == 1) is in the bottom-right cell, not the top-left.
# Flattening row by row yields the counts in the order TN, FP, FN, TP.
true_negative, false_positive, false_negative, true_positive = conf_mat.ravel()

In [None]:
# Accuracy by hand: correctly classified rows divided by all rows in the matrix.
Accuracy = (true_positive + true_negative) / sum(
    (true_positive, false_positive, false_negative, true_negative)
)
Accuracy

In [None]:
# Precision: true positives divided by everything predicted positive (TP + FP).
predicted_positive = true_positive + false_positive
Precision = true_positive / predicted_positive
Precision

In [None]:
# Recall: true positives divided by everything actually positive (TP + FN).
actual_positive = true_positive + false_negative
Recall = true_positive / actual_positive
Recall

In [None]:
# F1 score: harmonic mean of precision and recall.
F1_Score = (2 * (Precision * Recall)) / (Precision + Recall)
F1_Score

In [None]:
# Persist the fitted classifier so it can be reloaded later for prediction.
# (pickle is imported in the notebook's top import cell.)
# NOTE: only unpickle this file from a trusted source - pickle.load can execute
# arbitrary code.
with open('modelForPrediction.pickle', 'wb') as model_file:
    pickle.dump(log_reg, model_file)