## Logistic Regression Model

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
%matplotlib inline

In [17]:
wq = pd.read_csv('../datasets/water_clean.csv')

In [18]:
wq.isnull().sum()

ph                 0
hardness           0
solids             0
chloramines        0
sulfate            0
conductivity       0
organic_carbon     0
trihalomethanes    0
turbidity          0
potability         0
dtype: int64

In [28]:
wq.corr()[['potability']].abs().sort_values('potability', ascending=False)

Unnamed: 0,potability
potability,1.0
solids,0.040674
turbidity,0.022682
chloramines,0.020784
organic_carbon,0.015567
conductivity,0.015496
sulfate,0.015303
ph,0.01453
trihalomethanes,0.009244
hardness,0.001505


In [19]:
features = ['ph', 'hardness', 'solids', 'chloramines', 'sulfate', 'conductivity', 
            'organic_carbon', 'trihalomethanes', 'turbidity']

# Define y and X.
y = wq[['potability']]
X = wq[features]

# Import train_test_split.
from sklearn.model_selection import train_test_split
# Create training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.33,
                                                    random_state = 42)

In [20]:
ss = StandardScaler()
ss.fit(X_train) # ONLY fit to X_train, NEVER to X_test

X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [23]:
# Step 1: Instantiate our model.
logreg = LogisticRegression(solver = 'liblinear', class_weight= {1: 0.1})

# Step 2: Fit our model.
logreg.fit(X_train_sc, y_train)

print(f'Logistic Regression Intercept: {logreg.intercept_}')
print(f'Logistic Regression Coefficient: {logreg.coef_}')

Logistic Regression Intercept: [-2.63856291]
Logistic Regression Coefficient: [[ 0.10100454  0.00463041  0.10921705  0.10986971 -0.03990635 -0.07767876
  -0.01099498  0.05966059  0.09472528]]


  y = column_or_1d(y, warn=True)


In [24]:
# Step 4: Evaluate model.
logreg.score(X_test_sc, y_test)

0.6009036144578314

In [25]:
# Generate predictions.
preds = logreg.predict(X_test_sc)

In [27]:
def nice_conmat(y_test, preds, classes):
    conmat = confusion_matrix(y_test, preds)
    return pd.DataFrame(conmat, columns=['Predicted' +class_ for class_ in classes], \
                index=['Actually'+ class_ for class_ in classes])
nice_conmat(y_test, preds, ['non-potable', 'potable'])

Unnamed: 0,Predictednon-potable,Predictedpotable
Actually non-potable,399,0
Actually potable,265,0
