In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import binarize
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
# Read the raw CSV (path is relative to the notebook's working directory) into a DataFrame.
DATA_PATH = "data/student-alcohol-maths.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Sanity check: dimensions of the loaded frame as (rows, columns).
df.shape

(395, 33)

In [4]:
## Target: the "sex" column is the label the model is trained to predict.
target = df['sex']

## Features: every other column, with non-numeric columns expanded into
## indicator (dummy) columns by get_dummies so the model receives numeric input.
features = pd.get_dummies(df.drop('sex', axis=1))

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Fit a logistic regression with default hyper-parameters on the training split.
# fit() returns the estimator itself, so it can be chained onto the constructor.
logreg = LogisticRegression().fit(X_train, y_train)
# Bare last expression: display the fitted estimator and its parameters.
logreg

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [7]:
# Predict a class label for every row of the held-out test features.
y_pred_class = logreg.predict(X_test)

In [8]:
# Test-set accuracy: fraction of held-out rows whose predicted label matches the true label.
print(accuracy_score(y_true=y_test, y_pred=y_pred_class))

0.6708860759493671


In [9]:
# Class distribution of the true test labels (counts per class, most frequent first).
y_test.value_counts()

M    41
F    38
Name: sex, dtype: int64

In [10]:
# Null accuracy: the accuracy achieved by always predicting the most frequent class.
# value_counts() sorts descending, so the first entry is the majority class.
y_test.value_counts().iloc[:1] / y_test.shape[0]

M    0.518987
Name: sex, dtype: float64

In [12]:
# Confusion matrix: rows are the true classes, columns are the predicted classes.
# IMPORTANT: the first argument is the true values, the second is the predicted values.
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class))

[[25 13]
 [13 28]]


In [13]:
# Keep the confusion matrix and pull out its four cells.
# Layout (rows = true class, columns = predicted class), treating index 1 as "positive":
#     [[TN, FP],
#      [FN, TP]]
confusion = metrics.confusion_matrix(y_test, y_pred_class)
TN, FP = confusion[0, 0], confusion[0, 1]
FN, TP = confusion[1, 0], confusion[1, 1]

In [14]:
# Classification accuracy from the confusion-matrix counts:
# correct predictions (TP + TN) divided by all predictions.
float(TP + TN) / (TP + TN + FP + FN)

0.6708860759493671

In [15]:
# Classification error (misclassification rate): overall, how often is the classifier wrong?
# Computed two ways: by hand from the confusion-matrix counts, and as 1 - accuracy.
# The hand calculation used to be a bare non-final expression whose value was
# silently discarded; it is now named and cross-checked against the sklearn value.
classification_error = (FP + FN) / float(TP + TN + FP + FN)
assert np.isclose(classification_error, 1 - metrics.accuracy_score(y_test, y_pred_class))
print(1- metrics.accuracy_score(y_test, y_pred_class))

0.3291139240506329


In [17]:
# Sensitivity (recall / true positive rate):
# of the rows that are actually positive, the fraction predicted positive.
float(TP) / (TP + FN)

0.6829268292682927

In [18]:
# Specificity (true negative rate):
# of the rows that are actually negative, the fraction predicted negative.
float(TN) / (TN + FP)

0.6578947368421053

In [19]:
# False positive rate (equal to 1 - specificity):
# of the rows that are actually negative, the fraction wrongly predicted positive.
float(FP) / (TN + FP)

0.34210526315789475

In [22]:
# Precision: of the rows predicted positive, the fraction that are actually positive.
float(TP) / (FP + TP)

0.6829268292682927