In [1]:
# Enable automatic flake8 linting of each executed cell (pycodestyle_magic).
# E703 ("statement ends with a semicolon") is ignored because a trailing `;`
# is used in this notebook to suppress a cell's output repr.
%load_ext pycodestyle_magic
%flake8_on --ignore E703

In [2]:
import os
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt

# Task 1

In [3]:
# Locations of the input / output TSV files, all inside the data folder.
data_dir = 'data'
data_file, test_data_file, results_file, output_file = (
    os.path.join(data_dir, file_name)
    for file_name in ('train.tsv', 'test.tsv', 'results.tsv', 'out.tsv')
)

In [4]:
# Column labels assigned to the training file's columns, in file order.
df_names = [
    'Occupancy',
    'Date',
    'Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio',
]

# Read the tab-separated training data and discard rows with missing values.
df = pd.read_csv(data_file, sep='\t', names=df_names).dropna()

### Logistic regression classifier on one independent variable - training data

In [5]:
# percentage of occupancy and zero rule model accuracy
# Share of rows labelled occupied; a fraction in [0, 1] despite the name
# (assumes Occupancy is coded 0/1).
occupancy_percentage = df["Occupancy"].mean()
# The zero-rule model always predicts the most frequent class, so its
# accuracy is the share of the majority class, whichever class that is.
zero_rule_accuracy = max(occupancy_percentage, 1 - occupancy_percentage)
print("Occupancy percentage is: " + str(round(occupancy_percentage, 4)))
print("Zero rule model accuracy on training set is: "
      + str(round(zero_rule_accuracy, 4)))

Occupancy percentage is: 0.2123
Zero rule model accuracy on training set is: 0.7877


In [6]:
# Single-feature setup: Temperature is the only predictor of Occupancy.
y_train = df['Occupancy']
X_train = df[['Temperature']]
clr = LogisticRegression()

In [7]:
# Fit on the training data, then predict the same rows to obtain
# in-sample predictions (fit returns the fitted estimator itself).
y_train_pred = clr.fit(X_train, y_train).predict(X_train)

#### Accuracy for training set

In [8]:
# Fraction of training rows whose predicted label equals the true label.
# accuracy_score is used (as for the other models in this notebook) so the
# result depends only on y_train / y_train_pred and not on len(df).
clr_accuracy = accuracy_score(y_train, y_train_pred)
print("Training set accuracy for logisitic regression model "
      + "on Temperature variable:\n" + str(clr_accuracy))

Training set accuracy for logisitic regression model on Temperature variable:
0.8240206312169962


#### Sensitivity
= true_positive / (true_positive + false_negative): the proportion of observed positives that were predicted to be positive.
#### Specificity
= true_negative / (true_negative + false_positive): the proportion of observed negatives that were predicted to be negative.

In [9]:
# source: https://www.theanalysisfactor.com/sensitivity-and-specificity/
# Confusion matrix of the Temperature-only model on the training data,
# flattened row by row into its four counts.
conf_matrix = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
(true_n, false_p,
 false_n, true_p) = conf_matrix.ravel()

In [10]:
# Sensitivity: share of observed positives that were predicted positive.
actual_positives = true_p + false_n
clr_sensitivity = true_p / actual_positives
print("Training set sensitivity for logisitic regression model "
      "on Temperature variable:\n" + str(clr_sensitivity))

Training set sensitivity for logisitic regression model on Temperature variable:
0.4557547715442452


In [11]:
# Specificity: share of observed negatives that were predicted negative.
actual_negatives = true_n + false_p
clr_specificity = true_n / actual_negatives
print("Training set specificity for logisitic regression model "
      "on Temperature variable:\n" + str(clr_specificity))

Training set specificity for logisitic regression model on Temperature variable:
0.9232927970065482


### Logistic regression classifier on all but 'date' independent variables - training data

In [12]:
# Second model: every measured variable except the date is a predictor.
all_feature_names = ['Temperature', 'Humidity', 'Light',
                     'CO2', 'HumidityRatio']
X_train_all = df[all_feature_names]
clr_all = LogisticRegression()

In [13]:
# Fit the full model, then predict the training rows it was fitted on
# (fit returns the fitted estimator, so the calls can be chained).
fitted_clr_all = clr_all.fit(X_train_all, y_train)
y_train_pred_all = fitted_clr_all.predict(X_train_all)

In [14]:
# In-sample accuracy of the full model.
clr_all_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred_all)
accuracy_message = ("Training set accuracy for logisitic regression model "
                    "on all but date variable: ")
print(accuracy_message + str(clr_all_accuracy))

Training set accuracy for logisitic regression model on all but date variable: 0.9860002456097261


In [15]:
# source: https://www.theanalysisfactor.com/sensitivity-and-specificity/
# Confusion matrix of the full model on the training data,
# flattened row by row into its four counts.
conf_matrix = confusion_matrix(y_true=y_train, y_pred=y_train_pred_all)
(true_n, false_p,
 false_n, true_p) = conf_matrix.ravel()

In [16]:
# Sensitivity of the full model (all variables except the date): share of
# observed positives that were predicted positive.
clr_all_sensitivity = true_p/(false_n + true_p)
# The label names the full model; it previously said "Temperature variable",
# which described the single-feature model instead.
print("Training set sensitivity for logisitic regression model "
      + "on all but date variable:\n" + str(clr_all_sensitivity))

Training set sensitivity for logisitic regression model on Temperature variable:
0.9866975130133024


In [17]:
# Specificity of the full model (all variables except the date): share of
# observed negatives that were predicted negative.
clr_all_specificity = true_n/(true_n+false_p)
# The label names the full model; it previously said "Temperature variable",
# which described the single-feature model instead.
print("Training set specificity for logisitic regression model "
      + "on all but date variable:\n" + str(clr_all_specificity))

Training set specificity for logisitic regression model on Temperature variable:
0.9858122856251949


### Logistic regression classifier on all but 'date' independent variables - TEST data

In [18]:
# load data into model
# Labels for every column of the test file.
df_column_names_all = ['Date', 'Temperature', 'Humidity',
                       'Light', 'CO2', 'HumidityRatio']
# Predictors: every column except the date.
X_column_names_all = [column for column in df_column_names_all
                      if column != 'Date']

# Only the predictor columns are kept when reading the test file.
X_test_all = pd.read_csv(
    test_data_file,
    sep='\t',
    names=df_column_names_all,
    usecols=X_column_names_all,
)

# True labels for the test rows, stored as a categorical column `y`.
df_results_all = pd.read_csv(
    results_file, sep='\t', names=['y']
).astype({'y': 'category'})

In [19]:
# Ground-truth labels for the test rows, taken from the results file.
y_true_all = df_results_all.loc[:, 'y']

In [20]:
# Predict occupancy for the test rows with the full model, then score the
# predictions against the true labels with sklearn's accuracy_score.
y_test_pred_all = clr_all.predict(X_test_all)
clr_test_accuracy_all = accuracy_score(y_true=y_true_all,
                                       y_pred=y_test_pred_all)
print('Accuracy on test dataset (full model): '
      + str(clr_test_accuracy_all))

Accuracy on test dataset (full model): 0.976360225140713


In [21]:
# source: https://www.theanalysisfactor.com/sensitivity-and-specificity/
# Confusion matrix of the full model on the test data,
# flattened row by row into its four counts.
conf_matrix = confusion_matrix(y_true=y_true_all, y_pred=y_test_pred_all)
(true_n, false_p,
 false_n, true_p) = conf_matrix.ravel()

In [22]:
# Test-set sensitivity of the full model: true positives over all
# observed positives.
actual_positives = true_p + false_n
clr_test_sensitivity_all = true_p / actual_positives
print("Sensitivity on test dataset (full model):"
      + str(clr_test_sensitivity_all))

Sensitivity on test dataset (full model):0.9907407407407407


In [23]:
# Test-set specificity of the full model: true negatives over all
# observed negatives.
actual_negatives = true_n + false_p
clr_test_specificity_all = true_n / actual_negatives
print("Specificity on test dataset (full model):"
      + str(clr_test_specificity_all))

Specificity on test dataset (full model):0.9681039574719433


### Logistic regression classifier on one independent variable - TEST data

In [24]:
# load data into model
# Labels for every column of the test file.
df_column_names = ['Date', 'Temperature', 'Humidity',
                   'Light', 'CO2', 'HumidityRatio']
# The single-feature model only needs the Temperature column.
X_column_names = ['Temperature']

X_test = pd.read_csv(
    test_data_file,
    sep='\t',
    names=df_column_names,
    usecols=X_column_names,
)

# True labels for the test rows, stored as a categorical column `y`.
df_results = pd.read_csv(
    results_file, sep='\t', names=['y']
).astype({'y': 'category'})

In [25]:
# Ground-truth labels for the test rows, taken from the results file.
y_true = df_results.loc[:, 'y']

In [26]:
# Predict occupancy for the test rows with the Temperature-only model, then
# score the predictions with sklearn's accuracy_score.
y_test_pred = clr.predict(X_test)
clr_test_accuracy = accuracy_score(y_true=y_true, y_pred=y_test_pred)
print('Accuracy on test dataset (only Temperature): '
      + str(clr_test_accuracy))

Accuracy on test dataset (only Temperature): 0.8532833020637899


In [27]:
# source: https://www.theanalysisfactor.com/sensitivity-and-specificity/
# Confusion matrix of the Temperature-only model on the test data,
# flattened row by row into its four counts.
conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_test_pred)
(true_n, false_p,
 false_n, true_p) = conf_matrix.ravel()

In [28]:
# Test-set sensitivity of the Temperature-only model: true positives over
# all observed positives.
actual_positives = true_p + false_n
clr_test_sensitivity = true_p / actual_positives
print("Sensitivity on test dataset (only Temperature):"
      + str(clr_test_sensitivity))

Sensitivity on test dataset (only Temperature):0.727366255144033


In [29]:
# Test-set specificity of the Temperature-only model: true negatives over
# all observed negatives.
actual_negatives = true_n + false_p
clr_test_specificity = true_n / actual_negatives
print("Specificity on test dataset (only Temperature):"
      + str(clr_test_specificity))

Specificity on test dataset (only Temperature):0.9255759007678677


In [30]:
# saving the data
# Write the Temperature-only model's test predictions (y_test_pred), one per
# line, without index or header. A dedicated name is used instead of `df`
# so the training DataFrame is not overwritten and the earlier cells that
# read `df` stay re-runnable.
# NOTE(review): the full model's predictions (y_test_pred_all) are not the
# ones saved here - confirm the single-feature output is the intended one.
test_predictions = pd.DataFrame(y_test_pred)
test_predictions.to_csv(output_file, index=False, header=False)