In [1]:
# Load the pycodestyle_magic extension and switch on flake8 checking for the
# cells executed after this one. E703 ("statement ends with a semicolon") is
# ignored because this notebook ends a statement with ';' in a later cell.
%load_ext pycodestyle_magic
%flake8_on --ignore E703

In [2]:
import os
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt

# Task 1

In [3]:
# Locations of the input and output TSV files, all inside the 'data' folder.
data_file, test_data_file, results_file, output_file = (
    os.path.join('data', file_name)
    for file_name in ('train.tsv', 'test.tsv', 'results.tsv', 'out.tsv')
)

In [4]:
# Column names assigned to the training file (the target comes first)
df_names = ['Occupancy', 'Date', 'Temperature', 'Humidity',
            'Light', 'CO2', 'HumidityRatio']

# Read the tab-separated training data and discard rows with missing values
df = pd.read_csv(data_file, sep='\t', names=df_names).dropna()

## Logistic regression classifier on one independent variable - training data

In [5]:
# Share of positive (occupied) samples and the zero-rule baseline accuracy.
# Despite its name, `occupancy_percentage` is a fraction in [0, 1].
occupancy_percentage = df["Occupancy"].mean()

# The zero-rule model always predicts the most frequent class, so its
# accuracy is the share of whichever class is larger. Using 1 - p alone is
# only right when the negative class is the majority.
zero_rule_accuracy = max(occupancy_percentage, 1 - occupancy_percentage)

print("Occupancy percentage is: " + str(round(occupancy_percentage, 4)))
print("Zero rule model accuracy on training set is: "
      + str(round(zero_rule_accuracy, 4)))

Occupancy percentage is: 0.2123
Zero rule model accuracy on training set is: 0.7877


In [6]:
# Single-feature setup: predict Occupancy from the Temperature column alone
y_train = df['Occupancy']
X_train = df.loc[:, ['Temperature']]
clr = LogisticRegression()

In [7]:
# fit() returns the fitted estimator, so prediction can be chained onto it
y_train_pred = clr.fit(X_train, y_train).predict(X_train)

### Accuracy for training set

In [8]:
# Training accuracy of the Temperature-only model. accuracy_score is used
# here, as in the later cells, instead of a hand-rolled sum(...) / len(df),
# which tied the denominator to `df` rather than to the labels compared.
clr_accuracy = accuracy_score(y_train, y_train_pred)
print("Training set accuracy for logisitic regression model "
      "on Temperature variable:\n" + str(clr_accuracy))

Training set accuracy for logisitic regression model on Temperature variable:
0.8240206312169962


### Sensitivity
= true_positive/(false_negative + true_positive): The proportion of observed positives that were predicted to be positive

### Specificity
= true_negative/(true_negative + false_positive): The proportion of observed negatives that were predicted to be negative.

In [9]:
# source: https://www.theanalysisfactor.com/sensitivity-and-specificity/
# Binary confusion matrix of the Temperature-only model on the training set.
# Row 0 holds (true negatives, false positives) and row 1 holds
# (false negatives, true positives).
conf_matrix = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
(true_n, false_p), (false_n, true_p) = conf_matrix

In [10]:
# Sensitivity: true positives divided by all observed positives
clr_sensitivity = true_p / (true_p + false_n)
print("Training set sensitivity for logisitic regression model "
      "on Temperature variable:\n", clr_sensitivity, sep="")

Training set sensitivity for logisitic regression model on Temperature variable:
0.4557547715442452


In [11]:
# Specificity: true negatives divided by all observed negatives
clr_specificity = true_n / (false_p + true_n)
print("".join(["Training set specificity for logisitic regression model ",
               "on Temperature variable:\n",
               str(clr_specificity)]))

Training set specificity for logisitic regression model on Temperature variable:
0.9232927970065482


## Logistic regression classifier on all but 'date' independent variables - training data

In [12]:
# Second model: same classifier type, using every feature column except
# 'Date'
X_train_all = df.loc[:, ['Temperature', 'Humidity', 'Light',
                         'CO2', 'HumidityRatio']]
clr_all = LogisticRegression()

In [13]:
# Train on all selected features, then predict the same training rows
y_train_pred_all = clr_all.fit(X_train_all, y_train).predict(X_train_all)

In [14]:
# Training accuracy of the model fitted on all features except 'Date'
clr_all_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred_all)
print("Training set accuracy for logisitic regression model "
      "on all but date variable: ", clr_all_accuracy, sep="")

Training set accuracy for logisitic regression model on all but date variable: 0.9860002456097261


In [15]:
# source: https://www.theanalysisfactor.com/sensitivity-and-specificity/
# Confusion matrix of the all-features model on the training set, flattened
# row by row into (TN, FP, FN, TP).
conf_matrix = confusion_matrix(y_true=y_train, y_pred=y_train_pred_all)
true_n, false_p, false_n, true_p = conf_matrix.flatten()

In [16]:
# Training-set sensitivity (TP / (FN + TP)) of the model fitted on all
# variables except 'Date'. The printed label names that model; it used to
# say "on Temperature variable", which described a different classifier.
clr_all_sensitivity = true_p/(false_n + true_p)
print("Training set sensitivity for logisitic regression model "
      "on all but date variable:\n" + str(clr_all_sensitivity))

Training set sensitivity for logisitic regression model on Temperature variable:
0.9866975130133024


In [17]:
# Training-set specificity (TN / (TN + FP)) of the model fitted on all
# variables except 'Date'. The printed label names that model; it used to
# say "on Temperature variable", which described a different classifier.
clr_all_specificity = true_n/(true_n+false_p)
print("Training set specificity for logisitic regression model "
      "on all but date variable:\n" + str(clr_all_specificity))

Training set specificity for logisitic regression model on Temperature variable:
0.9858122856251949


## Logistic regression classifier on all but 'date' independent variables - TEST data

In [18]:
# Test features: the file's columns are named 'Date' plus five measurements,
# and only the measurement columns are read.
df_column_names_all = ['Date', 'Temperature', 'Humidity',
                       'Light', 'CO2', 'HumidityRatio']
X_column_names_all = ['Temperature', 'Humidity', 'Light',
                      'CO2', 'HumidityRatio']

X_test_all = pd.read_csv(
    test_data_file,
    names=df_column_names_all,
    usecols=X_column_names_all,
    sep='\t',
)

# Expected test labels: a single column 'y', stored as a categorical
df_results_all = pd.read_csv(
    results_file,
    names=['y'],
    sep='\t',
).astype({'y': 'category'})

In [19]:
# Expected labels for the test rows, as a Series
y_true_all = df_results_all.y

In [20]:
# Score the all-features model on the test set with sklearn's accuracy_score
y_test_pred_all = clr_all.predict(X_test_all)
clr_test_accuracy_all = accuracy_score(y_true=y_true_all,
                                       y_pred=y_test_pred_all)
print('Accuracy on test dataset (full model): ', clr_test_accuracy_all,
      sep='')

Accuracy on test dataset (full model): 0.976360225140713


In [21]:
# source: https://www.theanalysisfactor.com/sensitivity-and-specificity/
# Confusion matrix of the all-features model on the test set, read row by
# row as (TN, FP, FN, TP).
conf_matrix = confusion_matrix(y_true=y_true_all, y_pred=y_test_pred_all)
true_n, false_p, false_n, true_p = conf_matrix.reshape(-1)

In [22]:
# Test-set sensitivity of the all-features model: TP / (TP + FN)
clr_test_sensitivity_all = true_p / (true_p + false_n)
print("Sensitivity on test dataset (full model):",
      clr_test_sensitivity_all, sep="")

Sensitivity on test dataset (full model):0.9907407407407407


In [23]:
# Test-set specificity of the all-features model: TN / (FP + TN)
clr_test_specificity_all = true_n / (false_p + true_n)
print("Specificity on test dataset (full model):",
      clr_test_specificity_all, sep="")

Specificity on test dataset (full model):0.9681039574719433


## Logistic regression classifier on one independent variable - TEST data

In [24]:
# Test features for the single-variable model: the file's columns are named
# as below, and only 'Temperature' is read.
df_column_names = ['Date', 'Temperature', 'Humidity',
                   'Light', 'CO2', 'HumidityRatio']
X_column_names = ['Temperature']

X_test = pd.read_csv(
    test_data_file,
    names=df_column_names,
    usecols=X_column_names,
    sep='\t',
)

# Expected test labels: a single column 'y', stored as a categorical
df_results = pd.read_csv(
    results_file,
    names=['y'],
    sep='\t',
).astype({'y': 'category'})

In [25]:
# Expected labels for the test rows, as a Series
y_true = df_results.y

In [26]:
# Score the Temperature-only model on the test set with accuracy_score
y_test_pred = clr.predict(X_test)
clr_test_accuracy = accuracy_score(y_true=y_true, y_pred=y_test_pred)
print('Accuracy on test dataset (only Temperature): ', clr_test_accuracy,
      sep='')

Accuracy on test dataset (only Temperature): 0.8532833020637899


In [27]:
# source: https://www.theanalysisfactor.com/sensitivity-and-specificity/
# Confusion matrix of the Temperature-only model on the test set.
# Row 0 holds (TN, FP) and row 1 holds (FN, TP).
conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_test_pred)
(true_n, false_p), (false_n, true_p) = conf_matrix

In [28]:
# Test-set sensitivity of the Temperature-only model: TP / (TP + FN)
clr_test_sensitivity = true_p / (true_p + false_n)
print("Sensitivity on test dataset (only Temperature):",
      clr_test_sensitivity, sep="")

Sensitivity on test dataset (only Temperature):0.727366255144033


In [29]:
# Test-set specificity of the Temperature-only model: TN / (FP + TN)
clr_test_specificity = true_n / (false_p + true_n)
print("Specificity on test dataset (only Temperature):",
      clr_test_specificity, sep="")

Specificity on test dataset (only Temperature):0.9255759007678677


In [30]:
# Save the Temperature-only model's test predictions (y_test_pred), one
# value per row with no index and no header.
# A dedicated name is used so the training frame `df` is not replaced by the
# predictions; rebinding `df` here made earlier cells fail when re-run.
# NOTE(review): the all-features predictions (y_test_pred_all) are not
# written anywhere; confirm that exporting the single-variable model is
# intended.
df_predictions = pd.DataFrame(y_test_pred)
df_predictions.to_csv(output_file, index=False, header=False)

# Task 2

In [31]:
# source:
# https://machinelearningmastery.com/fbeta-measure-for-machine-learning/
# Additional metrics used in Task 2 (precision, recall, F1 and F-beta).
# The source URL sits on its own line to stay within the 79-character limit
# that flake8 reported for this cell (E501).
from sklearn.metrics import (f1_score, fbeta_score,
                             precision_score, recall_score)

1:80: E501 line too long (80 > 79 characters)


In [32]:
# Paths of the Task 2 input files.
# NOTE(review): despite the df_ prefix these three names hold path strings,
# not DataFrames, and `df_results` replaces the DataFrame of the same name
# created in Task 1.
df_train, df_test, df_results = (
    os.path.join('data', file_name)
    for file_name in ('train.tsv', 'test.tsv', 'results.tsv')
)

# Column names assigned to the training file
df_column_names = ['Occupancy', 'Date', 'Temperature', 'Humidity',
                   'Light', 'CO2', 'HumidityRatio']

# Read the training data, then drop rows with missing values
df = pd.read_csv(df_train, sep='\t', names=df_column_names)
df = df.dropna()

## Variable: Temperature

In [33]:
# Temperature-only features and the Occupancy target for Task 2
y_train_temp = df['Occupancy']
X_train_temp = df.loc[:, ['Temperature']]
clr = LogisticRegression()

In [34]:
# Fit on Temperature and predict the training rows (fit() returns the model)
y_train_pred_temp = clr.fit(X_train_temp, y_train_temp).predict(X_train_temp)

# Confusion-matrix counts: row 0 is (TN, FP), row 1 is (FN, TP)
conf_matrix_temp = confusion_matrix(y_true=y_train_temp,
                                    y_pred=y_train_pred_temp)
(tn_temp, fp_temp), (fn_temp, tp_temp) = conf_matrix_temp

### Precision
is the proportion of positive predictions that are correct, i.e. true_positive/(true_positive + false_positive).

In [35]:
# Precision of the Temperature-only model on the training set
precision_temp = precision_score(y_true=y_train_temp,
                                 y_pred=y_train_pred_temp)
print('Precision (Temperature):' + ' ' + str(precision_temp))

Precision (Temperature): 0.615625


### Recall
is the proportion of actual positives that were correctly predicted as positive, i.e. true_positive/(true_positive + false_negative); it is the same quantity as the sensitivity defined in Task 1. Maximizing precision will minimize the false-positive errors, whereas maximizing recall will minimize the false-negative errors.

In [36]:
# Recall of the Temperature-only model on the training set
recall_temp = recall_score(y_true=y_train_temp, y_pred=y_train_pred_temp)
print('\t'.join(['Recall (Temperature):', str(recall_temp)]))

Recall (Temperature):	0.4557547715442452


### F-measure 
is calculated as the harmonic mean of precision and recall, giving each the same weighting. It allows a model to be evaluated taking both the precision and recall into account using a single score, which is helpful when describing the performance of the model and in comparing models.

In [37]:
# F1 score of the Temperature-only model on the training set
f1_temp = f1_score(y_true=y_train_temp, y_pred=y_train_pred_temp)
print('F1 (Temperature):' + '\t' + str(f1_temp))

F1 (Temperature):	0.523762047191758


### Fbeta-measure 
is a generalization of the F-measure that adds a configuration parameter called beta. A default beta value is 1.0, which is the same as the F-measure. A smaller beta value, such as 0.5, gives more weight to precision and less to recall, whereas a larger beta value, such as 2.0, gives less weight to precision and more weight to recall in the calculation of the score.

In [38]:
# F-beta score of the Temperature-only model on the training set.
# beta > 1 gives recall more weight than precision.
beta_temp = 2.0  # recall is more important than precision in this case

f_beta_temp = fbeta_score(y_train_temp, y_train_pred_temp, beta=beta_temp)
print('F beta (Temperature):', f_beta_temp, sep='\t')

F beta (Temperature):	0.4807223035627135


1:16: E261 at least two spaces before inline comment


## Variable: CO2

In [39]:
# CO2-only features and the Occupancy target, with a fresh classifier
y_train_CO2 = df['Occupancy']
X_train_CO2 = df.loc[:, ['CO2']]
clr = LogisticRegression()

In [40]:
# Fit on CO2 and predict the training rows (fit() returns the model)
y_train_pred_CO2 = clr.fit(X_train_CO2, y_train_CO2).predict(X_train_CO2)

# Confusion-matrix counts: row 0 is (TN, FP), row 1 is (FN, TP)
conf_matrix_CO2 = confusion_matrix(y_true=y_train_CO2,
                                   y_pred=y_train_pred_CO2)
(tn_CO2, fp_CO2), (fn_CO2, tp_CO2) = conf_matrix_CO2

In [41]:
# Precision of the CO2-only model on the training set
precision_CO2 = precision_score(y_true=y_train_CO2, y_pred=y_train_pred_CO2)
print('Precision (CO2):' + ' ' + str(precision_CO2))

Precision (CO2): 0.8224687933425797


In [42]:
# Recall of the CO2-only model on the training set
recall_CO2 = recall_score(y_true=y_train_CO2, y_pred=y_train_pred_CO2)
print('\t'.join(['Recall (CO2):', str(recall_CO2)]))

Recall (CO2):	0.6859456333140543


In [43]:
# F1 score of the CO2-only model on the training set
f1_CO2 = f1_score(y_true=y_train_CO2, y_pred=y_train_pred_CO2)
print('F1 (CO2):' + '\t' + str(f1_CO2))

F1 (CO2):	0.748029012929675


In [44]:
# F-beta score of the CO2-only model on the training set.
# beta < 1 gives precision more weight than recall.
beta_CO2 = 0.5  # precision is more important than recall in this case

f_beta_CO2 = fbeta_score(y_train_CO2, y_train_pred_CO2, beta=beta_CO2)
print('F beta (CO2):', f_beta_CO2, sep='\t')

F beta (CO2):	0.7909830598906229


1:15: E261 at least two spaces before inline comment


## Variable: all independent but no 'date'

In [45]:
# Every feature column except 'Date', plus the Occupancy target, with a
# fresh classifier
y_train_all = df['Occupancy']
X_train_all = df.loc[:, ['Temperature', 'Humidity', 'Light',
                         'CO2', 'HumidityRatio']]
clr = LogisticRegression()

In [46]:
# Fit and evaluate with y_train_all, the target taken from the frame loaded
# for Task 2, instead of y_train left over from Task 1. The cell no longer
# depends on state from an earlier section, and it uses the same labels as
# the metric cells that follow.
clr.fit(X_train_all, y_train_all)
y_train_pred_all = clr.predict(X_train_all)

# Confusion-matrix counts, flattened row by row into (TN, FP, FN, TP)
conf_matrix_all = confusion_matrix(y_train_all, y_train_pred_all)
tn_all, fp_all, fn_all, tp_all = conf_matrix_all.ravel()

In [47]:
# Precision of the all-features model on the training set
precision_all = precision_score(y_true=y_train_all, y_pred=y_train_pred_all)
print('Precision (All but date):' + ' ' + str(precision_all))

Precision (All but date): 0.9493600445186422


In [48]:
# Recall of the all-features model on the training set
recall_all = recall_score(y_true=y_train_all, y_pred=y_train_pred_all)
print('\t'.join(['Recall (All but date):', str(recall_all)]))

Recall (All but date):	0.9866975130133024


In [49]:
# F1 score of the all-features model on the training set
f1_all = f1_score(y_true=y_train_all, y_pred=y_train_pred_all)
print('F1 (All but date):' + '\t' + str(f1_all))

F1 (All but date):	0.9676687464549064


In [50]:
# F-beta score of the all-features model on the training set.
# beta > 1 gives recall more weight than precision.
beta_all = 2.0  # recall is more important than precision in this case

f_beta_all = fbeta_score(y_train_all, y_train_pred_all, beta=beta_all)
print('F beta (All but date):', f_beta_all, sep='\t')

F beta (All but date):	0.9789969011821417


1:15: E261 at least two spaces before inline comment


## Summary

In [51]:
# One record of metrics per model; the keys match the summary table columns
result_temp = dict(zip(
    ('Precision', 'Recall', 'F', 'Fbeta'),
    (precision_temp, recall_temp, f1_temp, f_beta_temp),
))
result_CO2 = dict(zip(
    ('Precision', 'Recall', 'F', 'Fbeta'),
    (precision_CO2, recall_CO2, f1_CO2, f_beta_CO2),
))
result_all = dict(zip(
    ('Precision', 'Recall', 'F', 'Fbeta'),
    (precision_all, recall_all, f1_all, f_beta_all),
))

In [52]:
# Comparison table: one row per model, one column per metric, rounded to
# four decimals. The bare name on the last line displays the table.
df_summary = pd.DataFrame(
    [result_temp, result_CO2, result_all],
    index=['Temperature', 'CO2', 'All'],
    columns=['Precision', 'Recall', 'F', 'Fbeta'],
)
df_summary = df_summary.round(4)
df_summary

Unnamed: 0,Precision,Recall,F,Fbeta
Temperature,0.6156,0.4558,0.5238,0.4807
CO2,0.8225,0.6859,0.748,0.791
All,0.9494,0.9867,0.9677,0.979
