## Notebook Setup

In [4]:
#Common imports
import sys
import os
import numpy as np

#Python Data Analysis Library
import pandas as pd

#Data visualization
#%matplotlib inline #sets the backend of matplotlib to the 'inline' backend
%matplotlib notebook
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

# Figure output directory: <project_root_dir>/images/<project_name>
project_root_dir, project_name = ".", "occupancy_detection"
images_path = os.path.join(project_root_dir, "images", project_name)
# exist_ok=True keeps this cell re-runnable once the directory already exists
os.makedirs(images_path, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into the ``images_path`` directory.

    Parameters
    ----------
    fig_id : str
        Base file name of the figure, without extension.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str, default "png"
        File extension; also forwarded to ``plt.savefig`` as ``format``.
    resolution : int, default 300
        Forwarded to ``plt.savefig`` as ``dpi``.
    """
    # Use the module-level ``images_path``; the former reference to
    # ``IMAGES_PATH`` named a variable that is never defined (NameError).
    path = os.path.join(images_path, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

## Data Import

In [16]:
# Read the three dataset files; each one stays available as df_1 / df_2 / df_3.
df_1, df_2, df_3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/occupancy_detection/datatraining.txt",
        "datasets/occupancy_detection/datatest.txt",
        "datasets/occupancy_detection/datatest2.txt",
    )
)
# Stack the rows (axis=0 is pd.concat's default). Each file's own row labels
# are kept, so index labels repeat across the combined frame.
df = pd.concat([df_1, df_2, df_3])

In [17]:
# Summarise the combined frame: index range, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20560 entries, 1 to 9752
Data columns (total 7 columns):
date             20560 non-null object
Temperature      20560 non-null float64
Humidity         20560 non-null float64
Light            20560 non-null float64
CO2              20560 non-null float64
HumidityRatio    20560 non-null float64
Occupancy        20560 non-null int64
dtypes: float64(5), int64(1), object(1)
memory usage: 1.3+ MB


In [19]:
# Preview the first five rows of the combined frame
df.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
1,2015-02-04 17:51:00,23.18,27.272,426.0,721.25,0.004793,1
2,2015-02-04 17:51:59,23.15,27.2675,429.5,714.0,0.004783,1
3,2015-02-04 17:53:00,23.15,27.245,426.0,713.5,0.004779,1
4,2015-02-04 17:54:00,23.15,27.2,426.0,708.25,0.004772,1
5,2015-02-04 17:55:00,23.1,27.2,426.0,704.5,0.004757,1


## Data Engineering and Visualization

Occupancy is the label for this classification project. It was imported with an `int64` dtype, so we convert it to a boolean dtype before exploring the data.

In [20]:
# Cast the Occupancy label column to a boolean dtype
df["Occupancy"] = df["Occupancy"].astype(bool)

In [26]:
# Class balance of the label: number of rows per Occupancy value
df["Occupancy"].value_counts()

False    15810
True      4750
Name: Occupancy, dtype: int64