<a href="https://www.kaggle.com/code/kouroshsajjadi/smart-buildings-assignment-1?scriptVersionId=143760883" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import glob
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns

# Read Building Energy Dataset

In [None]:
path = '../input/building-energy-dataset'

# Collect every CSV in the dataset directory. glob returns paths in
# arbitrary order, so sort them to make the load order reproducible.
all_files = sorted(glob.glob(path + "/*.csv"))

# pd.concat fails with an opaque "No objects to concatenate" on an empty
# list; fail early with a message that names the directory instead.
if not all_files:
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# Read each file with its "Time" column as the index; parse_dates=True asks
# pandas to parse that index as datetimes, header=0 takes column names from
# the first row.
# NOTE(review): a later cell re-parses the index with '%d/%m/%Y %H:%M'. If the
# timestamps really are day-first, the default parsing here may misread or
# skip them -- confirm against the raw files.
li = [
    pd.read_csv(filename, index_col="Time", parse_dates=True, header=0)
    for filename in all_files
]

# Stack the per-file frames row-wise, keeping each frame's "Time" index.
building = pd.concat(li, axis=0, ignore_index=False)

# Order the rows by index so the result does not depend on file order.
building.sort_index(inplace=True)

# Visualize Data

In [None]:
# Summarise the concatenated DataFrame: building.info() prints details such as
# the column data types and the memory usage.
building.info()

# Preview the first rows of the frame (shown as the cell's output).
building.head()

In [None]:
# Missingno nullity matrix on the raw data: shows, row by row, which cells of
# each column are present and which are missing.
msno.matrix(building)

In [None]:
# Overall missing-data summary for the raw frame.

# Missing cells per column, then their grand total.
missing_data = building.isna().sum()
total_missing = missing_data.sum()

# Share of all cells (rows x columns) that are missing, as a percentage.
percentage_missing = (total_missing / building.size) * 100

print("Total Missing Values:", total_missing)
print("Percentage of Missing Values:", percentage_missing, "%")

In [None]:
# Report on rows having maximum and minimum missing values.

# Count the missing cells in each row once and reuse the result.
missing_per_row = building.isna().sum(axis=1)

# Locate the extreme rows by position rather than by label. The frame is a
# concatenation of several files that keeps their original indices, so an
# index label is not guaranteed to be unique; a label lookup with .loc could
# then return several rows instead of the single row being reported.
max_missing_pos = missing_per_row.argmax()
min_missing_pos = missing_per_row.argmin()

# Labels of those rows (first occurrence on ties, same as idxmax/idxmin).
max_missing_index = building.index[max_missing_pos]
min_missing_index = building.index[min_missing_pos]

print("Index with Maximum Missing Values:", max_missing_index)
print("Index with Minimum Missing Values:", min_missing_index)

max_missing_row = building.iloc[max_missing_pos]
min_missing_row = building.iloc[min_missing_pos]

print("\nRow with maximum missing values.")
print(max_missing_row)

print("\nRow with minimum missing values.")
print(min_missing_row)

In [None]:
# Missingno bar chart on the raw data: one bar per column showing how complete
# (non-missing) that column is.
msno.bar(building)

In [None]:
# Show the missing data on each column.

# Per-column missing counts, their grand total, and the percentage of all
# cells in the frame that are missing.
missing_data = building.isna().sum()
total_missing = missing_data.sum()
percentage_missing = (total_missing / building.size) * 100

# One bar per column, with the column names rotated so they stay readable.
plt.figure(figsize=(10, 6))
bars = plt.bar(missing_data.index, missing_data)
plt.title('Number of Missing Values by Column')
plt.xlabel('Columns')
plt.ylabel('Number of Missing Values')
plt.xticks(rotation=90)

# Annotate each bar with its count and its share of all missing values,
# centred horizontally just above the bar.
for rect in bars:
    height = rect.get_height()
    x_center = rect.get_x() + rect.get_width()/2
    label = f'{height}\n{height/total_missing*100:.2f}%'
    plt.text(x_center, height, label, ha='center', va='bottom', color='black', fontsize=8)

# The legend is used as a text box carrying the overall totals.
legend_text = f'Total Missing: {total_missing}\nPercentage: {percentage_missing:.3f}%'
plt.legend([legend_text])

plt.show()



In [None]:
# Report on columns having maximum and minimum missing values.

# Count the missing cells in each column once and reuse the result.
missing_per_column = building.isna().sum(axis=0)

# The counts are indexed by column name, so idxmax/idxmin return column
# labels. The variable names are kept as they were for any later use.
max_missing_index = missing_per_column.idxmax()
min_missing_index = missing_per_column.idxmin()

# These are column names, so label them as columns rather than as an index.
print("Column with Maximum Missing Values:", max_missing_index)
print("Column with Minimum Missing Values:", min_missing_index)

In [None]:
# Missingno nullity-correlation heatmap: how strongly the presence or absence
# of values in one column goes together with that of another column.
msno.heatmap(building)

In [None]:
# Missingno dendrogram: hierarchically clusters the columns by how similar
# their missing-value patterns are.
msno.dendrogram(building)

In [None]:
# Resampling is not a solution here.
# building = building.resample('H').sum()
# msno.matrix(building)

In [None]:
# Fill missing values by linear interpolation; the raw frame is left untouched.
building_linearly_interpolated = building.interpolate(method='linear')

# Nullity matrix of the interpolated frame, to see what is still missing.
msno.matrix(building_linearly_interpolated)

# Missing cells that remain after interpolation: per column, then in total.
missing_data = building_linearly_interpolated.isna().sum()
total_missing = missing_data.sum()

# Remaining missing cells as a percentage of all cells in the frame.
percentage_missing = (total_missing / building_linearly_interpolated.size) * 100

print("Total Missing Values:", total_missing)
print("Percentage of Missing Values:", percentage_missing, "%")

In [None]:
# NOTE(review): this cell repeats the computation of the preceding
# interpolation cell; one of the two can probably be removed.

# Linearly interpolate the missing values into a new frame.
building_linearly_interpolated = building.interpolate(method='linear')

# Visual check of the gaps that interpolation left behind.
msno.matrix(building_linearly_interpolated)

# Count what is still missing, per column and overall.
missing_data = building_linearly_interpolated.isna().sum()
total_missing = missing_data.sum()

# Express the remainder as a percentage of every cell in the frame.
percentage_missing = (total_missing / building_linearly_interpolated.size) * 100

print("Total Missing Values:", total_missing)
print("Percentage of Missing Values:", percentage_missing, "%")

In [None]:
# Convert the index to datetimes using an explicit day/month/year
# hour:minute format, then preview the frame.
# NOTE(review): the index was already requested as datetimes via
# parse_dates=True when the files were loaded; if that parsing succeeded this
# conversion may be a no-op -- confirm which of the two is actually needed.
# The commented-out lines below are a disabled experiment with a separate
# 'buildingtest' file.
# building_test = pd.read_csv('../input/buildingtest/buildingtest.csv')
# building['Time'] = pd.to_datetime(building['Time'])  # Convert to datetime if not already
# building_test.set_index('ts', inplace=True)
building.index = pd.to_datetime(building.index, format='%d/%m/%Y %H:%M')
building.head()
# building_test = building_test.resample('H').sum()

# building_test.head()

In [None]:
# plt.plot(building.index, building)
# plt.show()


In [None]:
# plt.plot(missing_values.index, missing_values)
# plt.show()

In [None]:
# Bar plot of missing values.

# Percentage of missing cells in each column. It is computed here because no
# other visible cell defines `missing_percentage`, so the plot would otherwise
# fail with a NameError. The mean of the boolean isna() mask is the missing
# fraction per column.
missing_percentage = building.isna().mean() * 100

# One bar per column, with the column names tilted for readability.
plt.figure(figsize=(10, 6))
plt.bar(missing_percentage.index, missing_percentage)
plt.xlabel('Columns')
plt.ylabel('Percentage of Missing Values')
plt.title('Percentage of Missing Values in Columns')
plt.xticks(rotation=45)
plt.show()