In [26]:
# Mount Google Drive at /content/drive (Colab only) so the data files
# referenced by the path settings in this notebook are readable.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
# Root folder on the mounted Drive; it ends with '/' so names can be appended directly.
base_path = '/content/drive/MyDrive/capstone-data/'

# Text file with the UCI household power-consumption readings.
uci_path = f"{base_path}individual+household+electric+power+consumption/household_power_consumption.txt"
# Folder that holds the Paris weather CSV files.
weather_folder_path = f"{base_path}Data-weather-paris/"


In [28]:
import pandas as pd
import numpy as np
import glob

In [29]:
# --- UCI household power: load, clean, restrict to 2007, aggregate hourly ---

# Read the semicolon-separated file, treating '?' entries as NaN.
uci = pd.read_csv(uci_path, sep=';', low_memory=False, na_values='?')

# Combine the Date and Time text columns into one day-first timestamp and
# make it the index; values that cannot be parsed become NaT.
uci = uci.assign(
    datetime=pd.to_datetime(
        uci['Date'] + ' ' + uci['Time'], dayfirst=True, errors='coerce'
    )
).set_index('datetime')

# Force the measurement columns to numeric; anything non-numeric becomes NaN.
cols = ['Global_active_power', 'Global_reactive_power', 'Voltage',
        'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']
uci = uci.assign(
    **{name: pd.to_numeric(uci[name], errors='coerce') for name in cols}
)

# Discard every row that has a missing value in any column.
uci = uci.dropna()

# Keep calendar year 2007 only (label slice on the DatetimeIndex, inclusive).
uci_2007 = uci.loc['2007-01-01':'2007-12-31']

# IQR rule on Global_active_power: only the upper fence is applied.
Q1, Q3 = uci_2007['Global_active_power'].quantile([0.25, 0.75])
IQR = Q3 - Q1
upper_limit = Q3 + 1.5 * IQR
uci_2007 = uci_2007.loc[uci_2007['Global_active_power'] < upper_limit]

# Drop the non-numeric columns so the hourly mean is well defined.
uci_2007_numeric = uci_2007.select_dtypes(include='number')

# Average the minute-level readings into hourly bins.
uci_hourly = uci_2007_numeric.resample('h').mean()



In [30]:
# --- Paris weather: load every CSV, clean, and restrict to 2007 ---

# glob returns paths in arbitrary order; sort them so the load is deterministic.
csv_files = sorted(glob.glob(weather_folder_path + '*.csv'))
print(f"Found {len(csv_files)} weather CSV files")
if not csv_files:
    # Fail here with a clear message instead of inside pd.concat
    # ("No objects to concatenate").
    raise FileNotFoundError(f"No weather CSV files found in {weather_folder_path}")

# Load and concatenate all files. The per-file row numbers are discarded
# because the index is replaced by the timestamp below.
weather_dfs = [pd.read_csv(file) for file in csv_files]
weather = pd.concat(weather_dfs, ignore_index=True)

# Parse timestamps; rows whose datetime cannot be parsed are dropped.
# NOTE(review): format='mixed' infers the format per element — confirm that
# all files share one unambiguous date format.
weather['datetime'] = pd.to_datetime(weather['datetime'], format='mixed', errors='coerce')
weather = weather.dropna(subset=['datetime'])

# Index by timestamp and keep only the relevant columns.
weather = weather.set_index('datetime')
weather = weather[['temp', 'humidity', 'windspeed', 'cloudcover']]

# Sort BEFORE interpolating: method='time' weights by distance along the
# index, which is only meaningful when the index is in chronological order.
weather = weather.sort_index()
weather = weather.interpolate(method='time')

# Filter to calendar year 2007 (inclusive label slice on the sorted index).
weather_2007 = weather.loc['2007-01-01':'2007-12-31']
weather_2007.head()


Found 12 weather CSV files


Unnamed: 0_level_0,temp,humidity,windspeed,cloudcover
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-01-01 00:00:00,13.9,86.47,30.9,95.5
2007-01-01 01:00:00,14.0,85.1,25.5,96.9
2007-01-01 02:00:00,14.0,84.29,27.7,92.7
2007-01-01 03:00:00,13.8,80.82,21.7,67.7
2007-01-01 04:00:00,12.9,59.76,26.8,23.7


In [31]:
# Align the hourly power data with the 2007 weather data on their indexes;
# the inner join keeps only timestamps present in both frames.
# NOTE(review): the combined.info() output recorded in this notebook shows
# 7277 non-null power values across 7357 rows, so some joined hours still
# carry NaN power readings — confirm whether they should be dropped or imputed.
combined = uci_hourly.join(weather_2007, how='inner')
combined.head()


Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,temp,humidity,windspeed,cloudcover
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2007-01-01 00:00:00,2.550633,0.1128,241.361333,10.53,0.0,0.583333,0.0,13.9,86.47,30.9,95.5
2007-01-01 01:00:00,2.5234,0.071633,241.0965,10.443333,0.0,0.0,0.0,14.0,85.1,25.5,96.9
2007-01-01 02:00:00,2.582333,0.106667,243.200167,10.54,0.0,0.333333,0.0,14.0,84.29,27.7,92.7
2007-01-01 03:00:00,2.541667,0.0901,243.265667,10.4,0.0,0.266667,0.0,13.8,80.82,21.7,67.7
2007-01-01 04:00:00,2.475733,0.088167,242.456167,10.11,0.0,0.0,0.0,12.9,59.76,26.8,23.7


In [32]:
# Summarise dtypes and non-null counts to check the joined frame for gaps.
combined.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7357 entries, 2007-01-01 00:00:00 to 2007-12-31 23:00:00
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Global_active_power    7277 non-null   float64
 1   Global_reactive_power  7277 non-null   float64
 2   Voltage                7277 non-null   float64
 3   Global_intensity       7277 non-null   float64
 4   Sub_metering_1         7277 non-null   float64
 5   Sub_metering_2         7277 non-null   float64
 6   Sub_metering_3         7277 non-null   float64
 7   temp                   7357 non-null   float64
 8   humidity               7357 non-null   float64
 9   windspeed              7357 non-null   float64
 10  cloudcover             7357 non-null   float64
dtypes: float64(11)
memory usage: 689.7 KB


In [33]:
# Save the combined hourly dataset to Drive.
# The path is built from base_path so it cannot drift from the input data root.
output_path = base_path + 'cleaned_combined_data_2007.csv'
combined.to_csv(output_path)

# Optional (Colab only): also download a local copy of the saved file.
# from google.colab import files
# files.download(output_path)
