In [None]:
# Лабораторна робота №4
# ФБ-25 Голубєва Ірина

In [None]:
import pandas as pd
import numpy as np
import os
import urllib.request
import zipfile
import timeit

In [None]:
def setup_folders(folder='lab4_data'):
    """Create the data directory (including missing parents) if needed.

    Args:
        folder: Path of the directory to create. Defaults to ``'lab4_data'``.

    Returns:
        None.
    """
    # exist_ok=True makes the call idempotent and removes the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(folder, exist_ok=True)

def download_and_unzip_data(url, folder, file_name):
    """Download a zip archive into ``folder`` and extract it there.

    The archive is downloaded only when ``folder/file_name`` does not exist
    yet. Extraction happens after a fresh download, and also whenever the
    archive is already present but at least one of its members is missing
    from ``folder`` (for example after the extracted file was deleted).

    Args:
        url: Location of the zip archive (anything ``urllib.request.urlopen``
            accepts).
        folder: Destination directory for the archive and its contents.
        file_name: Name under which the archive is stored inside ``folder``.

    Returns:
        None.

    Raises:
        urllib.error.URLError: If the download fails.
        zipfile.BadZipFile: If the stored file is not a valid zip archive.
    """
    if folder:
        os.makedirs(folder, exist_ok=True)
    file_path = os.path.join(folder, file_name)
    freshly_downloaded = False

    if not os.path.exists(file_path):
        print(f"Downloading {file_name}...")
        # Download to a temporary name first so that an interrupted transfer
        # never leaves a truncated archive that later runs would trust.
        partial_path = file_path + '.part'
        try:
            with urllib.request.urlopen(url) as response, open(partial_path, 'wb') as out_file:
                # Stream in 1 MiB chunks instead of holding the whole archive in memory.
                for chunk in iter(lambda: response.read(1 << 20), b''):
                    out_file.write(chunk)
            os.replace(partial_path, file_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        freshly_downloaded = True
        print(f"{file_name} downloaded!")

    # Unzip the file (always after a download; otherwise only if something is missing)
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        needs_extraction = freshly_downloaded or any(
            not os.path.exists(os.path.join(folder, member))
            for member in zip_ref.namelist()
        )
        if needs_extraction:
            zip_ref.extractall(folder)
            print(f"Extracted {file_name} to {folder}")

# Directory (relative to the current working directory) that holds the dataset files.
folder = 'lab4_data'
setup_folders(folder)

# Source URL of the archive and the name it is stored under inside `folder`.
download_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip'
file_name = 'household_power_consumption.zip'
download_and_unzip_data(download_url, folder, file_name)

Downloading household_power_consumption.zip...
household_power_consumption.zip downloaded!
Extracted household_power_consumption.zip to lab4_data


In [None]:
def process_pandas_data(file_path):
    """Read the semicolon-separated readings file into a cleaned DataFrame.

    ``'?'`` entries are parsed as missing values and every row containing a
    missing value is dropped. The ``Date`` and ``Time`` text columns are
    merged into one ``DateTime`` column (parsed as ``%d/%m/%Y %H:%M:%S``) and
    then removed; the seven measurement columns are cast to ``float``.

    Args:
        file_path: Path of the ``;``-delimited text file.

    Returns:
        The cleaned DataFrame; the original row labels of the kept rows are
        preserved.
    """
    measurement_columns = [
        'Global_active_power',
        'Global_reactive_power',
        'Voltage',
        'Global_intensity',
        'Sub_metering_1',
        'Sub_metering_2',
        'Sub_metering_3',
    ]

    readings = pd.read_csv(file_path, sep=";", na_values=['?']).dropna()
    timestamps = pd.to_datetime(
        readings['Date'] + ' ' + readings['Time'],
        format='%d/%m/%Y %H:%M:%S',
    )
    readings = readings.assign(DateTime=timestamps).drop(columns=['Date', 'Time'])
    return readings.astype({column: float for column in measurement_columns})

# Text file read by the pandas loader; it is looked up inside `folder`.
file_path = os.path.join(folder, 'household_power_consumption.txt')
df = process_pandas_data(file_path)
print("Pandas dataframe processed successfully!")
# Last expression of the cell: display the first rows of the cleaned frame.
df.head()

Pandas dataframe processed successfully!


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,DateTime
0,4.216,0.418,234.84,18.4,0.0,1.0,17.0,2006-12-16 17:24:00
1,5.36,0.436,233.63,23.0,0.0,1.0,16.0,2006-12-16 17:25:00
2,5.374,0.498,233.29,23.0,0.0,2.0,17.0,2006-12-16 17:26:00
3,5.388,0.502,233.74,23.0,0.0,1.0,17.0,2006-12-16 17:27:00
4,3.666,0.528,235.68,15.8,0.0,1.0,17.0,2006-12-16 17:28:00


In [None]:
# Task 1 (pandas): rows whose global active power exceeds 5 kW.
# Only the selection itself is timed; printing happens afterwards.
start_time = timeit.default_timer()
power_above_5 = df['Global_active_power'].gt(5)
high_power = df.loc[power_above_5]
end_time = timeit.default_timer()

total_time_pd = end_time - start_time

print(f"Час виконання для Pandas: {total_time_pd} секунд")
print("Домогосподарства з загальною активною потужністю більше 5 кВт:")
high_power.head()

Час виконання для Pandas: 0.024811275000004684 секунд
Домогосподарства з загальною активною потужністю більше 5 кВт:


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,DateTime
1,5.36,0.436,233.63,23.0,0.0,1.0,16.0,2006-12-16 17:25:00
2,5.374,0.498,233.29,23.0,0.0,2.0,17.0,2006-12-16 17:26:00
3,5.388,0.502,233.74,23.0,0.0,1.0,17.0,2006-12-16 17:27:00
11,5.412,0.47,232.78,23.2,0.0,1.0,17.0,2006-12-16 17:35:00
12,5.224,0.478,232.99,22.4,0.0,1.0,16.0,2006-12-16 17:36:00


In [None]:
# Task 2 (pandas): rows whose voltage exceeds 235 V.
# Only the selection itself is timed; printing happens afterwards.
start_time = timeit.default_timer()
voltage_above_235 = df['Voltage'].gt(235)
high_voltage = df.loc[voltage_above_235]
end_time = timeit.default_timer()

total_time_pd = end_time - start_time

print(f"Час виконання для Pandas: {total_time_pd} секунд")
print("Домогосподарства з напругою більше 235 В:")
high_voltage.head()

Час виконання для Pandas: 0.0912653359999922 секунд
Домогосподарства з напругою більше 235 В:


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,DateTime
4,3.666,0.528,235.68,15.8,0.0,1.0,17.0,2006-12-16 17:28:00
5,3.52,0.522,235.02,15.0,0.0,2.0,17.0,2006-12-16 17:29:00
6,3.702,0.52,235.09,15.8,0.0,1.0,17.0,2006-12-16 17:30:00
7,3.7,0.52,235.22,15.8,0.0,1.0,17.0,2006-12-16 17:31:00
14,4.054,0.422,235.24,17.6,0.0,1.0,17.0,2006-12-16 17:38:00


In [None]:
# Task 3 (pandas): rows with global intensity in [19, 20] (both ends included)
# where Sub_metering_2 is greater than Sub_metering_3.
start_time = timeit.default_timer()
intensity_in_range = df['Global_intensity'].between(19, 20)
current_range = df.loc[intensity_in_range]
group2_exceeds_group3 = current_range['Sub_metering_2'].gt(current_range['Sub_metering_3'])
condition_met = current_range.loc[group2_exceeds_group3]
end_time = timeit.default_timer()

total_time_pd = end_time - start_time

print(f"Час виконання для Pandas: {total_time_pd} секунд")
print("Домогосподарства з силою струму 19-20 А, де пральна машина та холодильник споживають більше:")
condition_met.head()

Час виконання для Pandas: 0.026487234999990505 секунд
Домогосподарства з силою струму 19-20 А, де пральна машина та холодильник споживають більше:


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,DateTime
45,4.464,0.136,234.66,19.0,0.0,37.0,16.0,2006-12-16 18:09:00
460,4.582,0.258,238.08,19.6,0.0,13.0,0.0,2006-12-17 01:04:00
464,4.618,0.104,239.61,19.6,0.0,27.0,0.0,2006-12-17 01:08:00
475,4.636,0.14,237.37,19.4,0.0,36.0,0.0,2006-12-17 01:19:00
476,4.634,0.152,237.17,19.4,0.0,35.0,0.0,2006-12-17 01:20:00


In [None]:
# Task 4 (pandas): mean of the three sub-metering groups over a random sample
# of rows drawn without replacement.
# A fixed random_state makes the sample (and therefore the printed means)
# reproducible across kernel restarts; the size is capped at the number of
# available rows so the cell does not raise on a smaller frame.
start_time = timeit.default_timer()
random_sample = df.sample(n=min(500000, len(df)), replace=False, random_state=42)
mean_consumption = random_sample[['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']].mean()
end_time = timeit.default_timer()

total_time_pd = end_time - start_time

print(f"Час виконання для Pandas: {total_time_pd} секунд")
print("Середні величини усіх 3-х груп споживання електричної енергії:")
mean_consumption

Час виконання для Pandas: 0.18597357799998804 секунд
Середні величини усіх 3-х груп споживання електричної енергії:


Sub_metering_1    1.107400
Sub_metering_2    1.311102
Sub_metering_3    6.459858
dtype: float64

In [None]:
# Task 5 (pandas): readings taken at hour 18 or later with active power above
# 6 kW, restricted to rows where Sub_metering_2 is strictly the largest group;
# then every 3rd row of the first half and every 4th row of the second half.
start_time = timeit.default_timer()

after_18 = df['DateTime'].dt.hour >= 18
above_6_kw = df['Global_active_power'] > 6
evening_high_usage = df[after_18 & above_6_kw]

# Row-wise maximum of the two other groups; group 2 must beat it.
other_groups_peak = evening_high_usage[['Sub_metering_1', 'Sub_metering_3']].max(axis=1)
group2_largest = evening_high_usage[evening_high_usage['Sub_metering_2'] > other_groups_peak]

midpoint = len(group2_largest) // 2
first_half_selection = group2_largest.iloc[:midpoint:3]
second_half_selection = group2_largest.iloc[midpoint::4]
end_time = timeit.default_timer()

total_time_pd = end_time - start_time

print(f"Час виконання для Pandas: {total_time_pd} секунд")
combined_df = pd.concat([first_half_selection, second_half_selection])
print("Домогосподарства, які після 18-00 споживають понад 6 кВт за хвилину в середньому:")
combined_df.head()

Час виконання для Pandas: 0.06843012400000248 секунд
Домогосподарства, які після 18-00 споживають понад 6 кВт за хвилину в середньому:


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,DateTime
41,6.052,0.192,232.93,26.2,0.0,37.0,17.0,2006-12-16 18:05:00
44,6.308,0.116,232.25,27.0,0.0,36.0,17.0,2006-12-16 18:08:00
17494,6.386,0.374,236.63,27.0,1.0,36.0,17.0,2006-12-28 20:58:00
17498,8.088,0.262,235.5,34.4,1.0,72.0,17.0,2006-12-28 21:02:00
17501,7.23,0.152,235.22,30.6,1.0,73.0,17.0,2006-12-28 21:05:00


In [None]:
def load_data(file_path):
    """Load the semicolon-separated readings file as a 2-D array of strings.

    The file is parsed with ``np.genfromtxt`` (header line skipped, column
    types inferred), every field is converted to text, and rows containing a
    ``'?'`` field are discarded. Columns 2-7 are round-tripped through
    ``float`` so their text is normalised (e.g. ``'234.840'`` -> ``'234.84'``).

    Args:
        file_path: Path of the ``;``-delimited text file.

    Returns:
        A 2-D NumPy array of strings, one row per complete reading. Callers
        must convert numeric columns with ``astype(float)`` before comparing.
    """
    raw_records = np.genfromtxt(
        file_path,
        delimiter=';',
        skip_header=1,
        dtype=None,
        encoding='utf-8',
    )
    # Structured records -> plain matrix of strings.
    as_text = np.array([tuple(map(str, record)) for record in raw_records])
    complete_rows = (as_text != '?').all(axis=1)
    kept = as_text[complete_rows]
    measurements = kept[:, 2:8].astype(float)
    # Stacking text and float blocks yields a string array again.
    return np.hstack((kept[:, :2], measurements, kept[:, 8:]))

# Same data directory and text file as used by the pandas loader.
folder = 'lab4_data'
file_path = os.path.join(folder, 'household_power_consumption.txt')
np_data = load_data(file_path)
print("Numpy array processed successfully!")
# Show only the first five rows of the string array.
print(np_data[:5])


Numpy array processed successfully!
[['16/12/2006' '17:24:00' '4.216' '0.418' '234.84' '18.4' '0.0' '1.0'
  '17.0']
 ['16/12/2006' '17:25:00' '5.36' '0.436' '233.63' '23.0' '0.0' '1.0'
  '16.0']
 ['16/12/2006' '17:26:00' '5.374' '0.498' '233.29' '23.0' '0.0' '2.0'
  '17.0']
 ['16/12/2006' '17:27:00' '5.388' '0.502' '233.74' '23.0' '0.0' '1.0'
  '17.0']
 ['16/12/2006' '17:28:00' '3.666' '0.528' '235.68' '15.8' '0.0' '1.0'
  '17.0']]


In [None]:
# Task 1 (NumPy): rows whose global active power (column 2) exceeds 5 kW.
# The filtering itself is timed. Timing `lambda: selected_rows` would only
# measure returning an already-computed array, which is not comparable with
# the pandas measurement.
start_time = timeit.default_timer()
mask = np_data[:, 2].astype(float) > 5
selected_rows = np_data[mask]
end_time = timeit.default_timer()

time_np = end_time - start_time

print(f"Час виконання для NumPy: {time_np} секунд")
print("Домогосподарства з загальною активною потужністю більше 5 кВт:")
print(selected_rows)

Час виконання для NumPy: 1.5100000041456951e-06 секунд
Домогосподарства з загальною активною потужністю більше 5 кВт:
[['16/12/2006' '17:25:00' '5.36' ... '0.0' '1.0' '16.0']
 ['16/12/2006' '17:26:00' '5.374' ... '0.0' '2.0' '17.0']
 ['16/12/2006' '17:27:00' '5.388' ... '0.0' '1.0' '17.0']
 ...
 ['24/11/2010' '07:50:00' '5.172' ... '0.0' '38.0' '17.0']
 ['24/11/2010' '07:51:00' '5.75' ... '0.0' '39.0' '17.0']
 ['25/11/2010' '07:21:00' '5.074' ... '1.0' '2.0' '18.0']]


In [None]:
# Task 2 (NumPy): rows whose voltage (column 4) exceeds 235 V.
# The filtering itself is timed. Timing `lambda: selected_rows_np` would only
# measure returning an already-computed array, which is not comparable with
# the pandas measurement.
start_time = timeit.default_timer()
mask = np_data[:, 4].astype(float) > 235
selected_rows_np = np_data[mask]
end_time = timeit.default_timer()

time_np = end_time - start_time

print(f"Час виконання для NumPy: {time_np} секунд")
print("Домогосподарства з напругою більше 235 В:")
print(selected_rows_np)

Час виконання для NumPy: 2.0589999394360348e-06 секунд
Домогосподарства з напругою більше 235 В:
[['16/12/2006' '17:28:00' '3.666' ... '0.0' '1.0' '17.0']
 ['16/12/2006' '17:29:00' '3.52' ... '0.0' '2.0' '17.0']
 ['16/12/2006' '17:30:00' '3.702' ... '0.0' '1.0' '17.0']
 ...
 ['26/11/2010' '21:00:00' '0.938' ... '0.0' '0.0' '0.0']
 ['26/11/2010' '21:01:00' '0.934' ... '0.0' '0.0' '0.0']
 ['26/11/2010' '21:02:00' '0.932' ... '0.0' '0.0' '0.0']]


In [None]:
# Task 3 (NumPy): rows with global intensity (column 5) in [19, 20] where
# Sub_metering_2 (column 7) is greater than Sub_metering_3 (column 8).
start_time = timeit.default_timer()

# Convert the intensity column once instead of once per comparison.
intensity = np_data[:, 5].astype(float)
mask = (intensity >= 19) & (intensity <= 20)
current_filtered = np_data[mask]

# Same condition as the pandas cell and the printed caption: group 2 alone
# versus group 3. Adding Sub_metering_1 (column 6) to the left-hand side
# would select a different set of rows than the pandas version.
sub_metering_2 = current_filtered[:, 7].astype(float)
sub_metering_3 = current_filtered[:, 8].astype(float)
mask_appliance = sub_metering_2 > sub_metering_3
selected_rows = current_filtered[mask_appliance]

end_time = timeit.default_timer()
total_time_np = end_time - start_time

# Report the time measured in this cell, not a value left over from an earlier one.
print(f"Час виконання для NumPy: {total_time_np} секунд")
print("Домогосподарства з силою струму 19-20 А, де пральна машина та холодильник споживають більше:")
print(selected_rows)

Час виконання для NumPy: 2.0589999394360348e-06 секунд
Домогосподарства з силою струму 19-20 А, де пральна машина та холодильник споживають більше:
[['16/12/2006' '18:09:00' '4.464' ... '0.0' '37.0' '16.0']
 ['17/12/2006' '01:04:00' '4.582' ... '0.0' '13.0' '0.0']
 ['17/12/2006' '01:08:00' '4.618' ... '0.0' '27.0' '0.0']
 ...
 ['24/11/2010' '07:55:00' '4.602' ... '0.0' '40.0' '17.0']
 ['24/11/2010' '07:56:00' '4.536' ... '0.0' '39.0' '17.0']
 ['24/11/2010' '07:57:00' '4.626' ... '0.0' '39.0' '17.0']]


In [None]:
# Task 4 (NumPy): mean of the three sub-metering groups (columns 6-8) over a
# random sample of rows drawn without replacement.
start_time = timeit.default_timer()

# A seeded generator makes the sample reproducible across kernel restarts;
# the size is capped at the number of rows so the draw cannot fail on a
# smaller array.
rng = np.random.default_rng(42)
sample_size = min(500000, np_data.shape[0])
random_indices = rng.choice(np_data.shape[0], size=sample_size, replace=False)
selected_rows = np_data[random_indices]

average_consumptions = np.mean(selected_rows[:, 6:9].astype(float), axis=0)

end_time = timeit.default_timer()
total_time_np = end_time - start_time

# Report the time measured in this cell, not a value left over from an earlier one.
print(f"Час виконання для NumPy: {total_time_np} секунд")
print("Середні величини усіх 3-х груп споживання електричної енергії:")
print("Sub_metering_1:", average_consumptions[0], "Sub_metering_2:", average_consumptions[1], "Sub_metering_3:", average_consumptions[2])

Час виконання для NumPy: 2.0589999394360348e-06 секунд
Середні величини усіх 3-х груп споживання електричної енергії:
Sub_metering_1: 1.118904 Sub_metering_2: 1.3126 Sub_metering_3: 6.465132


In [None]:
# Task 5 (NumPy): evening readings with active power (column 2) above 6 kW,
# restricted to rows where Sub_metering_2 is strictly the largest group; then
# every 3rd row of the first half and every 4th row of the second half.
start_time = timeit.default_timer()

# Times are zero-padded 'HH:MM:SS' strings, so lexicographic comparison
# orders them chronologically.
# NOTE(review): this excludes 18:00:00 itself, while the pandas cell uses
# `hour >= 18`, which includes it — confirm which boundary is intended.
mask_time = np_data[:, 1].astype(str) > '18:00:00'
mask_power = np_data[:, 2].astype(float) > 6
selected_rows = np_data[mask_time & mask_power]

sub_metering_1 = selected_rows[:, 6].astype(float)
sub_metering_2 = selected_rows[:, 7].astype(float)
sub_metering_3 = selected_rows[:, 8].astype(float)
# Group 2 must exceed both other groups (same rule as the pandas cell).
# Testing `group 1 + group 2 > group 3` instead would also keep rows where
# Sub_metering_2 is 0.
mask_group = (sub_metering_2 > sub_metering_1) & (sub_metering_2 > sub_metering_3)
final_selected = selected_rows[mask_group]

half = len(final_selected) // 2
first_half = final_selected[:half:3]
second_half = final_selected[half::4]
result = np.vstack((first_half, second_half))

end_time = timeit.default_timer()
total_time_np = end_time - start_time

# Report the time measured in this cell, not a value left over from an earlier one.
print(f"Час виконання для NumPy: {total_time_np} секунд")
print("Домогосподарства, які після 18-00 споживають понад 6 кВт за хвилину в середньому:")
print(result)

Час виконання для NumPy: 2.0589999394360348e-06 секунд
Домогосподарства, які після 18-00 споживають понад 6 кВт за хвилину в середньому:
[['16/12/2006' '18:05:00' '6.052' ... '0.0' '37.0' '17.0']
 ['16/12/2006' '18:08:00' '6.308' ... '0.0' '36.0' '17.0']
 ['22/12/2006' '21:27:00' '6.906' ... '20.0' '0.0' '16.0']
 ...
 ['20/11/2010' '18:38:00' '6.302' ... '15.0' '34.0' '17.0']
 ['20/11/2010' '18:42:00' '6.238' ... '14.0' '35.0' '16.0']
 ['20/11/2010' '18:46:00' '6.438' ... '13.0' '39.0' '16.0']]
