# Checkpoint 01 – Exercícios adicionais

Este notebook complementa a análise da base **Individual Household Electric Power Consumption** e adiciona as tarefas extras (21–25). Também trabalha com o dataset **Appliances Energy Prediction** para as tarefas 26–35 e replica os exercícios propostos para Orange Data Mining (36–40) usando Python.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
%matplotlib inline

## Carregamento da base de consumo doméstico

Carregamos novamente a base `household_power_consumption.txt`, combinando as colunas **Date** e **Time** em um índice datetime e convertendo as colunas numéricas para valores numéricos (o marcador `?` e quaisquer valores não numéricos passam a ser `NaN`).

In [None]:
# Read every column as text; '?' is parsed as a missing value.
df_hpc = pd.read_csv(
    'hpc_data/household_power_consumption.txt',
    sep=';',
    na_values='?',
    dtype=str,
)
# Join the Date and Time text columns into one timestamp and index the frame by it.
df_hpc['DateTime'] = pd.to_datetime(
    df_hpc['Date'] + ' ' + df_hpc['Time'],
    format='%d/%m/%Y %H:%M:%S',
)
df_hpc = df_hpc.set_index('DateTime')
num_cols = [
    'Global_active_power',
    'Global_reactive_power',
    'Voltage',
    'Global_intensity',
    'Sub_metering_1',
    'Sub_metering_2',
    'Sub_metering_3',
]
# Convert the measurement columns to numbers; anything unparseable becomes NaN.
df_hpc = df_hpc.assign(
    **{name: pd.to_numeric(df_hpc[name], errors='coerce') for name in num_cols}
)

df_hpc.head()

## Tarefa 21 – Séries temporais por hora

In [None]:
# Hourly mean of the active power. The lowercase 'h' alias is used because the
# uppercase 'H' spelling is deprecated in recent pandas releases.
hourly = df_hpc['Global_active_power'].resample('h').mean()
# Average the hourly series by hour of day (0-23) to get the daily profile.
hourly_by_hour = hourly.groupby(hourly.index.hour).mean()
# The five hours of the day with the highest mean active power.
peak_hours = hourly_by_hour.sort_values(ascending=False).head(5)
peak_hours

In [None]:
# Line plot of the hour-of-day profile, labelled through the returned Axes.
ax = hourly_by_hour.plot(marker='o')
ax.set_title('Média horária de Global_active_power ao longo do dia')
ax.set_xlabel('Hora do dia')
ax.set_ylabel('Potência ativa média (kW)')

## Tarefa 22 – Autocorrelação do consumo

In [None]:
# Autocorrelation of the hourly series at lags of 1, 24 and 48 samples.
acf1, acf24, acf48 = (hourly.autocorr(lag=lag) for lag in (1, 24, 48))
(acf1, acf24, acf48)

## Tarefa 23 – PCA das variáveis elétricas

In [None]:
features = [
    'Global_active_power',
    'Global_reactive_power',
    'Voltage',
    'Global_intensity',
]
# Keep only rows where all four variables are present.
df_sel = df_hpc.loc[:, features].dropna()
# Rescale each variable to [0, 1] before the projection.
scaler = MinMaxScaler()
scaler.fit(df_sel)
scaled = scaler.transform(df_sel)
# Project onto the first two principal components.
pca = PCA(n_components=2)
pca_components = pca.fit_transform(scaled)
pca.explained_variance_ratio_

## Tarefa 24 – Visualização de clusters no espaço PCA

In [None]:
# Source series for the daily feature table; each one is averaged per calendar day.
daily_sources = {
    'Active_power_mean': df_hpc['Global_active_power'],
    'Reactive_power_mean': df_hpc['Global_reactive_power'],
    'Voltage_mean': df_hpc['Voltage'],
    'Intensity_mean': df_hpc['Global_intensity'],
    'Total_Sub_metering': (
        df_hpc['Sub_metering_1'] + df_hpc['Sub_metering_2'] + df_hpc['Sub_metering_3']
    ),
}
daily_features = pd.DataFrame(
    {name: series.resample('D').mean() for name, series in daily_sources.items()}
).dropna()

# Cluster the days on min-max scaled features.
scaler_daily = MinMaxScaler()
daily_scaled = scaler_daily.fit_transform(daily_features)
kmeans_daily = KMeans(n_clusters=3, random_state=42, n_init=10)
daily_labels = kmeans_daily.fit_predict(daily_scaled)
label_series = pd.Series(daily_labels, index=daily_features.index)

# Give every PCA point the label of the most recent clustered day at or before
# its timestamp (forward fill from the daily index).
pca_df = pd.DataFrame(pca_components, columns=['PC1', 'PC2'], index=df_sel.index)
pca_labels = label_series.reindex(pca_df.index, method='ffill')

# One scatter call per cluster so each gets its own colour and legend entry.
for c in sorted(pca_labels.dropna().unique()):
    in_cluster = pca_labels == c
    points = pca_df.loc[in_cluster]
    plt.scatter(points['PC1'], points['PC2'], s=2, label=f'Cluster {c}')
plt.title('Clusters no espaço PCA')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()

## Tarefa 25 – Regressão linear vs polinomial

In [None]:
# Predict Global_active_power from Voltage alone, using complete rows only.
df_reg = df_hpc.loc[:, ['Global_active_power', 'Voltage']].dropna()
X = df_reg[['Voltage']]
y = df_reg['Global_active_power']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline: straight-line fit.
lin_model = LinearRegression().fit(X_train, y_train)
y_pred_lin = lin_model.predict(X_test)

# Degree-2 polynomial fit; the expansion is learned on the training split only.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)
poly_model = LinearRegression().fit(X_train_poly, y_train)
y_pred_poly = poly_model.predict(X_test_poly)

# Test-set RMSE for both models.
mse_lin = mean_squared_error(y_test, y_pred_lin)
mse_poly = mean_squared_error(y_test, y_pred_poly)
rmse_lin = np.sqrt(mse_lin)
rmse_poly = np.sqrt(mse_poly)
(rmse_lin, rmse_poly)

## Carregamento da base Appliances Energy Prediction

In [None]:
# Load the Appliances Energy Prediction CSV from the working directory.
df_appl = pd.read_csv(filepath_or_buffer='energydata_complete.csv')
# Preview the first five rows.
df_appl.head(5)

## Tarefa 26 – Inspeção inicial

In [None]:
# Print column names, non-null counts and dtypes.
df_appl.info()
# Summary statistics of the numeric columns (displayed as the cell output).
df_appl.describe()

## Tarefa 27 – Distribuição do consumo de Appliances

In [None]:
# 50-bin histogram of the Appliances column, labelled through the returned Axes.
ax = df_appl['Appliances'].hist(bins=50)
ax.set_title('Histograma de Appliances')
ax.set_xlabel('Consumo (Wh)')
ax.set_ylabel('Frequência')

In [None]:
# Parse the 'date' column to timestamps and plot Appliances against it.
timestamps = pd.to_datetime(df_appl['date'])
plt.plot(timestamps, df_appl['Appliances'])
ax = plt.gca()
ax.set_title('Série temporal de Appliances')
ax.set_xlabel('Data')
ax.set_ylabel('Consumo (Wh)')

## Tarefa 28 – Correlação com variáveis ambientais

In [None]:
# Columns whose name begins with 'T' or 'RH'.
env_cols = [name for name in df_appl.columns if name.startswith(('T', 'RH'))]
# Correlation of each selected column with Appliances, strongest positive first.
appliances = df_appl['Appliances']
correlations = df_appl[env_cols].corrwith(appliances).sort_values(ascending=False)
correlations.head()

## Tarefa 29 – Normalização dos dados

In [None]:
# Every numeric column of the appliances frame.
num_cols = df_appl.select_dtypes(include=[np.number]).columns
# Rescale each numeric column to [0, 1] and rebuild a labelled frame.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_appl[num_cols])
df_appl_scaled = pd.DataFrame(scaled_values, columns=num_cols)
# Sanity check: show the min and max rows of the scaled summary.
scaled_summary = df_appl_scaled.describe()
scaled_summary.loc[['min', 'max']]

## Tarefa 30 – PCA no dataset de Appliances

In [None]:
# Two-component PCA fitted on the min-max scaled appliances frame.
pca_appl = PCA(n_components=2)
# Fit and project in one call; the result holds one row per input row.
components_appl = pca_appl.fit_transform(df_appl_scaled)
# Share of total variance captured by each of the two components.
pca_appl.explained_variance_ratio_

## Tarefa 31 – Regressão linear múltipla

In [None]:
# Target is Appliances; predictors are all other numeric columns.
y = df_appl['Appliances']
X = df_appl[num_cols].drop(columns='Appliances')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

lin_multi = LinearRegression().fit(X_train, y_train)
y_pred = lin_multi.predict(X_test)

# Test-set error metrics.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
(rmse, mae, r2)

## Tarefa 32 – Random Forest Regressor

In [None]:
# Random forest on the same train/test split used by the linear model.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_reg.predict(X_test)

# Same three test-set metrics as the linear model, for comparison.
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
(rmse_rf, mae_rf, r2_rf)

## Tarefa 33 – K-means clustering

In [None]:
# Fit K-means for k = 3, 4, 5 and print each model's inertia.
for k in (3, 4, 5):
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(df_appl_scaled)
    labels = km.labels_
    print(k, km.inertia_)

## Tarefa 34 – Classificação binária (alto vs baixo consumo)

In [None]:
# Binary target: 1 when Appliances is strictly above the dataset median, else 0.
median = df_appl['Appliances'].median()
df_appl['HighConsumption'] = (df_appl['Appliances'] > median).astype(int)

# Features: the numeric columns without the raw target. The derived label is
# dropped too: if num_cols is recomputed after this cell has already run once,
# HighConsumption would be part of it and leak the answer into the features.
# errors='ignore' keeps the first run working, when the label is not in num_cols.
X_bin = (
    df_appl[num_cols]
    .drop(columns=['Appliances'])
    .drop(columns=['HighConsumption'], errors='ignore')
)
y_bin = df_appl['HighConsumption']

# Stratified split keeps the class balance equal in train and test.
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    X_bin, y_bin, test_size=0.2, random_state=42, stratify=y_bin
)

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_bin, y_train_bin)
y_pred_log = log_reg.predict(X_test_bin)

rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train_bin, y_train_bin)
y_pred_rf = rf_clf.predict(X_test_bin)

# Test accuracy of (logistic regression, random forest).
(accuracy_score(y_test_bin, y_pred_log), accuracy_score(y_test_bin, y_pred_rf))

## Tarefa 35 – Avaliação de classificação

In [None]:
# Utility function
def classification_report(model_name, y_true, y_pred):
    """Print the confusion matrix and headline scores for one classifier.

    Args:
        model_name: Label printed on the first line of the report.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        None. The report is written to standard output.
    """
    cm = confusion_matrix(y_true, y_pred)
    # Score names paired with their scorers, in the order they are printed.
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
    report_fields = []
    for label, scorer in scorers:
        report_fields.extend((label, scorer(y_true, y_pred)))
    print(model_name)
    print('Matriz de confusão:', cm)
    print(*report_fields)

# Print the same report for both classifiers, evaluated on the shared test labels.
for model_name, predictions in (
    ('Logistic Regression', y_pred_log),
    ('Random Forest', y_pred_rf),
):
    classification_report(model_name, y_test_bin, predictions)

## Exercícios no estilo Orange (36–40) com Python

### Tarefa 36 – Amostragem de 1% e distribuição

In [None]:
# Reproducible 1% random sample of the rows.
sample_hpc = df_hpc.sample(frac=0.01, random_state=42)
# Mean active power of (full data, sample), to check the sample is representative.
tuple(frame['Global_active_power'].mean() for frame in (df_hpc, sample_hpc))

### Tarefa 37 – Distribuição de Global_active_power

In [None]:
# 50-bin histogram of Global_active_power, labelled through the returned Axes.
ax = df_hpc['Global_active_power'].hist(bins=50)
ax.set_title('Distribuição de Global_active_power')
ax.set_xlabel('Global_active_power (kW)')
ax.set_ylabel('Frequência')

### Tarefa 38 – Relação Voltage vs Global_intensity

In [None]:
# Correlation between Voltage and Global_intensity (Series.corr default method).
voltage = df_hpc['Voltage']
intensity = df_hpc['Global_intensity']
corr_vi = voltage.corr(intensity)
corr_vi

In [None]:
# Sample whole rows so each Voltage value stays paired with the Global_intensity
# reading from the same timestamp. Sampling the two columns separately only
# lined up because both calls shared the same length and seed.
vi_pairs = df_hpc[['Voltage', 'Global_intensity']].dropna()
# Rows with a missing coordinate are removed first so they do not use up the
# sample; n is capped so a short frame does not make sample() raise.
vi_sample = vi_pairs.sample(n=min(5000, len(vi_pairs)), random_state=42)
plt.scatter(vi_sample['Voltage'], vi_sample['Global_intensity'], s=2, alpha=0.5)
plt.title('Scatter Voltage vs Global_intensity')
plt.xlabel('Voltage (V)')
plt.ylabel('Global_intensity (A)')

### Tarefa 39 – K-means em submeterings

In [None]:
# Cluster the three sub-metering readings, using complete rows only.
sub = df_hpc.loc[:, ['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']].dropna()
scaler_sub = MinMaxScaler()
scaler_sub.fit(sub)
scaled_sub = scaler_sub.transform(sub)
km_sub = KMeans(n_clusters=3, random_state=42, n_init=10).fit(scaled_sub)
labels_sub = km_sub.labels_

# Plot the first two scaled sub-meterings, coloured by cluster label.
sub1_scaled = scaled_sub[:, 0]
sub2_scaled = scaled_sub[:, 1]
plt.scatter(sub1_scaled, sub2_scaled, c=labels_sub, s=2, cmap='viridis')
plt.title('K-means em Sub-meterings')
plt.xlabel('Sub_metering_1 (scaled)')
plt.ylabel('Sub_metering_2 (scaled)')