# Линейная регрессия
Построим модель линейной регрессии энергопотребления здания по температуре воздуха и влажности (точке росы).

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## Загрузка, объединение, фильтрация данных

In [2]:
# Load the three source tables: building metadata, weather observations
# and the meter readings.
buildings = pd.read_csv(filepath_or_buffer="../data/buildings.csv")
weather = pd.read_csv(filepath_or_buffer="../data/weather.csv")
energy = pd.read_csv(filepath_or_buffer="../data/train.csv")

# DataFrame.info() writes its summary to stdout and returns None, so the
# surrounding print() also emits a trailing "None" line.
print(energy.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   building_id    8784 non-null   int64  
 1   meter          8784 non-null   int64  
 2   timestamp      8784 non-null   object 
 3   meter_reading  8784 non-null   float64
dtypes: float64(1), int64(2), object(1)
memory usage: 274.6+ KB
None


In [3]:
# Attach the building metadata to every meter reading; the left join keeps
# all rows of the readings table.
energy = energy.merge(buildings, how="left", on="building_id")

# Index both tables by (timestamp, site_id) so that the weather rows can be
# joined index-to-index. `weather` is re-indexed in place and stays indexed.
energy = energy.set_index(["timestamp", "site_id"])
weather.set_index(["timestamp", "site_id"], inplace=True)
energy = energy.merge(weather, how="left", left_index=True, right_index=True)
energy = energy.reset_index()

# Keep only the rows with a strictly positive reading.
energy = energy[energy["meter_reading"].gt(0)]

# Convert the timestamp column to datetimes and derive the hour of day.
energy["timestamp"] = pd.to_datetime(energy["timestamp"])
energy["hour"] = energy["timestamp"].dt.hour

print(energy.head())


               timestamp  site_id  building_id  meter  meter_reading  \
704  2016-01-30 08:00:00        0            0      0        43.6839   
725  2016-01-31 05:00:00        0            0      0        37.5408   
737  2016-01-31 17:00:00        0            0      0        52.5571   
2366 2016-04-08 14:00:00        0            0      0        59.3827   
2923 2016-05-01 19:00:00        0            0      0       448.0000   

     primary_use  square_feet  year_built  floor_count  air_temperature  \
704    Education         7432      2008.0          NaN              8.3   
725    Education         7432      2008.0          NaN             12.8   
737    Education         7432      2008.0          NaN             20.6   
2366   Education         7432      2008.0          NaN             21.7   
2923   Education         7432      2008.0          NaN             31.1   

      cloud_coverage  dew_temperature  precip_depth_1_hr  sea_level_pressure  \
704              NaN              6.

## Разделение данных на обучение и проверку

In [4]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle, and every number later computed from this split, repeatable
# between notebook runs.
energy_train, energy_test = train_test_split(
    energy, test_size=0.2, random_state=42
)
print(energy_train.head())

               timestamp  site_id  building_id  meter  meter_reading  \
8266 2016-12-10 10:00:00        0            0      0        88.7328   
3702 2016-06-03 06:00:00        0            0      0       249.8170   
7067 2016-10-21 11:00:00        0            0      0       261.4210   
7671 2016-11-15 15:00:00        0            0      0       202.0380   
6003 2016-09-07 03:00:00        0            0      0       258.6900   

     primary_use  square_feet  year_built  floor_count  air_temperature  \
8266   Education         7432      2008.0          NaN             10.6   
3702   Education         7432      2008.0          NaN             25.0   
7067   Education         7432      2008.0          NaN             20.6   
7671   Education         7432      2008.0          NaN             18.9   
6003   Education         7432      2008.0          NaN             25.0   

      cloud_coverage  dew_temperature  precip_depth_1_hr  sea_level_pressure  \
8266             NaN              7.

## Модель линейной регрессии и среднее

$$\normalsize meter\_reading = A*air\_temperature+B*dew\_temperature+C$$

где:
- $meter\_reading - потребление\; энергии\; в\; кВтч\; (или\; эквивалент);$
- $air\_temperature - температура\; воздуха, ^{\circ}C;$
- $dew\_temperature - точка\; росы\; (влажность), ^{\circ}C;$
- $A,\; B,\; C - коэффициенты\; модели.$

Дополнительно вычислим среднее по часам, чтобы сравнить линейную регрессию с более простой моделью.

In [6]:
# Baseline model: mean meter reading for each hour value in the training set.
energy_train_avg = (
    energy_train["meter_reading"].groupby(energy_train["hour"]).mean()
)

# Regression table: the target column followed by the two predictors.
energy_train_lr = pd.DataFrame(
    data=energy_train,
    columns=["meter_reading", "air_temperature", "dew_temperature"],
)

# Split the table into target and features, then fit ordinary least squares.
y = energy_train_lr.loc[:, "meter_reading"]
x = energy_train_lr.drop(columns="meter_reading")
model = LinearRegression()
model.fit(x, y)

# Coefficients follow the column order of x (air_temperature,
# dew_temperature); the intercept is printed last.
print(model.coef_, model.intercept_)

[2.20944873 4.11903125] 102.2129872177969
