# Линейная регрессия
Построение модели линейной регрессии для энергопотребления здания по температуре воздуха и влажности.

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## Загрузка, объединение, фильтрация данных

In [2]:
# Load the three raw tables: building metadata, weather observations and
# the meter readings.
buildings = pd.read_csv("../data/buildings.csv")
weather = pd.read_csv("../data/weather.csv")
energy = pd.read_csv("../data/train.csv")

# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None" line.
energy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   building_id    8784 non-null   int64  
 1   meter          8784 non-null   int64  
 2   timestamp      8784 non-null   object 
 3   meter_reading  8784 non-null   float64
dtypes: float64(1), int64(2), object(1)
memory usage: 274.6+ KB
None


In [3]:
# Attach building metadata to every meter reading; this is where the
# site_id column used for the weather join comes from.
energy = pd.merge(left=energy, right=buildings, how="left", on="building_id")

# Join the weather observations on the (timestamp, site_id) pair. Both frames
# are indexed by those keys and merged index-to-index. Note that `weather`
# is re-indexed in place and stays indexed by the join keys after this cell.
energy.set_index(["timestamp", "site_id"], inplace=True)
weather.set_index(["timestamp", "site_id"], inplace=True)
energy = pd.merge(left=energy, right=weather, how="left", left_index=True, right_index=True)
energy.reset_index(inplace=True)

# Keep only strictly positive readings. The explicit copy makes the filtered
# result an independent frame, so the column assignments below do not raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
energy = energy[energy["meter_reading"] > 0].copy()

# Parse the timestamp strings and derive the hour of day as a feature.
energy["timestamp"] = pd.to_datetime(energy["timestamp"])
energy["hour"] = energy["timestamp"].dt.hour

print(energy.head())


               timestamp  site_id  building_id  meter  meter_reading  \
704  2016-01-30 08:00:00        0            0      0        43.6839   
725  2016-01-31 05:00:00        0            0      0        37.5408   
737  2016-01-31 17:00:00        0            0      0        52.5571   
2366 2016-04-08 14:00:00        0            0      0        59.3827   
2923 2016-05-01 19:00:00        0            0      0       448.0000   

     primary_use  square_feet  year_built  floor_count  air_temperature  \
704    Education         7432      2008.0          NaN              8.3   
725    Education         7432      2008.0          NaN             12.8   
737    Education         7432      2008.0          NaN             20.6   
2366   Education         7432      2008.0          NaN             21.7   
2923   Education         7432      2008.0          NaN             31.1   

      cloud_coverage  dew_temperature  precip_depth_1_hr  sea_level_pressure  \
704              NaN              6.

## Разделение данных на обучение и проверку

In [4]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle, and therefore every result computed from the split, reproducible
# across notebook runs; the particular seed value is arbitrary.
energy_train, energy_test = train_test_split(energy, test_size=0.2, random_state=42)
print(energy_train.head())

               timestamp  site_id  building_id  meter  meter_reading  \
3563 2016-05-28 11:00:00        0            0      0        174.735   
7926 2016-11-26 06:00:00        0            0      0        234.801   
7996 2016-11-29 04:00:00        0            0      0        225.927   
4489 2016-07-06 01:00:00        0            0      0        236.166   
5514 2016-08-17 18:00:00        0            0      0        310.565   

     primary_use  square_feet  year_built  floor_count  air_temperature  \
3563   Education         7432      2008.0          NaN             21.7   
7926   Education         7432      2008.0          NaN             17.8   
7996   Education         7432      2008.0          NaN             20.6   
4489   Education         7432      2008.0          NaN             31.1   
5514   Education         7432      2008.0          NaN             32.2   

      cloud_coverage  dew_temperature  precip_depth_1_hr  sea_level_pressure  \
3563             2.0             19.