# Оптимизация регрессии

Сравнение нескольких моделей линейной регрессии, чтобы найти оптимальную для первых 20 зданий

In [3]:
import pandas as pd
import numpy as np

from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, BayesianRidge

from core.reduce_mem_usage import reduce_mem_usage

## Загрузка данных

In [2]:
# Read the three raw inputs (building metadata, weather observations, meter readings)
# in one pass; the paths are relative to the notebook's working directory.
buildings, weather, energy = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/buildings.csv",
        "../data/weather.csv",
        "../data/train.0.csv",
    )
)

## Взять 20 зданий, объединить, оптимизировать

In [4]:
# Keep only rows whose building_id is below 20, attach each building's metadata,
# then attach the weather row matching the reading's (timestamp, site_id) pair.
# Both joins are left joins, so every energy row survives even without a match.
energy = (
    energy[energy["building_id"] < 20]
    .merge(buildings, how="left", on="building_id")
    .set_index(["timestamp", "site_id"])
    .merge(
        weather.set_index(["timestamp", "site_id"]),
        how="left",
        left_index=True,
        right_index=True,
    )
    # Turn the (timestamp, site_id) index back into ordinary columns.
    .reset_index()
    # Discard the listed columns (presumably unused by the models below - confirm).
    .drop(columns=["meter", "site_id", "year_built", "square_feet", "floor_count"])
)

# The raw lookup tables are no longer needed once merged; release their memory.
del buildings, weather

# Project helper; judging by the dtypes reported afterwards it downcasts the
# columns to smaller types - see core.reduce_mem_usage for the exact rules.
energy = reduce_mem_usage(energy)

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the report.
energy.info()

Потребление памяти меньше на - 10.39 Мб (минус 70.5%)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175680 entries, 0 to 175679
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   timestamp           175680 non-null  datetime64[ns]
 1   building_id         175680 non-null  int8          
 2   meter_reading       175680 non-null  float16       
 3   primary_use         175680 non-null  category      
 4   air_temperature     175620 non-null  float16       
 5   cloud_coverage      99080 non-null   float16       
 6   dew_temperature     175620 non-null  float16       
 7   precip_depth_1_hr   175660 non-null  float16       
 8   sea_level_pressure  173980 non-null  float16       
 9   wind_direction      170680 non-null  float16       
 10  wind_speed          175680 non-null  float16       
dtypes: category(1), datetime64[ns](1), float16(8), int8(1)
memory usage: 4.4 MB
None


## Обогащение данных: час, дни недели, праздники

In [5]:
# Hour of day and day of week (pandas convention: Monday=0 ... Sunday=6), kept as int8.
energy["hour"] = energy["timestamp"].dt.hour.astype("int8")
energy["weekday"] = energy["timestamp"].dt.weekday.astype("int8")

# One 0/1 indicator column per weekday. Note the column names contain a space
# ("is_wday 0" ... "is_wday 6").
for weekday in range(7):
    energy[f"is_wday {weekday}"] = (energy["weekday"] == weekday).astype("int8")

# Calendar date of each reading (time part dropped), used for the holiday lookup.
energy["date"] = pd.to_datetime(energy["timestamp"].dt.date)

# US federal holidays between 2015-12-31 and 2017-01-01 inclusive.
dates_range = pd.date_range(start="2015-12-31", end="2017-01-01")
us_holidays = calendar().holidays(start=dates_range[0], end=dates_range[-1])

# 1 when the reading's date is a federal holiday, 0 otherwise.
# The column name "is_holyday" is kept as spelled.
energy["is_holyday"] = energy["date"].isin(us_holidays).astype("int8")

## Логарифм

In [6]:
# Log-scale target: log(1 + meter_reading). np.log1p is used instead of
# np.log(x + 1) because meter_reading is stored as float16, where the sum x + 1
# rounds to exactly 1 for very small readings and the logarithm collapses to 0.
# The matching inverse transform is np.expm1.
energy["meter_reading_log"] = np.log1p(energy["meter_reading"])

## Разделение данных

In [7]:
# Hold out 20% of the rows for testing. Only rows with a strictly positive
# meter reading take part in the split.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# train/test partition (and therefore comparable model scores) on every run.
energy_train, energy_test = train_test_split(
    energy[energy["meter_reading"] > 0],
    test_size=0.2,
    random_state=42,
)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print(), which would add a stray "None" line.
energy_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86858 entries, 85331 to 175335
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   timestamp           86858 non-null  datetime64[ns]
 1   building_id         86858 non-null  int8          
 2   meter_reading       86858 non-null  float16       
 3   primary_use         86858 non-null  category      
 4   air_temperature     86858 non-null  float16       
 5   cloud_coverage      50535 non-null  float16       
 6   dew_temperature     86858 non-null  float16       
 7   precip_depth_1_hr   86858 non-null  float16       
 8   sea_level_pressure  86402 non-null  float16       
 9   wind_direction      84023 non-null  float16       
 10  wind_speed          86858 non-null  float16       
 11  hour                86858 non-null  int8          
 12  weekday             86858 non-null  int8          
 13  is_wday 0           86858 non-null  int8 

In [8]:
# Preview the first rows of the training split to eyeball the engineered columns.
energy_train.head()

Unnamed: 0,timestamp,building_id,meter_reading,primary_use,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,weekday,is_wday 0,is_wday 1,is_wday 2,is_wday 3,is_wday 4,is_wday 5,is_wday 6,date,is_holyday,meter_reading_log
85331,2016-06-26 18:00:00,11,480.5,Education,32.8125,4.0,23.296875,0.0,1018.5,110.0,3.099609,18,6,0,0,0,0,0,0,1,2016-06-26,0,6.175781
153373,2016-11-15 12:00:00,13,467.25,Education,15.0,6.0,12.203125,0.0,1016.5,350.0,3.099609,12,1,0,1,0,0,0,0,0,2016-11-15,0,6.148438
82696,2016-06-21 06:00:00,16,1206.0,Education,21.09375,0.0,16.09375,0.0,1020.5,30.0,2.599609,6,1,0,1,0,0,0,0,0,2016-06-21,0,7.097656
75737,2016-06-06 18:00:00,17,46.40625,Office,29.40625,8.0,23.90625,0.0,1010.0,160.0,6.199219,18,0,1,0,0,0,0,0,0,2016-06-06,0,3.859375
170335,2016-12-20 20:00:00,15,281.25,Office,20.0,,16.703125,0.0,1025.0,10.0,5.699219,20,1,0,1,0,0,0,0,0,2016-12-20,0,5.644531
