# Иерархия моделей

1. Посчитать модель линейной регрессии по первым 100 зданиям.
2. Найти точность, используя только дни недели и праздники, применяя fit_intercept=False и логарифмируя целевой показатель.

Для вычисления отсутствующих или некорректных данных построить модели по всем зданиям одного типа в одном и во всех городах.

## Подключение библиотек

In [1]:
import pandas as pd
import numpy as np

from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from core.reduce_mem_usage import reduce_mem_usage

## Загрузка данных

In [2]:
# Read the three input CSV files (buildings, weather, meter readings) in one
# pass; the tuple order below fixes which file lands in which variable.
buildings, weather, energy = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/buildings.csv",
        "../data/weather.csv",
        "../data/train.0.csv",
    )
)

## Отбор 100 зданий, объединение и оптимизация

In [3]:
# Keep only the readings of buildings with an id below 100, then attach the
# per-building attributes from `buildings`.
energy = energy[energy["building_id"] < 100]
energy = pd.merge(
    left=energy,
    right=buildings,
    how="left",
    on="building_id",
)

# Attach the weather rows that share the same (timestamp, site_id) pair.
# Joining on the index and resetting it afterwards puts these two keys first
# in the resulting column order.
weather_keys = ["timestamp", "site_id"]
energy = pd.merge(
    left=energy.set_index(weather_keys),
    right=weather.set_index(weather_keys),
    how="left",
    left_index=True,
    right_index=True,
).reset_index()

# `columns=` already names the axis, so a separate `axis` argument is not
# needed.
energy = energy.drop(
    columns=[
        "meter",
        "year_built",
        "square_feet",
        "floor_count",
    ],
)

# The source tables are fully merged into `energy`; release them before the
# memory-reduction pass.
del buildings
del weather

energy = reduce_mem_usage(energy)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
energy.info()

Потребление памяти меньше на - 56.89 Мб (минус 71.9%)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 864557 entries, 0 to 864556
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   timestamp           864557 non-null  datetime64[ns]
 1   site_id             864557 non-null  int8          
 2   building_id         864557 non-null  int8          
 3   meter_reading       864557 non-null  float16       
 4   primary_use         864557 non-null  category      
 5   air_temperature     864263 non-null  float16       
 6   cloud_coverage      487693 non-null  float16       
 7   dew_temperature     864263 non-null  float16       
 8   precip_depth_1_hr   864459 non-null  float16       
 9   sea_level_pressure  856210 non-null  float16       
 10  wind_direction      839970 non-null  float16       
 11  wind_speed          864557 non-null  float16       
dtypes: category(1), datetime64[ns](1

## Обогащение данных: час, дни недели, праздники

In [4]:
# Calendar features derived from the reading timestamp.
timestamps = energy["timestamp"]
energy["hour"] = timestamps.dt.hour.astype("int8")
energy["weekday"] = timestamps.dt.weekday.astype("int8")

# One 0/1 indicator column per weekday number (pandas counts Monday as 0).
for weekday in range(7):
    is_this_weekday = energy["weekday"] == weekday
    energy[f"is_wday {weekday}"] = is_this_weekday.astype("int8")

# Midnight-aligned date of each reading, used to look holidays up by day.
energy["date"] = pd.to_datetime(timestamps.dt.date)

# US federal holidays within the fixed window 2015-12-31 .. 2017-01-01.
dates_range = pd.date_range(start="2015-12-31", end="2017-01-01")
us_holidays = calendar().holidays(
    start=dates_range.min(),
    end=dates_range.max(),
)
on_holiday = energy["date"].isin(us_holidays)
energy["is_holiday"] = on_holiday.astype("int8")

## Логарифмирование данных

In [5]:
# Regression target: log(1 + meter_reading). log1p evaluates this without
# first forming `reading + 1`, which in a low-precision float column rounds
# small readings to exactly 1 and turns their logarithm into 0.
energy["meter_reading_log"] = np.log1p(energy["meter_reading"])

## Разделение данных

In [6]:
# Split the rows with a positive meter reading into 80% train / 20% test.
# The fixed seed makes the split, and every model fitted on it, identical
# between notebook runs.
energy_train, energy_test = train_test_split(
    energy[energy["meter_reading"] > 0],
    test_size=0.2,
    random_state=42,
)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print().
energy_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 427872 entries, 385883 to 710027
Data columns (total 24 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   timestamp           427872 non-null  datetime64[ns]
 1   site_id             427872 non-null  int8          
 2   building_id         427872 non-null  int8          
 3   meter_reading       427872 non-null  float16       
 4   primary_use         427872 non-null  category      
 5   air_temperature     427867 non-null  float16       
 6   cloud_coverage      248665 non-null  float16       
 7   dew_temperature     427867 non-null  float16       
 8   precip_depth_1_hr   427869 non-null  float16       
 9   sea_level_pressure  425626 non-null  float16       
 10  wind_direction      414086 non-null  float16       
 11  wind_speed          427872 non-null  float16       
 12  hour                427872 non-null  int8          
 13  weekday             4278

## Линейная регрессия:

1. по часам

In [7]:
# Fit one linear regression without intercept per (building, hour) pair.
# Regressors are the holiday flag and the seven weekday indicators; the
# target is the log-transformed meter reading.
hours = range(0, 24)
# int() keeps the "+ 1" in Python integer arithmetic regardless of the
# column's integer dtype.
buildings = range(0, int(energy_train["building_id"].max()) + 1)

# Columns that identify the target or the group rather than a regressor.
non_feature_columns = [
    "meter_reading_log",
    "hour",
    "building_id",
]
lr_columns = [
    *non_feature_columns,
    "is_holiday",
    *(f"is_wday {wday}" for wday in range(0, 7)),
]
# One slot per regressor plus a trailing slot for the intercept.
n_coefficients = len(lr_columns) - len(non_feature_columns) + 1

energy_train_lr = pd.DataFrame(energy_train, columns=lr_columns)

# energy_lr[building][hour] is the coefficient vector
# [is_holiday, is_wday 0 .. is_wday 6, intercept].
# Every slot starts as zeros of the final length, so a pair without training
# rows keeps the same shape as a fitted one and each building has exactly
# len(hours) entries.
energy_lr = [
    [np.zeros(n_coefficients) for _ in hours]
    for _ in buildings
]

for building in buildings:
    energy_train_b = energy_train_lr[energy_train_lr["building_id"] == building]
    for hour in hours:
        energy_train_bh = energy_train_b[energy_train_b["hour"] == hour]
        if energy_train_bh.empty:
            # Nothing to fit for this pair; keep the zero vector.
            continue
        y = energy_train_bh["meter_reading_log"]
        x = energy_train_bh.drop(columns=non_feature_columns)
        model = LinearRegression(fit_intercept=False).fit(x, y)
        # With fit_intercept=False the intercept is 0.0; it is still stored
        # so the vector layout stays [coefficients..., intercept].
        energy_lr[building][hour] = np.append(model.coef_, model.intercept_)

print(energy_lr[0])

[array([-0.14266659,  5.48859534,  5.42505787,  5.49555496,  5.44189769,
        5.47925173,  5.41436298,  5.45272091,  0.        ]), array([-0.03556129,  5.46504843,  5.48317308,  5.4811481 ,  5.43870647,
        5.424375  ,  5.46831597,  5.4421875 ,  0.        ]), array([-0.09178188,  5.42429508,  5.43923611,  5.49739583,  5.49632753,
        5.46313414,  5.47237723,  5.44173177,  0.        ]), array([-0.14011557,  5.52352896,  5.49473505,  5.4703776 ,  5.46563633,
        5.45341712,  5.45878233,  5.48350694,  0.        ]), array([-0.11436068,  5.47018701,  5.42390046,  5.50811298,  5.48918928,
        5.45330306,  5.455     ,  5.4384375 ,  0.        ]), array([-0.05101732,  5.44760242,  5.4367488 ,  5.47828125,  5.4640625 ,
        5.43342692,  5.4266183 ,  5.44260817,  0.        ]), array([0.04638672, 5.4523112 , 5.44850852, 5.4645544 , 5.43574219,
       5.45911458, 5.44679418, 5.45751953, 0.        ]), array([-0.1180114 ,  5.50209044,  5.45630787,  5.47809103,  5.48244934,
     