# Экспорт и импорт данных

1. Подготовить данные для построения модели.
2. Сохранить готовые данные в форматах CSV и HDF5.

In [1]:
import pandas as pd
import numpy as np

from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from core.reduce_mem_usage import reduce_mem_usage

## Загрузка данных

In [2]:
# Load the three source CSV files into DataFrames, in this order:
# buildings, weather, energy.
buildings, weather, energy = map(
    pd.read_csv,
    (
        "../data/buildings.csv",
        "../data/weather.csv",
        "../data/train.0.csv",
    ),
)

## Взять 20 зданий, объединить, оптимизировать

In [3]:
# Keep only the weather rows for site 0. The explicit copy makes `weather` an
# independent frame, so the column assignments performed on it afterwards
# modify it directly rather than a slice of the originally loaded table
# (which is what triggers pandas' SettingWithCopyWarning).
weather = weather[weather["site_id"] == 0].copy()

# Keep the readings of buildings whose id is below 20 and attach the matching
# row of `buildings` to each reading. A left join keeps every reading even when
# no building row matches.
energy = energy[energy["building_id"] < 20].merge(
    buildings,
    how="left",
    on="building_id",
)

# The building columns now live in `energy`; drop the separate frame.
del buildings

## Интерполяция значений

In [4]:
# Clamp precipitation to be non-negative. Every value that is not strictly
# positive -- negatives, zeros and NaN (NaN > 0 is False) -- becomes 0, so this
# column has no gaps left by the time it reaches the interpolation loop below.
precipitation = weather["precip_depth_1_hr"]
weather["precip_depth_1_hr"] = precipitation.where(precipitation > 0, 0)

interpolate_columns = [
    "air_temperature",
    "dew_temperature",
    "cloud_coverage",
    "wind_speed",
    "wind_direction",
    "precip_depth_1_hr",
    "sea_level_pressure",
]

# Fill missing values in every listed column; limit_direction="both" also fills
# gaps at the very start and end of a column.
# NOTE: this call used to pass kind="cubic", but `kind` is not a parameter of
# Series.interpolate. With the default method="linear" the extra keyword is
# silently ignored, so the interpolation has always been linear. The method is
# now spelled out and the dead argument removed; the results are unchanged.
for col in interpolate_columns:
    weather[col] = weather[col].interpolate(
        method="linear",
        limit_direction="both",
    )

## Обогащение данных: погода

In [5]:
# First difference of air temperature between consecutive rows. diff() leaves
# the first row undefined, so it is filled with the value of the second row.
# The first/second rows are addressed by position (.iloc) rather than by the
# labels 0 and 1: `weather` is a filtered frame, and nothing guarantees that
# its index still starts at 0. Requires at least two rows.
temperature_diff_1 = weather["air_temperature"].diff()
temperature_diff_1.iloc[0] = temperature_diff_1.iloc[1]
weather["air_temperature_diff_1"] = temperature_diff_1

# Second difference, computed from the already patched first difference and
# patched at the first row in the same way.
temperature_diff_2 = temperature_diff_1.diff()
temperature_diff_2.iloc[0] = temperature_diff_2.iloc[1]
weather["air_temperature_diff_2"] = temperature_diff_2