# Оптимизация потребления памяти
Загрузка данных по энергопотреблению всех зданий в оперативную память. Цель — добиться минимального расхода памяти.

In [1]:
import pandas as pd
import numpy as np

## Точность и размер типов

In [2]:
# Print the machine limits of the candidate dtypes: floats first (via np.finfo),
# then signed integers (via np.iinfo), in the same order as before.
for describe, type_codes in (
    (np.finfo, ("f2", "f4")),
    (np.iinfo, ("i1", "i2", "i4")),
):
    for type_code in type_codes:
        print(describe(type_code))

Machine parameters for float16
---------------------------------------------------------------
precision =   3   resolution = 1.00040e-03
machep =    -10   eps =        9.76562e-04
negep =     -11   epsneg =     4.88281e-04
minexp =    -14   tiny =       6.10352e-05
maxexp =     16   max =        6.55040e+04
nexp =        5   min =        -max
smallest_normal = 6.10352e-05   smallest_subnormal = 5.96046e-08
---------------------------------------------------------------

Machine parameters for float32
---------------------------------------------------------------
precision =   6   resolution = 1.0000000e-06
machep =    -23   eps =        1.1920929e-07
negep =     -24   epsneg =     5.9604645e-08
minexp =   -126   tiny =       1.1754944e-38
maxexp =    128   max =        3.4028235e+38
nexp =        8   min =        -max
smallest_normal = 1.1754944e-38   smallest_subnormal = 1.4012985e-45
---------------------------------------------------------------

Machine parameters for int8
------

## Загрузка данных

In [3]:
# Read the three input tables from CSV. No dtype hints are passed, so pandas
# infers every column type on its own.
buildings = pd.read_csv("../data/buildings.csv")
weather = pd.read_csv("../data/weather.csv")
energy = pd.read_csv("../data/train.0.csv")

## Потребление памяти

In [4]:
# Report the footprint of each loaded frame, converted from bytes by 1024**2.
# NOTE(review): memory_usage() is called without deep=True, so object (string)
# columns are presumably measured by reference size only - confirm against the
# pandas documentation before relying on these figures.
for title, frame in (
    ("Строения", buildings),
    ("Погода", weather),
    ("Энергопотребление", energy),
):
    print(f"{title}: {frame.memory_usage().sum()/1024**2} Мб")

Строения: 0.0664520263671875 Мб
Погода: 9.597572326660156 Мб
Энергопотребление: 368.06988525390625 Мб


## Функция оптимизации памяти

In [5]:
def reduce_mem_usage(df: pd.DataFrame):
    """Downcast the columns of ``df`` in place and print the memory saved.

    Each column is handled by the first rule that matches:

    * dtype name starts with ``"float"``: cast to float16 if the column's
      min and max fit, otherwise float32, otherwise float64;
    * dtype name starts with ``"int"``: cast to the narrowest of
      int8/int16/int32/int64 whose range contains the column's min and max;
    * column is named ``"timestamp"``: parsed with ``pd.to_datetime``;
    * any other column whose dtype name does not start with ``"datetime"``:
      converted to ``category``.

    Range limits are inclusive, so a value equal to a type's min or max
    still fits that type (e.g. a column spanning -128..127 becomes int8).

    Only the value range is checked for floats. float16 carries about three
    significant decimal digits, so values cast to it may be rounded.

    Args:
        df: Frame to shrink. It is modified in place.

    Returns:
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage().sum()/1024**2

    for col in df.columns:
        dtype_name = str(df[col].dtypes)

        if dtype_name.startswith("float"):
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    break
            else:
                # Nothing narrower fits; this is also the path for all-NaN
                # columns, because comparisons with NaN are always False.
                candidate = np.float64
            df[col] = df[col].astype(candidate)

        elif dtype_name.startswith("int"):
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # If no candidate matched (e.g. empty column, min/max are NaN)
            # the column is left untouched.

        elif col == "timestamp":
            df[col] = pd.to_datetime(df[col])

        elif not dtype_name.startswith("datetime"):
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum()/1024**2
    print(f"Потребление памяти меньше на - {round(start_mem - end_mem, 2)} Мб (минус {round(100*(start_mem - end_mem)/start_mem, 1)}%")
    return df

## Оптимизация памяти: здания

In [6]:
# Shrink the buildings table and show the resulting column dtypes.
buildings = reduce_mem_usage(buildings)
# DataFrame.info() writes its report itself and returns None, which is why
# the recorded output ends with a "None" line.
print(buildings.info())

Потребление памяти меньше на - 0.05 Мб (минус 73.9%
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1449 entries, 0 to 1448
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   site_id      1449 non-null   int8    
 1   building_id  1449 non-null   int16   
 2   primary_use  1449 non-null   category
 3   square_feet  1449 non-null   int32   
 4   year_built   675 non-null    float16 
 5   floor_count  355 non-null    float16 
dtypes: category(1), float16(2), int16(1), int32(1), int8(1)
memory usage: 17.8 KB
None


## Оптимизация памяти: погода

In [7]:
# Shrink the weather table; its "timestamp" column is parsed to datetime by
# reduce_mem_usage.
weather = reduce_mem_usage(weather)
# DataFrame.info() writes its report itself and returns None, which is why
# the recorded output ends with a "None" line.
print(weather.info())

Потребление памяти меньше на - 6.53 Мб (минус 68.1%
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139773 entries, 0 to 139772
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   site_id             139773 non-null  int8          
 1   timestamp           139773 non-null  datetime64[ns]
 2   air_temperature     139718 non-null  float16       
 3   cloud_coverage      70600 non-null   float16       
 4   dew_temperature     139660 non-null  float16       
 5   precip_depth_1_hr   89484 non-null   float16       
 6   sea_level_pressure  129155 non-null  float16       
 7   wind_direction      133505 non-null  float16       
 8   wind_speed          139469 non-null  float16       
dtypes: datetime64[ns](1), float16(7), int8(1)
memory usage: 3.1 MB
None


## Оптимизация памяти: энергопотребление

In [8]:
# Shrink the meter-readings table, the largest of the three frames.
energy = reduce_mem_usage(energy)
# DataFrame.info() writes its report itself and returns None, which is why
# the recorded output ends with a "None" line.
print(energy.info())

Потребление памяти меньше на - 195.54 Мб (минус 53.1%
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12060910 entries, 0 to 12060909
Data columns (total 4 columns):
 #   Column         Dtype         
---  ------         -----         
 0   building_id    int16         
 1   meter          int8          
 2   timestamp      datetime64[ns]
 3   meter_reading  float32       
dtypes: datetime64[ns](1), float32(1), int16(1), int8(1)
memory usage: 172.5 MB
None


## Объединение данных

In [9]:
# Step 1: attach the building attributes to every reading. A left join keeps
# all rows of the readings table.
energy = energy.merge(buildings, how="left", on="building_id")

# Step 2: attach the weather observed at the same site and timestamp. Both
# frames are indexed by the join keys and merged index-to-index.
join_keys = ["timestamp", "site_id"]
energy = energy.set_index(join_keys).merge(
    weather.set_index(join_keys),
    how="left",
    left_index=True,
    right_index=True,
)

# Step 3: turn the join keys back into columns, then discard the two columns
# that are not kept in the final frame.
energy = energy.reset_index()
energy = energy.drop(columns=["site_id", "meter"])
print(energy.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12060910 entries, 0 to 12060909
Data columns (total 14 columns):
 #   Column              Dtype         
---  ------              -----         
 0   timestamp           datetime64[ns]
 1   building_id         int16         
 2   meter_reading       float32       
 3   primary_use         category      
 4   square_feet         int32         
 5   year_built          float16       
 6   floor_count         float16       
 7   air_temperature     float16       
 8   cloud_coverage      float16       
 9   dew_temperature     float16       
 10  precip_depth_1_hr   float16       
 11  sea_level_pressure  float16       
 12  wind_direction      float16       
 13  wind_speed          float16       
dtypes: category(1), datetime64[ns](1), float16(9), float32(1), int16(1), int32(1)
memory usage: 425.6 MB
None
