In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

%matplotlib inline

In [2]:
# Report the versions of the main libraries in use, one per line.
print("\n".join(
    "{} {}".format(label, lib.__version__)
    for label, lib in (("Pandas: ", pd), ("Numpy:  ", np), ("sklearn:", sklearn))
))

Pandas:  0.25.2
Numpy:   1.17.3
sklearn: 0.21.3


In [4]:
path = Path('/run/media/javi/NVMe/Datasets/Kaggle/ashrae-energy-prediction')
! ls {path}

building_metadata.csv  test.csv   weather_test.csv
sample_submission.csv  train.csv  weather_train.csv


In [5]:
%%time
metadata_df       = pd.read_csv(path / 'building_metadata.csv')
train_df          = pd.read_csv(path / 'train.csv',         parse_dates=['timestamp'])
test_df           = pd.read_csv(path / 'test.csv',          parse_dates=['timestamp'])
weather_train_df  = pd.read_csv(path / 'weather_train.csv', parse_dates=['timestamp'])
weather_test_df   = pd.read_csv(path / 'weather_test.csv',  parse_dates=['timestamp'])
sample_submission = pd.read_csv(path / 'sample_submission.csv')

CPU times: user 17.4 s, sys: 1.62 s, total: 19.1 s
Wall time: 20 s


In [6]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Only columns whose dtype is int16/int32/int64/float16/float32/float64 are
    considered; all other columns are left untouched. The frame is modified
    in place and the same object is returned.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    representable range contains the column's minimum and maximum. The bounds
    are inclusive, so a column that reaches exactly a dtype's limit (e.g. -128
    or 127) still gets that dtype. Float columns are cast to float16 or float32
    when their extremes fit, and to float64 otherwise.

    NOTE: the float check looks only at the value range, not at precision, so
    narrowing a float column can round its values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink (mutated in place).
    verbose : bool, default True
        When true, print the final memory usage in Mb and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with narrowed column dtypes.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes ordered narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive comparison: values equal to the dtype limits are representable.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate matches when min/max are NaN (empty column): keep the dtype.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range exceeds float32, or the extremes are not comparable
                # (NaN for an all-missing column, +/-inf): use float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


# Shrink every loaded table, in the same order as before. reduce_mem_usage
# mutates its argument and returns it, so each name is rebound to its own frame.
metadata_df, train_df, test_df, weather_train_df, weather_test_df = [
    reduce_mem_usage(frame)
    for frame in (metadata_df, train_df, test_df, weather_train_df, weather_test_df)
]

Mem. usage decreased to  0.03 Mb (60.3% reduction)
Mem. usage decreased to 289.19 Mb (53.1% reduction)
Mem. usage decreased to 596.49 Mb (53.1% reduction)
Mem. usage decreased to  3.07 Mb (68.1% reduction)
Mem. usage decreased to  6.08 Mb (68.1% reduction)


In [7]:
# Preview the first rows of every table, grouped under a printed caption.
print("Edificios metadata:")
display(metadata_df.head())

print("Train & test:")
display(train_df.head(), test_df.head())

print("Train & test weather:")
display(weather_train_df.head(), weather_test_df.head())

Edificios metadata:


Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,


Train & test:


Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01,0.0
1,1,0,2016-01-01,0.0
2,2,0,2016-01-01,0.0
3,3,0,2016-01-01,0.0
4,4,0,2016-01-01,0.0


Unnamed: 0,row_id,building_id,meter,timestamp
0,0,0,0,2017-01-01
1,1,1,0,2017-01-01
2,2,2,0,2017-01-01
3,3,3,0,2017-01-01
4,4,4,0,2017-01-01


Train & test weather:


Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.5,0.0,0.0
1,0,2016-01-01 01:00:00,24.40625,,21.09375,-1.0,1020.0,70.0,1.5
2,0,2016-01-01 02:00:00,22.796875,2.0,21.09375,0.0,1020.0,0.0,0.0
3,0,2016-01-01 03:00:00,21.09375,2.0,20.59375,0.0,1020.0,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.599609


Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2017-01-01 00:00:00,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
1,0,2017-01-01 01:00:00,17.796875,2.0,12.796875,0.0,1022.0,130.0,3.099609
2,0,2017-01-01 02:00:00,16.09375,0.0,12.796875,0.0,1022.0,140.0,3.099609
3,0,2017-01-01 03:00:00,17.203125,0.0,13.296875,0.0,1022.0,140.0,3.099609
4,0,2017-01-01 04:00:00,16.703125,2.0,13.296875,0.0,1022.5,130.0,2.599609


## Generate big dataset (Fusion)

In [8]:
# Remember the row counts so the joins can be checked afterwards: the submission
# is filled positionally later on, so test_df must keep exactly its original rows.
n_train_rows, n_test_rows = len(train_df), len(test_df)

# Attach the building metadata (keyed by building_id), then the weather readings
# of the building's site at the same timestamp. Left joins keep every meter row;
# rows without a match get NaN in the joined columns.
train_df = (train_df
            .merge(metadata_df, on='building_id', how='left')
            .merge(weather_train_df, on=['site_id', 'timestamp'], how='left'))
test_df = (test_df
           .merge(metadata_df, on='building_id', how='left')
           .merge(weather_test_df, on=['site_id', 'timestamp'], how='left'))

# A left join multiplies rows when the right-hand keys are not unique; fail
# loudly instead of silently misaligning predictions with submission rows.
assert len(train_df) == n_train_rows, "merge changed the number of train rows"
assert len(test_df) == n_test_rows, "merge changed the number of test rows"

display(train_df.head())
display(test_df.head())

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
1,1,0,2016-01-01,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
2,2,0,2016-01-01,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
3,3,0,2016-01-01,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
4,4,0,2016-01-01,0.0,0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.5,0.0,0.0


Unnamed: 0,row_id,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,0,2017-01-01,0,Education,7432,2008.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
1,1,1,0,2017-01-01,0,Education,2720,2004.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
2,2,2,0,2017-01-01,0,Education,5376,1991.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
3,3,3,0,2017-01-01,0,Education,23685,2002.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
4,4,4,0,2017-01-01,0,Education,116607,1975.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609


## ¿Qué atributos vamos a usar?

In [9]:
# Predictor columns: building descriptors and weather measurements. The order
# here is the column order of the matrices handed to the imputer and the model.
features = [
    'square_feet',
    'year_built',
    'air_temperature',
    'dew_temperature',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed',
    'precip_depth_1_hr',
    'floor_count',
]

# Plain NumPy matrices for train and test, same columns in the same order.
x = train_df.loc[:, features].values
test = test_df.loc[:, features].values

# Target on the log1p scale; selecting with a list keeps it 2-D (one column).
y = np.log1p(train_df.loc[:, ['meter_reading']].values)

## ¿Qué pasa con los NaNs?

In [10]:
# Mean imputation: NaNs are replaced by the mean of their column. The means are
# learned from the training matrix only; add_indicator=False means no extra
# "was missing" indicator columns are appended.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan, add_indicator=False)
imputer.fit(x)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [11]:
# Fill the NaNs of the training matrix using the fitted imputer.
x = imputer.transform(x)

In [12]:
# Fill the NaNs of the test matrix with the statistics learned from the training matrix.
test = imputer.transform(test)

## Regresión lineal

In [13]:
# Fit a linear regression on the imputed training matrix. The target y was
# built with np.log1p, so the model predicts on the log1p scale.
clf = LinearRegression(n_jobs=-1)
clf.fit(x, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

## Predicciones

In [14]:
# Undo the log1p target transform, then clamp at zero. A linear model can output
# negative log-scale values, which expm1 maps to negative meter readings; those
# are never closer to a non-negative true reading than 0 is.
preds = np.clip(np.expm1(clf.predict(test)), 0, None)
preds

array([[37.4138755 ],
       [35.73475152],
       [34.67226968],
       ...,
       [36.5585549 ],
       [41.9311274 ],
       [63.24548743]])

## Preparemos el envío

In [15]:
# Peek at the submission template that was loaded from sample_submission.csv.
sample_submission.head()

Unnamed: 0,row_id,meter_reading
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [17]:
# Overwrite the template's meter_reading column with the predictions. A NumPy
# array is assigned by position, not by row_id.
# NOTE(review): assumes test.csv rows are in the same order as sample_submission — confirm.
sample_submission['meter_reading'] = preds
sample_submission.head()

Unnamed: 0,row_id,meter_reading
0,0,37.413876
1,1,35.734752
2,2,34.67227
3,3,40.754263
4,4,68.036311


In [18]:
# Write the submission as a zip-compressed CSV in the working directory,
# leaving out the DataFrame index.
csv_fname = '1-regresion-lineal.csv.zip'
sample_submission.to_csv(csv_fname, compression='zip', index=False)

In [None]:
# LB (leaderboard) score obtained with this submission: 1.96