In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import datetime
import gc

## Train dataset

In [2]:
# Load the training table from the local file train_cleaned.csv.
train_df = pd.read_csv('train_cleaned.csv')

In [3]:
# Preview the first rows of the freshly loaded training frame.
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count
0,105,0,2016-01-01 00:00:00,23.3036,1,Education,50623,,5.0
1,106,0,2016-01-01 00:00:00,0.3746,1,Education,5374,,4.0
2,106,3,2016-01-01 00:00:00,0.0,1,Education,5374,,4.0
3,107,0,2016-01-01 00:00:00,175.184,1,Education,97532,2005.0,10.0
4,108,0,2016-01-01 00:00:00,91.2653,1,Education,81580,1913.0,5.0


In [4]:
def building_preprocess(df, encoder=None):
    """Drop unused building columns and label-encode ``primary_use`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``primary_use`` column. It is modified in place:
        ``year_built`` and ``floor_count`` are removed when present and
        ``primary_use`` is replaced by integer codes.
    encoder : optional
        An already fitted encoder exposing ``transform`` (for example a
        ``LabelEncoder`` fitted on the training ``primary_use`` values).
        When given, it is applied as-is so that several frames share one
        category -> code mapping. When omitted, a new ``LabelEncoder`` is
        fitted on this frame only, which is the original behaviour.

    Returns
    -------
    None
        The frame is changed in place.
    """
    # errors="ignore" keeps the cell re-runnable: a second call on a frame
    # whose columns were already dropped no longer raises KeyError.
    df.drop(columns=["year_built", "floor_count"], errors="ignore", inplace=True)

    if encoder is None:
        # NOTE: codes produced by a per-frame fit are only comparable between
        # frames that contain the same set of primary_use values.
        encoder = LabelEncoder().fit(df["primary_use"])

    df["primary_use"] = encoder.transform(df["primary_use"])

In [5]:
# Mutates train_df in place: removes year_built / floor_count and replaces
# primary_use with integer codes.
building_preprocess(train_df)

In [6]:
# Inspect the training frame after building_preprocess.
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet
0,105,0,2016-01-01 00:00:00,23.3036,1,0,50623
1,106,0,2016-01-01 00:00:00,0.3746,1,0,5374
2,106,3,2016-01-01 00:00:00,0.0,1,0,5374
3,107,0,2016-01-01 00:00:00,175.184,1,0,97532
4,108,0,2016-01-01 00:00:00,91.2653,1,0,81580


### Weather metadata

In [7]:
# Load the weather table from the local file weather_train.csv.
weather_df = pd.read_csv('weather_train.csv')

In [8]:
# Preview the first rows of the raw weather table.
weather_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.7,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.2,70.0,1.5
2,0,2016-01-01 02:00:00,22.8,2.0,21.1,0.0,1020.2,0.0,0.0
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.1,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6


In [9]:
def weather_preprocess(df):
    """Drop unused weather columns and interpolate gaps separately per site.

    Parameters
    ----------
    df : pandas.DataFrame
        Weather table with a ``site_id`` column plus the columns
        ``sea_level_pressure``, ``wind_direction`` and ``wind_speed``.
        The input frame is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame without the three dropped columns, keeping the original
        row index and column order. In every numeric column other than
        ``site_id``, missing values are interpolated within each ``site_id``
        group (``limit_direction="both"``, so leading and trailing gaps are
        filled as well). A column that is entirely missing for a site stays
        missing for that site.

    Notes
    -----
    Rows are interpolated in their existing order; this assumes each site's
    rows are already in time order -- TODO confirm against the input file.
    """
    # drop() without inplace returns a new frame, so the caller's data is not
    # modified as a side effect.
    df = df.drop(columns=["sea_level_pressure", "wind_direction", "wind_speed"])

    # Only numeric measurement columns are interpolated; the grouping key and
    # non-numeric columns (e.g. a textual timestamp) are passed through.
    value_cols = [
        col for col in df.select_dtypes(include="number").columns
        if col != "site_id"
    ]
    if value_cols:
        # transform() returns a result aligned to the original index, which
        # avoids the index / grouping-column side effects of groupby.apply.
        df[value_cols] = df.groupby("site_id")[value_cols].transform(
            lambda series: series.interpolate(limit_direction="both")
        )
    return df

In [10]:
# Rebind weather_df to the frame returned by weather_preprocess.
weather_df = weather_preprocess(weather_df)

In [11]:
# Count the missing values left in each column after interpolation.
weather_df.isnull().sum()

site_id                  0
timestamp                0
air_temperature          0
cloud_coverage       17228
dew_temperature          0
precip_depth_1_hr    26273
dtype: int64

In [12]:
# Author's note: sites 1, 5, 12 -> precip_depth_1_hr; sites 7, 11 -> cloud_coverage.
# For every site, report how many of its rows still lack cloud_coverage.
site_ids = weather_df['site_id'].unique()
for site_id in site_ids:
    site_rows = weather_df[weather_df['site_id'] == site_id]
    null_sum = site_rows['cloud_coverage'].isnull().sum()
    if null_sum == 0:
        # Nothing missing for this site: no line is printed.
        continue
    rows = len(site_rows)
    print('site_id {}, rows {}, rows of null {}'.format(site_id, rows, null_sum))

site_id 7, rows 8614, rows of null 8614
site_id 11, rows 8614, rows of null 8614


### Merge datasets

In [13]:
# Left join: every meter-reading row is kept; rows whose (site_id, timestamp)
# pair has no weather record receive NaN in the weather columns.
train_df = pd.merge(
    train_df,
    weather_df,
    how='left',
    on=['site_id', 'timestamp'],
)

In [14]:
# Parse the timestamp column into datetimes (done after the merge above).
train_df['timestamp'] = pd.to_datetime(
    train_df['timestamp'],
    format='%Y-%m-%d %H:%M:%S',
)

In [15]:
# Replace the target by log(1 + meter_reading). The ufunc is applied to the
# whole column at once instead of element by element through Series.map,
# matching how square_feet is transformed in add_timebase_features.
# NOTE: not idempotent -- re-running this cell applies log1p a second time.
train_df["meter_reading"] = np.log1p(train_df["meter_reading"])

In [16]:
# Inspect the merged frame with the log-transformed meter_reading.
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr
0,105,0,2016-01-01,3.190624,1,0,50623,3.8,0.0,2.4,
1,106,0,2016-01-01,0.318163,1,0,5374,3.8,0.0,2.4,
2,106,3,2016-01-01,0.0,1,0,5374,3.8,0.0,2.4,
3,107,0,2016-01-01,5.171529,1,0,97532,3.8,0.0,2.4,
4,108,0,2016-01-01,4.524668,1,0,81580,3.8,0.0,2.4,


In [17]:
# weather_df is not used after the merge; drop the name and run a garbage
# collection pass (the cell displays gc.collect()'s return value).
del weather_df
gc.collect()

76

### Creating time-based features

In [18]:
def add_timebase_features(df):
    """Add hour / weekday columns and log-scale ``square_feet`` in place.

    ``df`` must have a datetime ``timestamp`` column and a ``square_feet``
    column. The frame is modified in place and also returned:

    * ``hour``        - hour of day taken from ``timestamp``
    * ``weekend``     - despite its name, the value of ``timestamp.dt.weekday``
    * ``square_feet`` - overwritten with ``log1p(square_feet)``
    """
    stamp = df["timestamp"].dt
    df["hour"] = stamp.hour
    df["weekend"] = stamp.weekday
    df["square_feet"] = df["square_feet"].pipe(np.log1p)
    return df

In [19]:
# Adds the hour / weekend columns. add_timebase_features also overwrites
# square_feet with log1p(square_feet), so re-running this cell would
# transform square_feet a second time.
train_df = add_timebase_features(train_df)

In [20]:
# Inspect the first rows with the new hour / weekend columns.
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,hour,weekend
0,105,0,2016-01-01,3.190624,1,0,10.832181,3.8,0.0,2.4,,0,4
1,106,0,2016-01-01,0.318163,1,0,8.589514,3.8,0.0,2.4,,0,4
2,106,3,2016-01-01,0.0,1,0,8.589514,3.8,0.0,2.4,,0,4
3,107,0,2016-01-01,5.171529,1,0,11.487946,3.8,0.0,2.4,,0,4
4,108,0,2016-01-01,4.524668,1,0,11.309352,3.8,0.0,2.4,,0,4


In [21]:
# Inspect the last rows as well.
train_df.tail()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,hour,weekend
1995,1381,1,2016-01-01,1.237591,15,6,11.345879,,,,,0,4
1996,1381,2,2016-01-01,7.330169,15,6,11.345879,,,,,0,4
1997,1382,0,2016-01-01,5.900993,15,9,12.401117,,,,,0,4
1998,1382,2,2016-01-01,7.166961,15,9,12.401117,,,,,0,4
1999,1383,0,2016-01-01,4.484414,15,0,11.567537,,,,,0,4


In [22]:
# Remove the columns that are not written to the training files.
train_df.drop(
    columns=['timestamp', 'building_id', 'site_id'],
    inplace=True,
)
gc.collect()

29

In [23]:
# Split plan: training = 70% of the rows, validation = the remaining 30%.
# Shuffle the dataset first, with a fixed seed so the order is reproducible.
np.random.seed(5)
shuffled_labels = list(train_df.index)
np.random.shuffle(shuffled_labels)
train_df = train_df.loc[shuffled_labels]

In [24]:
# `train` is the positional cut-off: the first 70% of the shuffled rows.
rows = len(train_df)
train = int(rows * .7)

In [25]:
# Inspect the first rows after shuffling.
train_df.head()

Unnamed: 0,meter,meter_reading,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,hour,weekend
51,3,4.615121,6,9.562405,3.8,0.0,2.4,,0,4
1327,0,5.516895,6,11.719313,-8.3,8.0,-12.2,-1.0,0,4
1488,2,7.665664,6,12.427046,-8.3,8.0,-12.2,-1.0,0,4
1432,1,4.2142,0,11.960709,-8.3,8.0,-12.2,-1.0,0,4
417,0,3.128951,9,9.489864,10.0,8.0,2.2,0.0,0,4


In [26]:
# Column order used for the header-less training / validation files;
# meter_reading is listed first.
columns = [
    'meter_reading',
    'meter',
    'primary_use',
    'square_feet',
    'air_temperature',
    'cloud_coverage',
    'dew_temperature',
    'precip_depth_1_hr',
    'hour',
    'weekend',
]

In [27]:
# Rows before the cut-off -> training file (no header row, no index column).
train_df.iloc[:train].to_csv(
    'ashrae_train.csv',
    columns=columns,
    header=False,
    index=False,
)

In [28]:
# Rows from the cut-off onwards -> validation file (no header row, no index column).
train_df.iloc[train:].to_csv(
    'ashrae_validation.csv',
    columns=columns,
    header=False,
    index=False,
)

## Test dataset

In [29]:
# Load the test table from the local file test.csv.
test_df = pd.read_csv('test.csv')

In [30]:
# Attach the building attributes to every test row (left join on building_id).
building_df = pd.read_csv('building_metadata.csv')
test_df = pd.merge(
    test_df,
    building_df,
    how='left',
    on='building_id',
)

In [31]:
# Mutates test_df in place (drops year_built / floor_count, encodes primary_use).
# NOTE(review): building_preprocess fits a new LabelEncoder on the frame it is
# given, so these codes match the training codes only if both frames contain
# the same set of primary_use values -- verify.
building_preprocess(test_df)

In [32]:
# Inspect the test frame after building_preprocess.
test_df.head()

Unnamed: 0,row_id,building_id,meter,timestamp,site_id,primary_use,square_feet
0,0,0,0,2017-01-01 00:00:00,0,0,7432
1,1,1,0,2017-01-01 00:00:00,0,0,2720
2,2,2,0,2017-01-01 00:00:00,0,0,5376
3,3,3,0,2017-01-01 00:00:00,0,0,23685
4,4,4,0,2017-01-01 00:00:00,0,0,116607


In [34]:
# Load weather_test.csv and run it through the same weather_preprocess
# function that was applied to the training weather table.
weather_test_df = pd.read_csv('weather_test.csv')
weather_test_df = weather_preprocess(weather_test_df)

In [35]:
# Left join the weather columns onto the test rows, then parse the timestamp
# column into datetimes (same order of operations as for the training frame).
test_df = pd.merge(
    test_df,
    weather_test_df,
    how='left',
    on=['site_id', 'timestamp'],
)
test_df['timestamp'] = pd.to_datetime(
    test_df['timestamp'],
    format='%Y-%m-%d %H:%M:%S',
)

In [36]:
# Both helper frames have been merged into test_df; release them.
del weather_test_df, building_df
gc.collect()

42

In [37]:
# Same feature step and column removal as for the training frame.
test_df = add_timebase_features(test_df)
test_df.drop(
    columns=['timestamp', 'building_id', 'site_id'],
    inplace=True,
)

In [38]:
# Inspect the final test frame.
test_df.head()

Unnamed: 0,row_id,meter,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,hour,weekend
0,0,0,0,8.913685,17.8,4.0,11.7,0.0,0,6
1,1,0,0,7.908755,17.8,4.0,11.7,0.0,0,6
2,2,0,0,8.589886,17.8,4.0,11.7,0.0,0,6
3,3,0,0,10.072639,17.8,4.0,11.7,0.0,0,6
4,4,0,0,11.666573,17.8,4.0,11.7,0.0,0,6


In [40]:
# Write every column of test_df without the index. Unlike the training and
# validation files, header=False is not passed here, so a header row is written.
test_df.to_csv('ashrae_test.csv', index=False)

In [41]:
# All CSV files are written; release both frames.
del train_df, test_df
gc.collect()

98

In [43]:
# Write the column list: the names in `columns`, comma-separated on a single
# line with no trailing newline. This records the column order of the
# header-less training / validation CSV files.
with open('ashrae_train_column_list.txt', 'w') as f:
    f.write(','.join(columns))

## Upload data files to S3

In [48]:
import boto3

In [44]:
# Specify your bucket name
bucket_name = 'lifa08-ml-sagemaker'

training_folder = r'ashrae/training/'
validation_folder = r'ashrae/validation/'
test_folder = r'ashrae/test/'

In [45]:
def write_to_s3(filename, bucket, key):
    """Upload the local file ``filename`` to ``bucket`` under object key ``key``.

    The file is opened in binary mode and streamed with ``upload_fileobj``;
    whatever that call returns is passed back to the caller.
    """
    with open(filename, 'rb') as payload:
        target = boto3.Session().resource('s3').Bucket(bucket).Object(key)
        return target.upload_fileobj(payload)

In [49]:
# Upload each generated file under its folder prefix, keeping the file name
# as the last part of the object key.
for local_name, folder in (
    ('ashrae_train.csv', training_folder),
    ('ashrae_validation.csv', validation_folder),
    ('ashrae_test.csv', test_folder),
):
    write_to_s3(local_name, bucket_name, folder + local_name)