# Load Data

In [1]:
import gc
import os
import numpy as np
import pandas as pd
import subprocess
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

In [2]:
def check_fline(fpath):
    """Print the line count of a file as reported by ``wc -l``.

    The count is delegated to the external ``wc`` utility, so the file is
    never read into Python memory.

    Args:
    fpath: string. Path of the file whose lines are counted.

    Returns:
    None. The raw ``wc -l`` output (count followed by the path) is written
    to standard output.

    """
    completed = subprocess.run(['wc', '-l', fpath], stdout=subprocess.PIPE)
    report = str(completed.stdout, 'utf-8')
    # wc already terminates its output with a newline, so none is appended.
    print(report, end='', flush=True)

In [15]:
# Report the line count of every raw CSV file.
fs=['./data/train.csv', './data/test.csv', './data/weather_test.csv',
    './data/weather_train.csv','./data/building_metadata.csv']
# A plain loop is used because check_fline only prints and returns None;
# a list comprehension here would display a meaningless [None, ...] result.
for s in fs:
    check_fline(s)

20216101 ./data/train.csv
41697601 ./data/test.csv
277244 ./data/weather_test.csv
139774 ./data/weather_train.csv
1450 ./data/building_metadata.csv


[None, None, None, None, None]

In [16]:
# Load every CSV in full (no sampling): meter readings and weather for both
# the train and test files, plus the building metadata table.
df_train = pd.read_csv('./data/train.csv')
df_train_weather = pd.read_csv('./data/weather_train.csv')
df_test = pd.read_csv('./data/test.csv')
df_test_weather = pd.read_csv('./data/weather_test.csv')
df_building = pd.read_csv('./data/building_metadata.csv')

In [18]:
# Show the (rows, columns) shape of each loaded data frame, one per line.
# A plain loop is used because print returns None; a list comprehension here
# would display a meaningless [None, ...] result after the shapes.
for item in [df_train, df_train_weather, df_test, df_test_weather, df_building]:
    print(item.shape)

(20216100, 4)
(139773, 9)
(41697600, 4)
(277243, 9)
(1449, 6)


[None, None, None, None, None]

In [10]:
# Preview the first rows of the meter-reading training data.
df_train.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01 00:00:00,0.0
1,1,0,2016-01-01 00:00:00,0.0
2,2,0,2016-01-01 00:00:00,0.0
3,3,0,2016-01-01 00:00:00,0.0
4,4,0,2016-01-01 00:00:00,0.0


In [9]:
# Preview the first rows of the weather data loaded from weather_train.csv.
df_train_weather.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.7,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.2,70.0,1.5
2,0,2016-01-01 02:00:00,22.8,2.0,21.1,0.0,1020.2,0.0,0.0
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.1,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6


In [19]:
# Preview the first rows of the building metadata.
df_building.head()

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,


In [11]:
def feat_value_count(df,colname):
    """Count how often each distinct value occurs in one column.

    Args
    df: data frame.
    colname: string. Name of the column whose values are counted.

    Returns
    df_count: data frame with two columns, ``<colname>_values`` (the distinct
    values) and ``counts`` (their number of occurrences), ordered as
    ``Series.value_counts`` returns them (most frequent first).
    """
    # Name the index and the counts explicitly instead of renaming whatever
    # labels reset_index() happens to produce: pandas < 2.0 emits
    # ('index', colname) while pandas >= 2.0 emits (colname, 'count'), so a
    # rename keyed on the old labels mislabels the columns on newer versions.
    df_count = (
        df[colname]
        .value_counts()
        .rename_axis(colname + '_values')
        .reset_index(name='counts')
    )
    return df_count

In [12]:
# Number of training rows recorded for each building_id.
feat_value_count(df_train,'building_id')

Unnamed: 0,building_id_values,counts
0,1298,35136
1,1249,35136
2,1301,35128
3,1241,35116
4,1296,35115
...,...,...
1444,783,2657
1445,420,2327
1446,53,1685
1447,604,1012


In [13]:
# Number of training rows recorded for each meter value.
feat_value_count(df_train,'meter')

Unnamed: 0,meter_values,counts
0,0,12060910
1,1,4182440
2,2,2708713
3,3,1264037


Meter 0 (electricity) has by far the most records.

In [14]:
# Number of weather rows available for each site_id.
feat_value_count(df_train_weather,'site_id')

Unnamed: 0,site_id_values,counts
0,8,8784
1,0,8784
2,13,8783
3,4,8783
4,2,8783
5,10,8782
6,6,8782
7,9,8780
8,3,8780
9,14,8777


In [20]:
# Number of building-metadata rows in each primary_use category.
feat_value_count(df_building,'primary_use')

Unnamed: 0,primary_use_values,counts
0,Education,549
1,Office,279
2,Entertainment/public assembly,184
3,Public services,156
4,Lodging/residential,147
5,Other,25
6,Healthcare,23
7,Parking,22
8,Warehouse/storage,13
9,Manufacturing/industrial,12


In [21]:
# Count of non-null building_id entries per site, ordered by site_id.
df_building.groupby('site_id')['building_id'].agg('count')

site_id
0     105
1      51
2     135
3     274
4      91
5      89
6      44
7      15
8      70
9     124
10     30
11      5
12     36
13    154
14    102
15    124
Name: building_id, dtype: int64

In [22]:
# Building-metadata rows per site_id, most frequent site first.
feat_value_count(df_building,'site_id')

Unnamed: 0,site_id_values,counts
0,3,274
1,13,154
2,2,135
3,15,124
4,9,124
5,0,105
6,14,102
7,4,91
8,5,89
9,8,70
