# Load Data

In [1]:
import gc
import os
import numpy as np
import pandas as pd
import subprocess
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

In [2]:
def check_fline(fpath):
    """Print the number of lines in a file without loading it into memory.

    The count is the number of newline characters in the file (the same
    definition ``wc -l`` uses) and is printed as ``"<count> <fpath>"``.
    The file is read in fixed-size binary chunks, so very large CSV files
    can be counted with flat memory use and without relying on an external
    ``wc`` executable being installed.

    Args:
    fpath: string. file path

    Returns:
    None

    Raises:
    OSError: if the file cannot be opened or read (e.g. FileNotFoundError).

    """
    chunk_size = 1 << 20  # read 1 MiB at a time
    n_lines = 0
    with open(fpath, 'rb') as fh:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            n_lines += chunk.count(b'\n')
    print('{} {}'.format(n_lines, fpath), flush=True)

In [3]:
# Print the line count of every raw CSV before loading it.
fs=['./data/train.csv', './data/test.csv', './data/weather_test.csv','./data/weather_train.csv']
# Plain loop: check_fline only prints, so collecting its None results in a
# list would just add a meaningless `[None, ...]` cell output.
for fpath in fs:
    check_fline(fpath)

20216101 ./data/train.csv
41697601 ./data/test.csv
277244 ./data/weather_test.csv
139774 ./data/weather_train.csv


[None, None, None, None]

In [4]:
# Load sample training data
# Load the train/test meter readings and their weather tables from CSV
# (read in the order: train, weather_train, test, weather_test)
df_train, df_train_weather, df_test, df_test_weather = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train.csv',
        './data/weather_train.csv',
        './data/test.csv',
        './data/weather_test.csv',
    )
)

In [6]:
# Show data shape
# Show (rows, columns) of each loaded table, one per line.
# A generator is used so no loop variable is left referencing a large frame.
print(
    *(table.shape for table in (df_train, df_train_weather, df_test, df_test_weather)),
    sep='\n',
)

(20216100, 4)
(139773, 9)
(41697600, 4)
(277243, 9)


In [10]:
# Preview the first five rows of the training meter readings
df_train.head(n=5)

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01 00:00:00,0.0
1,1,0,2016-01-01 00:00:00,0.0
2,2,0,2016-01-01 00:00:00,0.0
3,3,0,2016-01-01 00:00:00,0.0
4,4,0,2016-01-01 00:00:00,0.0


In [9]:
# Preview the first five rows of the training weather table
df_train_weather.head(n=5)

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.7,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.2,70.0,1.5
2,0,2016-01-01 02:00:00,22.8,2.0,21.1,0.0,1020.2,0.0,0.0
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.1,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6


In [11]:
def feat_value_count(df,colname):
    """Count how often each distinct value of a column occurs.

    Args
    df: data frame.
    colname: string. Name of the column whose values are counted

    Returns
    df_count: data frame with two columns, '<colname>_values' (the distinct
        values) and 'counts' (number of rows holding that value), sorted by
        'counts' in descending order.
    """
    counts = df[colname].value_counts()
    # Name both output columns explicitly instead of renaming the defaults:
    # the default labels produced by value_counts()/reset_index() changed in
    # pandas 2.0 ('index'/<colname> became <colname>/'count'), so a rename
    # mapping keyed on the old labels mislabels the result on newer pandas.
    df_count = counts.rename_axis(colname+'_values').reset_index(name='counts')
    return df_count

In [12]:
# Number of training rows recorded per building
feat_value_count(df_train, colname='building_id')

Unnamed: 0,building_id_values,counts
0,1298,35136
1,1249,35136
2,1301,35128
3,1241,35116
4,1296,35115
...,...,...
1444,783,2657
1445,420,2327
1446,53,1685
1447,604,1012


In [13]:
# Number of training rows per meter value
feat_value_count(df_train, colname='meter')

Unnamed: 0,meter_values,counts
0,0,12060910
1,1,4182440
2,2,2708713
3,3,1264037


In [14]:
# Number of weather rows per site
feat_value_count(df_train_weather, colname='site_id')

Unnamed: 0,site_id_values,counts
0,8,8784
1,0,8784
2,13,8783
3,4,8783
4,2,8783
5,10,8782
6,6,8782
7,9,8780
8,3,8780
9,14,8777
