In [171]:
#Data Cleaning and Organizing
import pandas as pd
import numpy as np
import math
import warnings
warnings.filterwarnings('ignore')


#pre processing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

#Data Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#Modeling
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [172]:
#each data set, not included in github
#Read in the same order as the names on the left-hand side
buildMetData, samSub, test, train, weatherTest, weatherTrain = (
    pd.read_csv(csvPath)
    for csvPath in (
        'energyCSV/building_metadata.csv',
        'energyCSV/sample_submission.csv',
        'energyCSV/test.csv',
        'energyCSV/train.csv',
        'energyCSV/weather_test.csv',
        'energyCSV/weather_train.csv',
    )
)

In [173]:
# Function to reduce the DF size
# Code from Kaggle user Koustav Banerjee

def reduce_mem_usage(df, verbose=True, allow_float16=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that fits.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column (int8, object, datetime, category, ...)
    is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the final size in Mb and the percentage saved.
    allow_float16 : bool, default True
        Permit float16 as a target for float columns. float16 keeps only about
        three significant decimal digits, so pass False to stop at float32
        when the values need more precision.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one whose range covers the
    # column wins.
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for target in int_targets:
                limits = np.iinfo(target)
                # Inclusive bounds: a column peaking at exactly 127 still fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf (all comparisons
                # false): fall back to float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [174]:
#Shrink the numeric dtypes of every frame; each call prints its own savings line
train, test, weatherTrain, weatherTest, buildMetData = (
    reduce_mem_usage(frame)
    for frame in (train, test, weatherTrain, weatherTest, buildMetData)
)

Mem. usage decreased to 289.19 Mb (53.1% reduction)
Mem. usage decreased to 596.49 Mb (53.1% reduction)
Mem. usage decreased to  3.07 Mb (68.1% reduction)
Mem. usage decreased to  6.08 Mb (68.1% reduction)
Mem. usage decreased to  0.03 Mb (60.3% reduction)


## Pre-processing

In [175]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping the calls in print() only appended a line of five "None"s.
for frame in (train, test, weatherTrain, weatherTest, buildMetData):
    frame.info()
del frame  # don't keep an extra reference to the last frame alive

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 4 columns):
 #   Column         Dtype  
---  ------         -----  
 0   building_id    int16  
 1   meter          int8   
 2   timestamp      object 
 3   meter_reading  float32
dtypes: float32(1), int16(1), int8(1), object(1)
memory usage: 289.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 4 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   row_id       int32 
 1   building_id  int16 
 2   meter        int8  
 3   timestamp    object
dtypes: int16(1), int32(1), int8(1), object(1)
memory usage: 596.5+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139773 entries, 0 to 139772
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   site_id             139773 non-null  int8   
 1   timestamp           139773 non-null  object 


The 'timestamp' columns in 'train', 'test', 'weatherTrain', 'weatherTest' - as well as the column 'primary_use' in buildMetData - are currently an object type. Need to change those formats into something more usable.

In [176]:
#Parse the string timestamps of the four time-indexed frames into datetimes
for frame in (train, test, weatherTrain, weatherTest):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])
del frame  # don't keep an extra reference to the last frame alive

#Store the building-use labels as a pandas categorical
buildMetData['primary_use'] = buildMetData['primary_use'].astype('category')

In [177]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping the calls in print() only appended a line of five "None"s.
for frame in (train, test, weatherTrain, weatherTest, buildMetData):
    frame.info()
del frame  # don't keep an extra reference to the last frame alive

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 4 columns):
 #   Column         Dtype         
---  ------         -----         
 0   building_id    int16         
 1   meter          int8          
 2   timestamp      datetime64[ns]
 3   meter_reading  float32       
dtypes: datetime64[ns](1), float32(1), int16(1), int8(1)
memory usage: 289.2 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 4 columns):
 #   Column       Dtype         
---  ------       -----         
 0   row_id       int32         
 1   building_id  int16         
 2   meter        int8          
 3   timestamp    datetime64[ns]
dtypes: datetime64[ns](1), int16(1), int32(1), int8(1)
memory usage: 596.5 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139773 entries, 0 to 139772
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------

Much better. As the data currently stands, 'timestamp' represents a single moment in time: a specific hour of a specific day. It can be broken into further categories: 'hour', 'day', 'weekday' (stored below in the column named 'weekend'), & 'month'. This recategorizing could help us find trends in specific months, days, or even hours.

In [178]:
# Look at `train` before the calendar columns are derived from 'timestamp'
train

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01 00:00:00,0.000000
1,1,0,2016-01-01 00:00:00,0.000000
2,2,0,2016-01-01 00:00:00,0.000000
3,3,0,2016-01-01 00:00:00,0.000000
4,4,0,2016-01-01 00:00:00,0.000000
...,...,...,...,...
20216095,1444,0,2016-12-31 23:00:00,8.750000
20216096,1445,0,2016-12-31 23:00:00,4.825000
20216097,1446,0,2016-12-31 23:00:00,0.000000
20216098,1447,0,2016-12-31 23:00:00,159.574997


In [179]:
#Split each timestamp into calendar parts on both frames.
#Note: "weekend" stores .dt.weekday (the day-of-week number), not a weekend flag.
for frame in (train, test):
    stamp = frame["timestamp"].dt
    frame["hour"] = stamp.hour
    frame["day"] = stamp.day
    frame["weekend"] = stamp.weekday
    frame["month"] = stamp.month
del frame, stamp  # don't keep extra references to the big frames alive

In [180]:
# Check that the hour / day / weekend / month columns were added
train

Unnamed: 0,building_id,meter,timestamp,meter_reading,hour,day,weekend,month
0,0,0,2016-01-01 00:00:00,0.000000,0,1,4,1
1,1,0,2016-01-01 00:00:00,0.000000,0,1,4,1
2,2,0,2016-01-01 00:00:00,0.000000,0,1,4,1
3,3,0,2016-01-01 00:00:00,0.000000,0,1,4,1
4,4,0,2016-01-01 00:00:00,0.000000,0,1,4,1
...,...,...,...,...,...,...,...,...
20216095,1444,0,2016-12-31 23:00:00,8.750000,23,31,5,12
20216096,1445,0,2016-12-31 23:00:00,4.825000,23,31,5,12
20216097,1446,0,2016-12-31 23:00:00,0.000000,23,31,5,12
20216098,1447,0,2016-12-31 23:00:00,159.574997,23,31,5,12


Neat-o. Next, merging train & test with buildMetData and their respective weather DataFrames so that train and test have all the information to train on.

In [181]:
#NOTE(review): the recorded run of this cell ended in a MemoryError.

#Train: attach building metadata by building_id, then weather by site and timestamp
trainBuild = pd.merge(train, buildMetData, how='left', on=['building_id'])
train = pd.merge(trainBuild, weatherTrain, how='left', on=['site_id','timestamp'])

#Test: the same two left joins, using the test-period weather
testBuild = pd.merge(test, buildMetData, how='left', on=['building_id'])
test = pd.merge(testBuild, weatherTest, how='left', on=['site_id','timestamp'])

MemoryError: Unable to allocate 318. MiB for an array with shape (41697600,) and data type int64

In [None]:
# Inspect `train` after the merge attempt
# NOTE(review): if this line shares a cell with the statements below, a bare
# expression that is not the cell's last line displays nothing - confirm.
train

#Reload every CSV from disk, in the same order as the names on the left-hand side
buildMetData, samSub, test, train, weatherTest, weatherTrain = (
    pd.read_csv(csvPath)
    for csvPath in (
        'energyCSV/building_metadata.csv',
        'energyCSV/sample_submission.csv',
        'energyCSV/test.csv',
        'energyCSV/train.csv',
        'energyCSV/weather_test.csv',
        'energyCSV/weather_train.csv',
    )
)

#Parse the string timestamps of the four time-indexed frames into datetimes
for frame in (train, test, weatherTrain, weatherTest):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])
del frame  # don't keep an extra reference to the last frame alive
#Store the building-use labels as a pandas categorical
buildMetData['primary_use'] = buildMetData['primary_use'].astype('category')

#Split each timestamp into calendar parts on both frames.
#Note: "weekend" stores .dt.weekday (the day-of-week number), not a weekend flag.
for frame in (train, test):
    stamp = frame["timestamp"].dt
    frame["hour"] = stamp.hour
    frame["day"] = stamp.day
    frame["weekend"] = stamp.weekday
    frame["month"] = stamp.month
del frame, stamp  # don't keep extra references to the big frames alive

#Building metadata: merge on the key column alone, drop the key from the result,
#then append the looked-up columns side by side with the full frame.
temp_df = train[['building_id']].merge(buildMetData, on=['building_id'], how='left')
del temp_df['building_id']
trainEx = pd.concat([train, temp_df], axis=1)

#Same steps for the test rows
temp_df = test[['building_id']].merge(buildMetData, on=['building_id'], how='left')
del temp_df['building_id']
testEx = pd.concat([test, temp_df], axis=1)

del temp_df

# Weather: merge on the (site_id, timestamp) keys alone, drop the keys from the
# result, then append the weather columns to the metadata-enriched frame.
temp_df = trainEx[['site_id','timestamp']]
temp_df = temp_df.merge(weatherTrain, on=['site_id','timestamp'], how='left')

del temp_df['site_id'], temp_df['timestamp']
# Concatenate onto trainEx (not train): train lacks the building metadata
# columns, including the site_id used as the merge key just above.
trainEx = pd.concat([trainEx, temp_df], axis=1)

temp_df = testEx[['site_id','timestamp']]
temp_df = temp_df.merge(weatherTest, on=['site_id','timestamp'], how='left')

del temp_df['site_id'], temp_df['timestamp']
# Same for test: extend testEx so its metadata columns are kept.
testEx = pd.concat([testEx, temp_df], axis=1)

del temp_df

# NOTE(review): the cells that follow read `train` / `test` (for example
# train['primary_use']), not trainEx / testEx - confirm which frames are meant
# to feed the model.

There are a couple of things that need to happen before we can call this data 'clean enough':

    - Label encode 'primary_use'
    - drop 'timestamp', 'row_id', & 'site_id' as they are now unnecessary
    - drop 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed', & 'floor_count' as they are either
    unnecessary or will skew the data more than they are worth fixing.

In [None]:
#'primary_use' needs to be label encoded to run a model
le = LabelEncoder()

# Learn the label -> integer mapping once, from train ...
train['primary_use'] = le.fit_transform(train['primary_use']).astype(np.int8)
# ... and reuse it for test. Re-fitting on test would renumber the labels
# whenever its set of categories differs from train's, so the same integer
# could mean different building uses in the two frames. transform() raises
# ValueError on a label it has not seen, so a mismatch fails loudly instead.
test['primary_use'] = le.transform(test['primary_use']).astype(np.int8)

In [None]:
#Taking a quick look at the full data before we remove some things
#(the trailing semicolon hides the repr of the returned array of axes)
train.hist(figsize=(20,20), bins=20);

In [None]:
# Inspect `train` before the unneeded columns are dropped
train

In [None]:
#Columns removed from both frames before modelling
dropCols = [
    'precip_depth_1_hr',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed',
    'site_id',
    'floor_count',
    'cloud_coverage',
]

train = train.drop(columns = dropCols)
#test additionally loses its submission row identifier
test = test.drop(columns = dropCols + ['row_id'])

In [None]:
#Data Prep
scaler = StandardScaler()

cats = ["building_id", "primary_use", "hour", "day", "weekend", "month", "meter"]
# 'cloud_coverage' is not listed here: the dropCols cell removes it from
# `train`, so selecting it would raise a KeyError.
nums = ["square_feet", "year_built", "air_temperature", "dew_temperature"]
featCols = cats + nums

#Hold out the last of `num_folds` contiguous folds of `train` for validation
num_folds = 5
# random_state is omitted on purpose: it has no effect without shuffling, and
# recent scikit-learn releases raise ValueError when it is set with shuffle=False.
kf = KFold(n_splits = num_folds, shuffle = False)
for i, (trainIndex, testIndex) in enumerate(kf.split(train)):
    # Skip every fold except the final one.
    if i + 1 < num_folds:
        continue
    print(trainIndex.max(), testIndex.min())

    xTrain = train[featCols].iloc[trainIndex]
    xTest = train[featCols].iloc[testIndex]
    # The labels live in `train`; the `test` frame has no meter_reading column
    # and its rows do not line up with these fold indices.
    yTrain = train["meter_reading"].iloc[trainIndex]
    yTest = train["meter_reading"].iloc[testIndex]


# Fit the scaler on the training fold only, then apply it to the held-out fold.
xTrainScaled = scaler.fit_transform(xTrain)
xTestScaled = scaler.transform(xTest)

In [None]:
# meter_reading is a continuous float column, so this is a regression problem:
# a classifier such as LogisticRegression cannot be fitted to it, and
# accuracy / F1 / precision / recall do not apply. A LightGBM regressor is used
# instead (lightgbm and mean_squared_error are already imported at the top).
regModel = lgb.LGBMRegressor(random_state = 123)
regModel.fit(xTrainScaled, yTrain)

yPredTrain = regModel.predict(xTrainScaled)
yPredTest = regModel.predict(xTestScaled)

# Root-mean-squared error on both folds, in the units of meter_reading
# (lower is better).
print(f"RMSE Train Score: {np.sqrt(mean_squared_error(yTrain, yPredTrain))}")
print(f"RMSE Test Score: {np.sqrt(mean_squared_error(yTest, yPredTest))}")

In [None]:
# Final look at the `train` frame
train