In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Read the train and test CSVs from the Kaggle input mount into two frames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/widsdatathon2022/train.csv",
        "/kaggle/input/widsdatathon2022/test.csv",
    )
)

In [3]:
# (rows, columns) of the train and test frames right after loading.
df_train.shape, df_test.shape

In [4]:
# Summary statistics of the training frame.
df_train.describe()

In [5]:
# Summary statistics of site_eui, the column later used as the model target.
df_train["site_eui"].describe()

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
# Lift pandas' display limits so every column and every row is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [7]:
# One boolean per column: True when the column contains at least one null.
df_train.isnull().any()

In [8]:
### calculate the percentage of records missing for each column
def missing(dff):
    """Print the percentage of null entries in each column of ``dff``.

    Percentages are rounded to two decimals and listed from the most to the
    least incomplete column. Nothing is returned; the result is only printed.
    """
    null_share = dff.isnull().sum() * 100 / len(dff)
    ranked = null_share.round(2).sort_values(ascending=False)
    print(ranked)

# Report the per-column missing-value percentages for the training frame.
missing(df_train)

In [9]:
#scatter plot for numerical variables
def scatterplot(var):
    """Scatter ``var`` (x-axis) against site_eui (y-axis) for the training frame.

    Reads the module-level ``df_train``. The y-axis is limited to 0-1000.
    """
    pair = df_train[['site_eui', var]]
    pair.plot.scatter(x=var, y='site_eui', ylim=(0, 1000))
    
def boxplot(var):
    """Box plot of site_eui grouped by ``var`` for the training frame.

    Reads the module-level ``df_train``. Draws on a fresh 16x8 figure, limits
    the y-axis to 0-1000 and rotates the x tick labels by 90 degrees.
    """
    pair = df_train[['site_eui', var]]
    _, axes = plt.subplots(figsize=(16, 8))
    sns.boxplot(x=var, y="site_eui", data=pair, ax=axes)
    axes.axis(ymin=0, ymax=1000)
    plt.xticks(rotation=90)

# Scatter direction_max_wind_speed against site_eui (y-axis limited to 0-1000 by scatterplot).
scatterplot('direction_max_wind_speed')

In [10]:
# Remove the fog-day and wind direction/speed columns from both splits, in place.
wind_fog_cols = ['days_with_fog','direction_max_wind_speed','direction_peak_wind_speed','max_wind_speed']
for frame in (df_train, df_test):
    frame.drop(columns=wind_fog_cols, inplace=True)

In [11]:
# site_eui distribution per energy_star_rating value, before the rating is imputed.
boxplot("energy_star_rating")

In [12]:
# Replace missing energy_star_rating values with the constant 50 in both splits.
# The imputer is fitted once on the training split and only *applied* to the
# test split, so preprocessing is never re-fitted on test data.
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=50)
# The imputer works on 2-D input and returns an (n, 1) array; flatten it before
# assigning it back to a single column.
df_train['energy_star_rating'] = np.ravel(imputer.fit_transform(df_train[['energy_star_rating']]))
df_test['energy_star_rating'] = np.ravel(imputer.transform(df_test[['energy_star_rating']]))

In [13]:
# Summary statistics of energy_star_rating after imputation.
df_train['energy_star_rating'].describe()

In [14]:
# Same box plot as before, now including the imputed rating value.
boxplot("energy_star_rating")

In [15]:
# Scatter year_built against site_eui before the year column is cleaned.
scatterplot('year_built')

In [16]:
# Summary statistics of year_built in the training frame.
df_train['year_built'].describe()

In [17]:
# Count the distinct year_built values that are earlier than 1700.
df_train[df_train['year_built']<1700]['year_built'].value_counts()

In [18]:
# Clean year_built in both splits: values earlier than MIN_VALID_YEAR_BUILT and
# missing values are replaced by one shared fill value.
MIN_VALID_YEAR_BUILT = 1700

# The fill value is the mean over *valid* training years only. Taking the mean
# before removing the implausible values would let them pull the fill value
# down, and using per-split means would encode the same condition differently
# in train and test.
year_built_fill = df_train.loc[df_train['year_built'] >= MIN_VALID_YEAR_BUILT, 'year_built'].mean()

# NaN fails the `>=` comparison, so missing years take the fill value as well.
df_train['year_built'] = np.where(df_train['year_built'] >= MIN_VALID_YEAR_BUILT, df_train['year_built'], year_built_fill)
df_test['year_built'] = np.where(df_test['year_built'] >= MIN_VALID_YEAR_BUILT, df_test['year_built'], year_built_fill)


In [19]:
# Scatter year_built against site_eui again, after the year column was cleaned.
scatterplot('year_built')

In [20]:
# Correlation heatmap over the numeric columns of the training frame.
# The frame still contains string-typed columns at this point; newer pandas
# raises on them instead of skipping them, so restrict to numeric/bool columns
# explicitly (this works on old and new pandas alike).
corrmat = df_train.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

In [21]:
# Heatmap for the positional slice [5:41, 5:41] of the numeric correlation matrix.
# Non-numeric columns are excluded explicitly because newer pandas raises on
# them instead of skipping them.
# NOTE(review): the trailing .corr() correlates the *rows of the correlation
# matrix* with each other rather than the raw columns; the later sliced heatmap
# in this notebook does not do this. Kept as-is - confirm whether it is intended.
corrmat = df_train.select_dtypes(include=['number', 'bool']).corr().iloc[5:41,5:41].corr()
f, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

In [22]:
# Remove every column whose name matches 'max_temp' and then every column whose
# name matches 'min_temp', from both splits.
for pattern in ('max_temp', 'min_temp'):
    df_train = df_train.drop(columns=df_train.filter(regex=pattern).columns)
    df_test = df_test.drop(columns=df_test.filter(regex=pattern).columns)

In [23]:
# Correlation heatmap of the training frame after the min/max temperature
# columns were removed. String-typed columns are excluded explicitly because
# newer pandas raises on them instead of skipping them.
corrmat = df_train.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

In [24]:
# Monthly average-temperature columns, grouped three to a season.
winter_cols = ['january_avg_temp', 'february_avg_temp','december_avg_temp']
summer_cols = ['june_avg_temp', 'july_avg_temp','august_avg_temp']
fall_cols = ['september_avg_temp', 'october_avg_temp','november_avg_temp']
spring_cols = ['march_avg_temp', 'april_avg_temp','may_avg_temp']

# New feature name -> the monthly columns it averages (row-wise mean).
season_features = {
    'winter_avg_temp': winter_cols,
    'summer_avg_temp': summer_cols,
    'fall_avg_temp': fall_cols,
    'spring_avg_temp': spring_cols,
}

# Add the four seasonal means to both splits.
for frame in (df_train, df_test):
    for feature_name, month_cols in season_features.items():
        frame[feature_name] = frame[month_cols].mean(axis=1)

In [25]:
# Preview the training frame with the new seasonal average columns.
df_train.head()

In [26]:
# The monthly averages are now summarised by the seasonal means; drop them from both splits.
monthly_avg_cols = winter_cols + fall_cols + summer_cols + spring_cols
df_train = df_train.drop(columns=monthly_avg_cols)
df_test = df_test.drop(columns=monthly_avg_cols)

In [27]:
# Preview the training frame after the monthly average columns were dropped.
df_train.head()

In [28]:
# First rows where days_below_0F is zero while days_below_10F is non-zero.
df_train[(df_train['days_below_0F']==0) & (df_train['days_below_10F']!=0)].head()

In [29]:
# Heatmap for the positional slice [10:19, 10:19] of the numeric correlation
# matrix. String-typed columns are excluded explicitly because newer pandas
# raises on them instead of skipping them.
corrmat = df_train.select_dtypes(include=['number', 'bool']).corr().iloc[10:19,10:19]
f, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);


In [30]:
# Drop four of the day-count threshold columns and avg_temp from both splits.
redundant_climate_cols = ['days_below_20F','days_below_10F','days_above_100F','days_above_90F','avg_temp']
df_train = df_train.drop(columns=redundant_climate_cols)
df_test = df_test.drop(columns=redundant_climate_cols)

In [31]:
# Preview the training frame after the climate column drop.
df_train.head()

In [32]:
# Number of columns currently in the training frame.
len(df_train.columns)

In [33]:
# Number of columns currently in the test frame.
len(df_test.columns)

In [34]:
# site_eui distribution per facility_type (y-axis limited to 0-1000 by boxplot).
boxplot('facility_type')

In [35]:
# Distinct facility_type values present in the training frame.
df_train['facility_type'].unique()

In [36]:
# Facility types assigned to the "High" and "Mid" buckets; every other value
# (including missing ones) ends up in "Low".
high_emit_facilities = ['Data_Center', 'Laboratory']
mid_emit_facilities = [
    'Grocery_store_or_food_market',
    'Health_Care_Inpatient',
    'Health_Care_Uncategorized',
    'Health_Care_Outpatient_Uncategorized',
    'Public_Safety_Penitentiary',
    'Service_Vehicle_service_repair_shop',
]

# Add the three-level facility_type_LMH column to both splits.
for frame in (df_train, df_test):
    high_emit_type = frame['facility_type'].isin(high_emit_facilities)
    mid_emit_type = frame['facility_type'].isin(mid_emit_facilities)
    # "High" takes precedence over "Mid"; anything left over is "Low".
    frame['facility_type_LMH'] = np.where(high_emit_type, "High", np.where(mid_emit_type, "Mid", "Low"))


In [37]:
# df_train.groupby('facility_type_LMH').size()
# facility_type has been replaced by facility_type_LMH; remove the original column.
df_train = df_train.drop(columns='facility_type')
df_test = df_test.drop(columns='facility_type')

In [38]:
# Preview the training frame after facility_type was replaced by facility_type_LMH.
df_train.head()

In [39]:
# Column dtypes of the training frame, checked before the dummy encoding step.
df_train.dtypes

In [40]:
# Convert the categorical columns of each split into dummy (indicator) columns.
# NOTE(review): the two frames are encoded independently, so a category that
# occurs in only one split produces a column the other frame lacks - the
# resulting column sets should be compared before modelling.
df_train = pd.get_dummies(df_train)
df_test = pd.get_dummies(df_test)

In [41]:
# Preview the training frame after dummy encoding.
df_train.head()

In [42]:
# Histogram (with KDE overlay) and normal probability plot of the raw target.
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot
# with kde=True and stat="density" is its documented replacement.
sns.histplot(df_train['site_eui'], kde=True, stat="density", kde_kws=dict(cut=3));
# probplot draws onto the current figure, so open a new one for it.
fig = plt.figure()
res = stats.probplot(df_train['site_eui'], plot=plt)

In [43]:
# Replace site_eui with its natural logarithm. Everything downstream that uses
# this column (including the model target) is therefore on the log scale, and
# values on that scale map back to original units with np.exp.
# NOTE(review): re-running this cell applies the log a second time.
df_train['site_eui'] = np.log(df_train['site_eui'])

In [44]:
# Histogram (with KDE overlay) and normal probability plot of the
# log-transformed target.
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot
# with kde=True and stat="density" is its documented replacement.
sns.histplot(df_train['site_eui'], kde=True, stat="density", kde_kws=dict(cut=3));
# probplot draws onto the current figure, so open a new one for it.
fig = plt.figure()
res = stats.probplot(df_train['site_eui'], plot=plt)

In [45]:
# Preview the training frame after the target was log-transformed.
df_train.head()

In [46]:
# Preview the test frame for comparison with the training frame.
df_test.head()

In [47]:
# Target vector: the (already log-transformed) site_eui column.
y = df_train['site_eui']
y.head()

In [48]:
# The target now lives in `y`; remove it from the feature frame.
df_train = df_train.drop(columns='site_eui')

In [49]:
# Number of feature columns left in the training frame after removing the target.
len(df_train.columns)

In [50]:
#Train Data df_train
#Test Data df_test
#desired output y


In [51]:
# Preview the test frame before comparing its columns with the training frame.
df_test.head()

In [52]:
# Show the columns that exist in the training frame but not in the test frame.
train_only_cols = set(df_train.columns) - set(df_test.columns)
print(train_only_cols)

In [53]:
# Give both frames the same feature columns in the same order. Instead of
# hard-coding the single column that happened to be missing from the test
# split, keep every column the two frames share (in training-frame order), so
# the model is fitted and applied on an identical feature layout even if the
# set of mismatched dummy columns changes.
shared_features = [col for col in df_train.columns if col in df_test.columns]
df_train = df_train[shared_features]
df_test = df_test[shared_features]

In [54]:
from catboost import CatBoostRegressor
import random, os
# Hyper-parameters handed to CatBoostRegressor below.
MODEL_ITERATIONS = 28000          # passed as iterations
MODEL_RL = 0.025                  # passed as learning_rate
MODEL_MAX_DEPTH = 12              # passed as max_depth
MODEL_ESR = 10                    # passed as early_stopping_rounds
MODEL_LOSS_FUNCTION = 'RMSE'      # passed as loss_function
MODEL_EVAL_METRIC = 'RMSE'        # passed as eval_metric
MODEL_TASK_TYPE = 'GPU'           # passed as task_type
MODEL_VERBOSE = 1000              # passed as verbose

SEED = 2022


def seed_everything(seed=SEED):
    """Seed Python's ``random``, NumPy's global RNG and set PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to all three; defaults to ``SEED``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


seed_everything()


# Gather the regressor settings in one mapping, build the model and fit it on
# the full training frame against the log-scale target `y`.
# NOTE(review): early_stopping_rounds is set but fit() receives no eval_set -
# confirm whether early stopping can take effect here.
catboost_params = {
    'iterations': MODEL_ITERATIONS,
    'learning_rate': MODEL_RL,
    'max_depth': MODEL_MAX_DEPTH,
    'loss_function': MODEL_LOSS_FUNCTION,
    'eval_metric': MODEL_EVAL_METRIC,
    'early_stopping_rounds': MODEL_ESR,
    'task_type': MODEL_TASK_TYPE,
    'random_seed': SEED,
    'verbose': MODEL_VERBOSE,
}
model = CatBoostRegressor(**catboost_params)
model.fit(df_train, y)

In [65]:
# Predict site_eui for the test frame and load the sample submission file.
SAMPLE_SUBMISSION_PATH = "../input/widsdatathon2022/sample_solution.csv"
print(f"Testing Data Lenght{len(df_test)}")
# The model was fitted on log(site_eui), so its raw output is on the log scale.
# np.exp maps the predictions back to the original site_eui units; without it
# the submission would contain log values.
pred_test = np.exp(model.predict(df_test))
print(f"Prediction Data Lenght{len(pred_test)}")
sub = pd.read_csv(SAMPLE_SUBMISSION_PATH)

In [60]:
# Overwrite the sample file's site_eui column with the test predictions.
# Assumes the sample file's rows are in the same order as df_test - TODO confirm.
sub['site_eui']=pred_test

In [61]:
# Preview the submission frame before writing it out.
sub.head()

In [66]:
# Write the submission to the current working directory, without the index column.
sub.to_csv("submission.csv",index=False)