# Set up

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import math

# Load data

In [4]:
# Read the training data; the path is relative to the notebook's working directory.
df = pd.read_csv("train.csv")
# Bare expression: displays the first five rows as the cell output.
df.head()

Unnamed: 0,Year_Factor,State_Factor,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,...,days_above_80F,days_above_90F,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,site_eui,id
0,1,State_1,Commercial,Grocery_store_or_food_market,61242.0,1942.0,11.0,2.4,36,50.5,...,14,0,0,0,1.0,1.0,1.0,,248.682615,0
1,1,State_1,Commercial,Warehouse_Distribution_or_Shipping_center,274000.0,1955.0,45.0,1.8,36,50.5,...,14,0,0,0,1.0,,1.0,12.0,26.50015,1
2,1,State_1,Commercial,Retail_Enclosed_mall,280025.0,1951.0,97.0,1.8,36,50.5,...,14,0,0,0,1.0,,1.0,12.0,24.693619,2
3,1,State_1,Commercial,Education_Other_classroom,55325.0,1980.0,46.0,1.8,36,50.5,...,14,0,0,0,1.0,,1.0,12.0,48.406926,3
4,1,State_1,Commercial,Warehouse_Nonrefrigerated,66000.0,1985.0,100.0,2.4,36,50.5,...,14,0,0,0,1.0,1.0,1.0,,3.899395,4


# Feature engineering

In [78]:
def clean(df, fitted=None):
    """Impute missing values and integer-encode categorical columns of ``df``.

    The frame is modified **in place** and the same object is returned, so a
    caller that keeps using its original variable sees the cleaned columns.

    Steps:
      * ``energy_star_rating`` and ``year_built``: NaN is replaced by the
        column mean.
      * ``bldg_class_ohe`` (new column): 0/1 indicator derived from
        ``building_class``. With the classes in sorted order, the first maps
        to 0 and the second to 1; NaN or unknown values map to 0.
      * ``State_Factor`` and ``facility_type``: replaced by integer codes
        assigned in order of first appearance; NaN or unknown values map to -1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the five source columns named above.
    fitted : dict, optional
        Shared fitting state. With the default ``None`` every mean and code
        table is derived from ``df`` itself. Pass the same dict to several
        calls (e.g. ``state = {}; clean(train, state); clean(test, state)``)
        so the first frame's means and code tables are reused for the later
        frames. The dict is filled in by the first call.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after modification.

    Raises
    ------
    ValueError
        If ``building_class`` has more than two distinct values, which cannot
        be represented by the single ``bldg_class_ohe`` column.
    """
    if fitted is None:
        fitted = {}

    # Mean imputation; a mean already stored in `fitted` takes precedence.
    for col in ('energy_star_rating', 'year_built'):
        key = ('mean', col)
        if key not in fitted:
            fitted[key] = df[col].mean()
        df[col] = df[col].fillna(fitted[key])
    # Note: days_with_fog is not imputed here.

    # Single indicator column, equivalent to get_dummies(drop_first=True) for
    # a two-class column but built explicitly so it is always one Series.
    key = ('classes', 'building_class')
    if key not in fitted:
        fitted[key] = sorted(df['building_class'].dropna().unique())
    classes = fitted[key]
    if len(classes) > 2:
        raise ValueError(
            "building_class has %d distinct values; a single indicator "
            "column can only encode two" % len(classes)
        )
    if len(classes) == 2:
        df['bldg_class_ohe'] = (df['building_class'] == classes[1]).astype(int)
    else:
        # Zero or one class present: every row belongs to the dropped level.
        df['bldg_class_ohe'] = 0

    # Integer codes in order of first appearance (what pd.factorize yields);
    # -1 marks missing values and categories absent from the code table.
    for col in ('State_Factor', 'facility_type'):
        key = ('codes', col)
        if key not in fitted:
            fitted[key] = {
                value: code
                for code, value in enumerate(pd.unique(df[col].dropna()))
            }
        df[col] = df[col].map(fitted[key]).fillna(-1).astype(int)

    return df

In [79]:
# clean() edits `df` in place and returns that same object, so `df_train`
# and `df` both refer to the cleaned frame from here on.
df_train = clean(df)

# Features and target

In [80]:
# Feature columns fed to the model, in the order the model sees them.
X_cols = [
    "year_built",
    "energy_star_rating",
    "bldg_class_ohe",
    "floor_area",
    "days_below_30F",
    "days_above_80F",
    "Year_Factor",
    "ELEVATION",
    "january_avg_temp",
    "january_min_temp",
    "july_avg_temp",
    "snowfall_inches",
    "heating_degree_days",
    "cooling_degree_days",
    "march_min_temp",
    "december_min_temp",
    "State_Factor",
    "facility_type",
]

# Name of the target column.
output = "site_eui"

In [81]:
# Build the feature matrix and target from the cleaned training frame
# (`df_train`) rather than from `df`, so this cell does not depend on
# clean() having mutated `df` as a side effect.
# NOTE: astype(int) truncates fractional feature values (e.g. 50.5 -> 50) and
# raises if any selected column still contains NaN.
X = df_train[X_cols].astype(int)
y = df_train[output]

# Train-validation split

In [82]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [83]:
# Fit a linear regression on the training split only; the validation rows stay unseen.
regr = LinearRegression()
# fit() is the last expression, so the cell displays the fitted estimator's repr.
regr.fit(X_train, y_train)

LinearRegression()

# Predict

In [84]:
# Predict site_eui for the held-out validation rows.
y_pred = regr.predict(X_test)
# Bare expression: displays the prediction array as the cell output.
y_pred

array([101.45069086,  57.32809308,  75.81298755, ...,  92.31424782,
       103.26124367,  54.07306161])

# Calculate performance

In [85]:
# Root-mean-squared error on the validation split. The square root is taken
# explicitly because mean_squared_error's `squared=` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on old and new versions.
rmse = math.sqrt(mean_squared_error(y_test, y_pred))

print(rmse)

52.705784682089586


# Predict on test set

In [19]:
# Read the competition test data; the path is relative to the notebook's working directory.
df_test = pd.read_csv("test.csv")

In [20]:
# NOTE(review): clean() is called here without any training-set state, so the
# fill means and the State_Factor / facility_type integer codes are derived
# from test.csv alone and may not line up with the encodings the model was
# fitted on.
df_test = clean(df_test)

In [21]:
# Same feature columns and integer cast as the training matrix X.
X_final = df_test[X_cols].astype(int)

In [22]:
# Predict site_eui for every test row with the fitted model `regr`.
y_final = regr.predict(X_final)

# Create Submission

In [23]:
# Pair each test-row id with its predicted site_eui.
data = {
    "id": list(df_test["id"]),
    "site_eui": list(y_final),
}

In [24]:
# Two-column frame (id, site_eui) built from the dict of equal-length lists.
submission = pd.DataFrame(data)

In [25]:
# Write the submission without the index column. The file name is hard-coded
# and must match the one passed to the kaggle command in the next cell.
submission.to_csv("submission_2022-01-13.csv", index=False)

In [26]:
# Shell command: upload the CSV to the widsdatathon2022 competition via the kaggle command-line tool.
!kaggle competitions submit widsdatathon2022 -f submission_2022-01-13.csv -m "sample submission"

100%|█████████████████████████████████████████| 229k/229k [00:01<00:00, 207kB/s]
Successfully submitted to WiDS Datathon 2022