In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error

from xgboost import XGBRegressor
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/widsdatathon2022/train.csv
/kaggle/input/widsdatathon2022/test.csv
/kaggle/input/widsdatathon2022/sample_solution.csv


In [2]:
from sklearn.model_selection import train_test_split

# Read the data
X = pd.read_csv("/kaggle/input/widsdatathon2022/train.csv")
X_test_full = pd.read_csv("/kaggle/input/widsdatathon2022/test.csv")

X_test_full.head()

Unnamed: 0,Year_Factor,State_Factor,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,...,days_below_0F,days_above_80F,days_above_90F,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,id
0,7,State_1,Commercial,Grocery_store_or_food_market,28484.0,1994.0,37.0,2.4,38,50.596774,...,0,29,5,2,0,,,,,75757
1,7,State_1,Commercial,Grocery_store_or_food_market,21906.0,1961.0,55.0,45.7,38,50.596774,...,0,29,5,2,0,,,,,75758
2,7,State_1,Commercial,Grocery_store_or_food_market,16138.0,1950.0,1.0,59.1,38,50.596774,...,0,29,5,2,0,,,,,75759
3,7,State_1,Commercial,Grocery_store_or_food_market,97422.0,1971.0,34.0,35.4,38,50.596774,...,0,29,5,2,0,,,,,75760
4,7,State_1,Commercial,Grocery_store_or_food_market,61242.0,1942.0,35.0,1.8,38,50.596774,...,0,29,5,2,0,340.0,330.0,22.8,126.0,75761


In [3]:
# Using Mutual Information
def make_mi_scores(X, y):
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # All discrete features should now have integer dtypes
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores


def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")

In [4]:
# Remove rows with missing target, separate target from predictors
drop_cols = ['january_min_temp', 'january_avg_temp', 'january_max_temp','february_min_temp', 'february_avg_temp',
             'february_max_temp','march_min_temp', 'march_avg_temp', 'march_max_temp', 'april_min_temp',
             'april_avg_temp', 'april_max_temp', 'may_min_temp', 'may_avg_temp','may_max_temp', 'june_min_temp',
             'june_avg_temp', 'june_max_temp','july_min_temp', 'july_avg_temp', 'july_max_temp', 'august_min_temp',
             'august_avg_temp', 'august_max_temp', 'september_min_temp','september_avg_temp', 'september_max_temp',
             'october_min_temp','october_avg_temp', 'october_max_temp', 'november_min_temp','november_avg_temp',
             'november_max_temp', 'december_min_temp','december_avg_temp', 'december_max_temp', 'days_below_0F',
             'days_below_10F','days_above_100F', 'days_above_110F', 'direction_max_wind_speed','max_wind_speed',
             'Year_Factor','State_Factor', 'precipitation_inches' ]

X = X.drop(drop_cols, axis = 1)
X.drop(['site_eui'], axis=1, inplace=True)

X_test_full = X_test_full.drop(drop_cols, axis = 1)

X.head(10)


Unnamed: 0,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,cooling_degree_days,heating_degree_days,snowfall_inches,snowdepth_inches,avg_temp,days_below_30F,days_below_20F,days_above_80F,days_above_90F,direction_peak_wind_speed,days_with_fog,id
0,Commercial,Grocery_store_or_food_market,61242.0,1942.0,11.0,2.4,115,2960,0.0,0,56.972603,0,0,14,0,1.0,,0
1,Commercial,Warehouse_Distribution_or_Shipping_center,274000.0,1955.0,45.0,1.8,115,2960,0.0,0,56.972603,0,0,14,0,,12.0,1
2,Commercial,Retail_Enclosed_mall,280025.0,1951.0,97.0,1.8,115,2960,0.0,0,56.972603,0,0,14,0,,12.0,2
3,Commercial,Education_Other_classroom,55325.0,1980.0,46.0,1.8,115,2960,0.0,0,56.972603,0,0,14,0,,12.0,3
4,Commercial,Warehouse_Nonrefrigerated,66000.0,1985.0,100.0,2.4,115,2960,0.0,0,56.972603,0,0,14,0,1.0,,4
5,Commercial,Warehouse_Selfstorage,119900.0,1956.0,,2.4,115,2960,0.0,0,56.972603,0,0,14,0,1.0,,5
6,Commercial,Warehouse_Nonrefrigerated,91367.0,1982.0,56.0,2.4,115,2960,0.0,0,56.972603,0,0,14,0,1.0,,6
7,Commercial,Warehouse_Nonrefrigerated,50422.0,1947.0,99.0,2.4,115,2960,0.0,0,56.972603,0,0,14,0,1.0,,7
8,Commercial,Office_Uncategorized,122020.0,1929.0,98.0,2.4,115,2960,0.0,0,56.972603,0,0,14,0,1.0,,8
9,Commercial,Office_Uncategorized,102612.0,1979.0,83.0,2.4,115,2960,0.0,0,56.972603,0,0,14,0,1.0,,9


In [5]:
# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.7, test_size=0.3,
                                                                random_state=1)

NameError: name 'y' is not defined

In [None]:
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

In [None]:
# Select numeric columns
numeric_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]
# Keep selected columns only
my_cols = low_cardinality_cols + numeric_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [None]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[low_cardinality_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
OH_cols_test.index = X_test.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(low_cardinality_cols, axis=1)
num_X_valid = X_valid.drop(low_cardinality_cols, axis=1)
num_X_test = X_test.drop(low_cardinality_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

OH_X_test.head(10)

In [None]:
# make copy to avoid changing original data (when Imputing)
new_X_train = OH_X_train.copy()
# make new columns indicating what will be imputed
cols_with_missing = (col for col in new_X_train.columns if new_X_train[col].isnull().any())
for col in cols_with_missing:
    new_X_train[col + '_was_missing'] = new_X_train[col].isnull()

# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(OH_X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(OH_X_valid))
imputed_X_test = pd.DataFrame(my_imputer.transform(OH_X_test)

# Imputation removed column names; put them back
imputed_X_train.columns = OH_X_train.columns
imputed_X_valid.columns = OH_X_valid.columns
imputed_X_test.columns = OH_X_test.columns

imputed_X_train.head(10)

In [None]:
def score_dataset(estimators, X_train, X_valid, y_train, y_valid):
    my_model = XGBRegressor(n_estimators = estimators, learning_rate=0.05, random_state = 0)
    my_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
    predictions = my_model.predict(X_valid)
    print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)) + " for n_estimators = " + str(estimators))

In [None]:
results = {}
for i in range(3,8):
    results[200*i] = score_dataset(200*i, imputed_X_train, imputed_X_valid, y_train, y_valid)

In [None]:
final_model = XGBRegressor(n_estimators = 1000, learning_rate=0.05, random_state = 0)
final_model.fit(imputed_X_train, y_train, eval_set=[(imputed_X_valid, y_valid)], verbose=False)
predictions = final_model.predict(imputed_X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))

In [None]:
submission = final_model.predict(X_test)
SAMPLE_SUBMISSION_PATH = "../input/widsdatathon2022/sample_solution.csv"
SUBMISSION_PATH = "submission.csv"
sub = pd.read_csv(SAMPLE_SUBMISSION_PATH)
sub['site_eui'] = submission
sub.to_csv(SUBMISSION_PATH,index=False)