In [266]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import math
from sklearn.metrics import mean_absolute_error, mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from xgboost import XGBRegressor

plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

data = pd.read_csv("train.csv")
#features = ["LotArea", "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd", "TotalBsmtSF", "GrLivArea", "1stFlrSF", "2ndFlrSF", "GarageArea", "PoolArea", "YrSold"]
X = data.copy()
y = data["SalePrice"]
X.drop(columns=["SalePrice"], inplace=True)
#X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

In [267]:
for colname in X.select_dtypes("object"):
  X[colname], _ = X[colname].factorize()

imputer = SimpleImputer(strategy="most_frequent")
X.iloc[:,:] = imputer.fit_transform(X)

discrete_features = X.dtypes == "int64"

In [268]:
def make_mi_scores(X, y, discrete_features):
  mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
  mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
  mi_scores = mi_scores.sort_values(ascending=False)
  return mi_scores

mi_scores = make_mi_scores(X, y, discrete_features)
print(mi_scores.head(20))

OverallQual     0.570615
Neighborhood    0.500496
GrLivArea       0.482093
YearBuilt       0.370797
GarageCars      0.370386
TotalBsmtSF     0.366610
GarageArea      0.361996
ExterQual       0.329182
BsmtQual        0.325860
KitchenQual     0.325173
1stFlrSF        0.310173
MSSubClass      0.270527
FullBath        0.265014
GarageFinish    0.259831
GarageYrBlt     0.250578
YearRemodAdd    0.249197
FireplaceQu     0.208085
GarageType      0.205787
2ndFlrSF        0.204696
TotRmsAbvGrd    0.202146
Name: MI Scores, dtype: float64


In [269]:
#features = mi_scores.head(20).index
#X = X[features]
C = data.loc[:, ["YearBuilt"]]

kmeans = KMeans(n_clusters=3)
C["Cluster"] = kmeans.fit_predict(C)
#C["Cluster"] = C["Cluster"].astype("category")

X = X.join(C, how="left", lsuffix="YearBuilt")

In [270]:
C["Cluster"].value_counts()

1    595
0    591
2    274
Name: Cluster, dtype: int64

In [271]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

def competition_error_calculation(model, X_val, y_val):
  predictions = model.predict(X_val)
  return math.sqrt(mean_squared_log_error(predictions, y_val))

my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_val, y_val)], verbose=False)
predictions = my_model.predict(X_val)
print("SMSE: " + str(competition_error_calculation(my_model, X_val, y_val)))



SMSE: 0.13070777113186974


In [272]:
def make_submission(model, test_data):
  predictions = model.predict(test_data)
  predictions_df = pd.DataFrame(data={"Id": range(1461, 1461 + len(test_data)), "SalePrice": predictions})
  predictions_df.to_csv("submission.csv", index=False)

test_data = pd.read_csv("test.csv")

for colname in test_data.select_dtypes("object"):
  test_data[colname], _ = test_data[colname].factorize()

imputer = SimpleImputer(strategy="most_frequent")
test_data.iloc[:,:] = imputer.fit_transform(test_data)

C = data.loc[:, ["YearBuilt"]]
kmeans = KMeans(n_clusters=3)
C["Cluster"] = kmeans.fit_predict(C)
test_data = test_data.join(C, how="left", lsuffix="YearBuilt")

make_submission(model=my_model, test_data=test_data)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,YearBuilt,Cluster
0,1461.0,20.0,0.0,80.0,11622.0,0.0,-1.0,0.0,0.0,0.0,...,-1.0,0.0,-1.0,0.0,6.0,2010.0,0.0,0.0,2003,2
1,1462.0,20.0,1.0,81.0,14267.0,0.0,-1.0,1.0,0.0,0.0,...,-1.0,-1.0,0.0,12500.0,6.0,2010.0,0.0,0.0,1976,0
2,1463.0,60.0,1.0,74.0,13830.0,0.0,-1.0,1.0,0.0,0.0,...,-1.0,0.0,-1.0,0.0,3.0,2010.0,0.0,0.0,2001,2
3,1464.0,60.0,1.0,78.0,9978.0,0.0,-1.0,1.0,0.0,0.0,...,-1.0,-1.0,-1.0,0.0,6.0,2010.0,0.0,0.0,1915,1
4,1465.0,120.0,1.0,43.0,5005.0,0.0,-1.0,1.0,1.0,0.0,...,-1.0,-1.0,-1.0,0.0,1.0,2010.0,0.0,0.0,2000,2
