In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ppscore as pps
import warnings
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn import preprocessing

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold
import seaborn as sns

warnings.filterwarnings('ignore')
from IPython.display import clear_output
import json


In [2]:
# General functions and variables

le = preprocessing.LabelEncoder()

def encode_drop(df1, column_name):
    return encode_drop_per_df(df1, column_name)

def encode_drop_per_df(df, column_name):
    df['enc_'+column_name] = le.fit_transform(df[column_name].values)
    return df.drop(column_name, axis=1)

def bar(df, column_name, aggfunc=np.median, figsize=(4,4), xrotation=0):
    pivot = df.pivot_table(index=column_name,
                  values='SalePrice', aggfunc=aggfunc)

    pivot.sort_values('SalePrice').plot(kind='bar', color='blue', figsize=figsize)
    plt.xlabel(column_name)
    plt.ylabel('Median Sale Price')
    plt.xticks(rotation=xrotation)
    plt.show()
    
def encode(df1, col_name, manual_map):
    df1[col_name] = df1[col_name].map(manual_map)
    return df1

def encode_corr(df1, col_name):
    pivot = df1.pivot_table(index=col_name,
                      values='SalePrice', aggfunc=np.median)
    manual_map = {}
    for i, (idx, _) in enumerate(pivot.sort_values('SalePrice').iterrows()):
        manual_map[idx] = i

    return encode(df1, col_name, manual_map)

def dummy(df, col):
    df_dummies = pd.get_dummies(df[col], drop_first=True, prefix=col)
    return df.join(df_dummies)

In [3]:
# Modeling
def prep_data(df):
    df = df.select_dtypes(include=[np.number]).interpolate().dropna()
    y = np.log(df.SalePrice)
    X = df.drop(['SalePrice', 'Id'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
                          X, y, random_state=42, test_size=.33)
    return X_train, X_test, y_train, y_test

def linear(df):
    return linearM(df, linear_model.LinearRegression())

def ridge(df):
    return linearM(df, linear_model.Ridge())

def linearM(df, model):
    X_train, X_test, y_train, y_test = prep_data(df)
    predictions = model.fit(X_train, y_train).predict(X_test)
    return mean_squared_error(y_test, predictions, squared=False)

In [4]:
train = pd.read_csv('data/train.csv')
#test = pd.read_csv('data/test.csv')

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [8]:
df = train.copy()#.drop(['Id'], axis=1)
conf = {}

columns = df.keys()
best_rmse = 100
best_conf = {}
run = 0

def search(columns, df):
    global best_rmse, run
    #print("columns:", len(columns))
    #print("df:", len(df.keys()))
    
    if not columns.empty:
        #for col in columns:
        col = columns[0]
        #print(f"Encoding column {col}")

        if np.issubdtype(df[col], object):
            df[col].fillna("NullVal", inplace=True)
            
            #print(f"{col}\tDummyfying")
            df1 = pd.get_dummies(df, columns = [col])
            conf[col] = "dummy"
            search(columns[1:], df1)
            
            #print(f"{col}\tCorrelation Encoding")
            df1 = encode_corr(df, col)
            conf[col] = "corr"
            search(columns[1:], df1)
            
            #print(f"{col}\Encoding")
            df1 = encode_drop(df, col)
            conf[col] = "encode"
            search(columns[1:], df1)
        
        else:
            search(columns[1:], df)

    else:
        #print("Mon DF:", df.keys())
        clear_output(wait=True)
        rmse = linear(df)
        if rmse < best_rmse:
            best_rmse = rmse
            best_conf = conf
        #    print("Previous Best RMSE:", best_rmse)
        #    print("New best RMSE:", rmse)
        #else:
        run += 1
        print("Run:", run)
        print(f"Best RMSE: {best_rmse} - Current: {rmse}")
        print("Conf:", json.dumps(conf, indent=4))
        

search(columns, df.copy())

KeyboardInterrupt: 

In [9]:
df = train.copy()#.drop(['Id'], axis=1)
conf = {}

columns = df.keys()
best_rmse = 100
best_conf = {}
run = 0

def search(columns, df):
    global best_rmse, run
    #print("columns:", len(columns))
    #print("df:", len(df.keys()))
    
    if not columns.empty:
        #for col in columns:
        col = columns[0]
        #print(f"Encoding column {col}")

        if np.issubdtype(df[col], object):
            df[col].fillna("NullVal", inplace=True)
            
            #print(f"{col}\tDummyfying")
            df1 = pd.get_dummies(df, columns = [col])
            conf[col] = "dummy"
            search(columns[1:], df1)
            
            #print(f"{col}\tCorrelation Encoding")
            df1 = encode_corr(df, col)
            conf[col] = "corr"
            search(columns[1:], df1)
            
            #print(f"{col}\Encoding")
            df1 = encode_drop(df, col)
            conf[col] = "encode"
            search(columns[1:], df1)
        
        else:
            search(columns[1:], df)

    else:
        #print("Mon DF:", df.keys())
        clear_output(wait=True)
        rmse = ridge(df)
        if rmse < best_rmse:
            best_rmse = rmse
            best_conf = conf
        #    print("Previous Best RMSE:", best_rmse)
        #    print("New best RMSE:", rmse)
        #else:
        run += 1
        print("Run:", run)
        print(f"Best RMSE: {best_rmse} - Current: {rmse}")
        print("Conf:", json.dumps(conf, indent=4))
        

search(columns, df.copy())

KeyboardInterrupt: 

In [53]:
pos = len(train.select_dtypes(include='object').keys())*3

In [54]:
from random import shuffle

myrange = list(range(1,pos))
shuffle(myrange)
#print(myrange)
print(myrange[0])
print(myrange[0]//3)
print(myrange[0]%3)

13
4
1


In [57]:
train.select_dtypes(include='object').keys()[2]

'Alley'

In [None]:
train.iloc

In [59]:
def search(df, col, enc):
    global best_rmse, run
    
    df[col].fillna("NullVal", inplace=True)

    if enc == 0:
        df = pd.get_dummies(df, columns = [col])
        conf[col] = "dummy"

    elif enc == 1:
        df = encode_corr(df, col)
        conf[col] = "corr"

    elif enc == 2:
        df = encode_drop(df, col)
        conf[col] = "encode"

    return df, conf

In [60]:
df = train.copy()#.drop(['Id'], axis=1)
conf = {}

columns = train.select_dtypes(include='object').keys()
best_rmse = 100
best_conf = {}
run = 0

for r in myrange:
    col = columns[r//3]
    enc = r%3

    df1, conf = search(df, col, enc)
    
    rmse = ridge(df1)
    
    if rmse < best_rmse:
        best_rmse = rmse
        best_conf = conf
        
    run += 1
    clear_output(wait=True)
    
    print("Run:", run)
    print(f"Best RMSE: {best_rmse} - Current: {rmse}")
    print("Conf:", json.dumps(conf, indent=4))

Run: 128
Best RMSE: 0.12827789123754027 - Current: 0.13857875147288962
Conf: {
    "LandContour": "encode",
    "GarageQual": "dummy",
    "BldgType": "encode",
    "PavedDrive": "encode",
    "BsmtFinType1": "encode",
    "Electrical": "dummy",
    "Neighborhood": "encode",
    "MSZoning": "corr",
    "HouseStyle": "encode",
    "RoofMatl": "dummy",
    "SaleCondition": "corr",
    "GarageCond": "dummy",
    "MasVnrType": "encode",
    "Foundation": "corr",
    "RoofStyle": "dummy",
    "ExterQual": "dummy",
    "SaleType": "dummy",
    "CentralAir": "corr",
    "HeatingQC": "dummy",
    "BsmtExposure": "corr",
    "Exterior1st": "corr",
    "KitchenQual": "dummy",
    "FireplaceQu": "encode",
    "Utilities": "dummy",
    "BsmtFinType2": "encode",
    "BsmtCond": "dummy",
    "Condition2": "corr",
    "Exterior2nd": "encode",
    "PoolQC": "corr",
    "Condition1": "encode",
    "Heating": "encode",
    "LotConfig": "dummy",
    "BsmtQual": "encode",
    "Street": "dummy",
    "LotSh