In [49]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [50]:
import pandas as pd
import numpy as np

# Load the data
df = pd.read_csv('../Data/baseline_model/CC_baseline_header_long_tail.csv')

# Explore the data
print(df.head())       # Check the first few rows
print(df.info())       # Check data types and non-null counts
print(df.describe())   # Summary statistics for numeric columns

          DOM  SQFT  BEDS  BATHS  AGE  zip_cluster
0  242.509910  1368     2    2.0    3       2474.0
1  239.892049  1850     2    2.5    5       2124.0
2  236.463293  1469     2    1.0   38       2184.0
3  238.368257  2945     2    2.5    4       1949.0
4   56.000000  2536     3    2.5    3       2139.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10799 entries, 0 to 10798
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DOM          10697 non-null  float64
 1   SQFT         10799 non-null  int64  
 2   BEDS         10799 non-null  int64  
 3   BATHS        10799 non-null  float64
 4   AGE          10799 non-null  int64  
 5   zip_cluster  10799 non-null  float64
dtypes: float64(3), int64(3)
memory usage: 506.3 KB
None
                DOM          SQFT          BEDS         BATHS           AGE  \
count  10697.000000  10799.000000  10799.000000  10799.000000  10799.000000   
mean      49.771103   1347.342439  

In [51]:
# Check for missing values
print(df.isnull().sum())   # Count of NaN values per column

# Replace infinite values and drop rows with NaNs in critical columns (e.g., 'DOM')
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=['DOM'])  # Remove rows with missing 'DOM'


DOM            102
SQFT             0
BEDS             0
BATHS            0
AGE              0
zip_cluster      0
dtype: int64


In [52]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the scaler
scaler = MinMaxScaler()

# Apply normalization to AGE and SQFT
df[['AGE_normalized', 'SQFT_normalized']] = scaler.fit_transform(df[['AGE', 'SQFT']])

# Check normalized values
print(df[['AGE', 'AGE_normalized', 'SQFT', 'SQFT_normalized']].head())

   AGE  AGE_normalized  SQFT  SQFT_normalized
0    3        0.813946  1368          0.01368
1    5        0.814150  1850          0.01850
2   38        0.817515  1469          0.01469
3    4        0.814048  2945          0.02945
4    3        0.813946  2536          0.02536


In [53]:
# Example: Interaction between normalized square footage and bedrooms
df['SQFT_BEDS_interaction'] = df['SQFT_normalized'] * df['BEDS']

# Example: Interaction between normalized age and bathrooms
df['AGE_BATHS_interaction'] = df['AGE_normalized'] * df['BATHS']

# Check new interaction features
print(df[['SQFT_BEDS_interaction', 'AGE_BATHS_interaction']].head())

   SQFT_BEDS_interaction  AGE_BATHS_interaction
0               0.027360               1.627893
1               0.037000               2.035376
2               0.029380               0.817515
3               0.058901               2.035121
4               0.076081               2.034866


In [54]:
# Apply log transformation to 'DOM' to reduce the impact of outliers
df['DOM_log'] = np.log1p(df['DOM'])


In [55]:
# Bin 'AGE_normalized' into categories
age_bins = [0, 0.3, 0.7, 1]
age_labels = ['New', 'Mid-Age', 'Old']
df['AGE_binned'] = pd.cut(df['AGE_normalized'], bins=age_bins, labels=age_labels)

# Check the binned column
print(df[['AGE_normalized', 'AGE_binned']].head())

   AGE_normalized AGE_binned
0        0.813946        Old
1        0.814150        Old
2        0.817515        Old
3        0.814048        Old
4        0.813946        Old


In [59]:
from sklearn.model_selection import KFold

# Define target encoding function
def target_encode(df, cat_column, target_column, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    encoded_col = np.zeros(df.shape[0])
    
    for train_idx, valid_idx in kf.split(df):
        train_data, valid_data = df.iloc[train_idx], df.iloc[valid_idx]
        mean_target = train_data.groupby(cat_column)[target_column].mean()
        encoded_col[valid_idx] = valid_data[cat_column].map(mean_target)
    
    return encoded_col

# Apply target encoding to 'zip_cluster'
df['zip_cluster_target_encoded'] = target_encode(df, 'zip_cluster', 'DOM')
# Check the encoded column
print(df[['zip_cluster', 'zip_cluster_target_encoded']].head())

   zip_cluster  zip_cluster_target_encoded
0       2474.0                   48.130346
1       2124.0                   39.604167
2       2184.0                   54.957674
3       1949.0                   58.492998
4       2139.0                   45.858559


In [64]:
from sklearn.preprocessing import PolynomialFeatures

# Define features for polynomial transformation
poly_features = ['SQFT_normalized', 'AGE_normalized']
poly = PolynomialFeatures(degree=2, include_bias=False)

# Transform the data and add it back to the dataframe
poly_transformed = poly.fit_transform(df[poly_features])
poly_columns = poly.get_feature_names_out(poly_features)

df_poly = pd.DataFrame(poly_transformed, columns=poly_columns)
df = pd.concat([df, df_poly], axis=1)

# Check polynomial features
print(df[poly_columns].head())

   SQFT_normalized  SQFT_normalized  AGE_normalized  AGE_normalized  \
0          0.01368          0.01368        0.813946        0.813946   
1          0.01850          0.01850        0.814150        0.814150   
2          0.01469          0.01469        0.817515        0.817515   
3          0.02945          0.02945        0.814048        0.814048   
4          0.02536          0.02536        0.813946        0.813946   

   SQFT_normalized^2  SQFT_normalized AGE_normalized  AGE_normalized^2  
0           0.000187                        0.011135          0.662509  
1           0.000342                        0.015062          0.662841  
2           0.000216                        0.012009          0.668330  
3           0.000867                        0.023974          0.662675  
4           0.000643                        0.020642          0.662509  


In [66]:
print(df.columns)

Index(['DOM', 'SQFT', 'BEDS', 'BATHS', 'AGE', 'zip_cluster', 'AGE_normalized',
       'SQFT_normalized', 'SQFT_BEDS_interaction', 'AGE_BATHS_interaction',
       'DOM_log', 'AGE_binned', 'zip_cluster_target_encoded',
       'SQFT_normalized', 'AGE_normalized', 'SQFT_normalized^2',
       'SQFT_normalized AGE_normalized', 'AGE_normalized^2'],
      dtype='object')


In [67]:
from sklearn.model_selection import train_test_split

# Define the features and target variable
features = [
    'SQFT_normalized', 'BEDS', 'BATHS', 'AGE_normalized', 'zip_cluster_target_encoded',
    'SQFT_BEDS_interaction', 'AGE_BATHS_interaction', 'AGE_binned',
    'SQFT_normalized^2', 'SQFT_normalized AGE_normalized', 'AGE_normalized^2'
]
target = 'DOM'

# Split the data into train and test sets
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Output the shapes of the resulting splits for verification
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (8639, 13)
X_test shape: (2160, 13)
y_train shape: (8639,)
y_test shape: (2160,)


In [68]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 3, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 2, 5]
}

In [72]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm
import numpy as np

# Expanded parameter grid
param_grid = {
    'n_estimators': [50, 100, 200, 300],                # Number of trees
    'learning_rate': [0.01, 0.05, 0.1, 0.2],           # Step size shrinkage
    'max_depth': [3, 5, 7, 10],                        # Maximum depth of trees
    'subsample': [0.6, 0.8, 1],                        # Fraction of samples used per tree
    'colsample_bytree': [0.6, 0.8, 1],                 # Fraction of features used per tree
    'gamma': [0, 0.1, 0.3, 0.5],                       # Minimum loss reduction for split
    'min_child_weight': [1, 3, 5],                     # Minimum sum of weights in a child
    'reg_alpha': [0, 0.01, 0.1],                       # L1 regularization term
    'reg_lambda': [1, 1.5, 2],                         # L2 regularization term
    'scale_pos_weight': [1],                           # Class imbalance scaling
    'tree_method': ['exact', 'approx', 'hist'],        # Tree construction algorithm
}

# Initialize the XGBoost Regressor
xgb_regressor = XGBRegressor(random_state=42)

# Create a GridSearchCV object
grid_search = GridSearchCV(
    estimator=xgb_regressor,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=0,  # Disable default verbose output
    n_jobs=-1
)

# Wrap the fitting process with a tqdm progress bar
with tqdm(total=len(param_grid['n_estimators']) * len(param_grid['learning_rate']) * len(param_grid['max_depth']) * 
          len(param_grid['subsample']) * len(param_grid['colsample_bytree']) * len(param_grid['gamma']) * 
          len(param_grid['min_child_weight']) * len(param_grid['reg_alpha']) * len(param_grid['reg_lambda']) * 
          len(param_grid['tree_method'])) as pbar:
    def tqdm_callback(*args, **kwargs):
        pbar.update(1)

    grid_search.fit(
        train.drop(columns=['DOM']),
        train['DOM']
    )

# Print the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best RMSE Score:", -grid_search.best_score_)

  0%|          | 0/186624 [00:00<?, ?it/s]