### Packages

In [10]:
import pandas as pd
import numpy as np
from scipy import stats
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder
from tpot import TPOTRegressor

### Main Code Block

In [12]:
# Load the dataset
data = pd.read_csv('data/merged.csv', infer_datetime_format=True, low_memory = False)

# Drop columns with many unique options
data = data.drop(columns = ['Unnamed: 0', 'Non Use Code', 'Assessor Remarks', 'Location', 'OPM remarks'])

# Separate numerical and categorical features
numerical_features = data.select_dtypes(include=['number']).columns
categorical_features = data.select_dtypes(exclude=['number']).columns

# Impute missing values for numerical features
imputer = SimpleImputer(strategy='mean')
data[numerical_features] = imputer.fit_transform(data[numerical_features])

# Impute missing values for categorical features
imputer = SimpleImputer(strategy='most_frequent')
data[categorical_features] = imputer.fit_transform(data[categorical_features])

# Encode high-cardinality categorical variables using target encoding
encoder = TargetEncoder(cols=['Town', 'Address', 'Property Type', 'Residential Type'])
data[categorical_features] = encoder.fit_transform(data[categorical_features], data['Sale Amount'])

# Define features (X) and target (y)
X = data.drop(columns='Sale Amount')  # Features
y = data['Sale Amount']  # Target variable

In [13]:
# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)


# Define models to test
models = {
    # 'CatBoost Regression': CatBoostRegressor(loss_function="Poisson", iterations=400, border_count=254, random_state=25, depth=8),
    'Decision Tree': DecisionTreeRegressor(max_depth=6, min_samples_leaf=17, min_samples_split=5),
    'Linear Regression': LinearRegression(),
    'Lasso Regression': Lasso(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
}

for model_name, model in models.items():
    print(f"Testing {model_name}:")
    model.fit(X_train, y_train)
    y_val_hat = model.predict(X_val)
    validation_score = r2_score(y_val, y_val_hat)
    print(f"Validation R-squared for {model_name}: {validation_score}")
    y_test_hat = model.predict(X_test)
    test_score = r2_score(y_test, y_test_hat)
    print(f"Test R-squared for {model_name}: {test_score}")
    print()

Testing Decision Tree:
Validation R-squared for Decision Tree: 0.8317227898498738
Test R-squared for Decision Tree: 0.5301348800153661

Testing Linear Regression:
Validation R-squared for Linear Regression: 0.8018991851684967
Test R-squared for Linear Regression: 0.9088160156684683

Testing Lasso Regression:
Validation R-squared for Lasso Regression: 0.8018996470271533
Test R-squared for Lasso Regression: 0.908816241513524

Testing Random Forest:
Validation R-squared for Random Forest: 0.8614361485901804
Test R-squared for Random Forest: 0.9760606769597313

Testing XGBoost:
Validation R-squared for XGBoost: 0.540781935969054
Test R-squared for XGBoost: 0.8528391505438797



In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and run TPOTRegressor for automated model selection with cross-validation
tpot = TPOTRegressor(generations=2, population_size=50, random_state=42, n_jobs=-1, verbosity = 2, cv = 5)
tpot.fit(X_train, y_train)

# Evaluate the best pipeline on the test set
test_score = tpot.score(X_test, y_test)
print("Test R-squared:", test_score)

# Get the best pipeline and export it as a Python script (if desired)
best_pipeline = tpot.fitted_pipeline_
best_pipeline.export('best_regression_pipeline.py')

Optimization Progress:   0%|          | 0/30 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -35044483261677.93

Generation 2 - Current best internal CV score: -35044483261677.93

Best pipeline: DecisionTreeRegressor(input_matrix, max_depth=6, min_samples_leaf=17, min_samples_split=5)
Test R-squared: -1325454356399.405


AttributeError: 'Pipeline' object has no attribute 'export'

### Feature Selection Work

In [None]:
# Loading the dataset
data = pd.read_csv('data/merged.csv', infer_datetime_format=True)

# Feature Selection Testing
data_full = data.drop(columns= 'Unnamed: 0')
data_pruned = data.drop(columns= ['Unnamed: 0', 'Non Use Code', 'Assessor Remarks', 'Location', 'OPM remarks', 'Town', 'Address'])
data_max_prune = data.drop(columns= ['Unnamed: 0', 'Non Use Code', 'Assessor Remarks', 'Location', 'Property Type', 'Residential Type', 'OPM remarks'])

# Removing Null Values From Data (Could Look Into Mean/Median/Mode Replacement)
data_dropped = data_full.dropna()
data_pruned_dropped = data_pruned.dropna()
data_max_prune_dropped = data_max_prune.dropna()

### Analyzing Feature Types

In [None]:
print('Full Data')
print(data_full.dtypes, '\n')

print('Data Pruned')
print(data_dropped.dtypes, '\n')

print('Data Pruned NA-Dropped')
print(data_pruned_dropped.dtypes, '\n')

print('Data Max Prune NA-Dropped')
print(data_max_prune_dropped.dtypes, '\n')

### Analyzing Feature Counts

In [None]:
print('Full Data')
print(data_full.count(), '\n')

print('Data Pruned')
print(data_dropped.count(), '\n')

print('Data Pruned NA-Dropped')
print(data_pruned_dropped.count(), '\n')

print('Data Max Prune NA-Dropped')
print(data_max_prune_dropped.count(), '\n')

### Correlation Matrix For Feature Selection Analysis

In [None]:
corr_matrix = data_pruned.corr(numeric_only=True)
plt.figure(figsize=(12,12))
ax = sns.heatmap(corr_matrix)

In [None]:
corr_matrix = data_pruned_dropped.corr(numeric_only=True)
plt.figure(figsize=(12,12))
ax = sns.heatmap(corr_matrix)

In [None]:
corr_matrix = data_max_prune_dropped.corr(numeric_only=True)
plt.figure(figsize=(12,12))
ax = sns.heatmap(corr_matrix)

### MISC

In [None]:
# data.head()

# # Validation Prediction
# y_val_hat = clf.predict(X_val)
# validation_score = r2_score(y_val, y_val_hat)
# print("Validation R-squared:", validation_score)

# # Evaluate the final model's performance on the test set
# y_test_hat = clf.predict(X_test)
# test_score = r2_score(y_test, y_test_hat)
# print("Test R-squared:", test_score)


# Handle missing values (you can use other strategies)
# data = data.dropna()

#One-Hot Encoding Categorical Values With Managable Number of Unique Option (Possibility)
#data = pd.get_dummies(data, columns = ['Property Type', 'Residential Type'])