In [None]:
# kaggle competitions download -c house-prices-advanced-regression-techniques -p "/Users/issacsmacbookpro/.cursor-tutor/projects/Housing Price Prediction"
# https://www.kaggle.com/code/ryannolan1/kaggle-housing-youtube-video

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb

In [None]:
# Resolve the dataset folder once. Set the HOUSING_DATA_DIR environment
# variable to run the notebook on another machine; the default is the
# original author's local folder.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get(
    'HOUSING_DATA_DIR',
    '/Users/issacsmacbookpro/.cursor-tutor/projects/Housing Price Prediction/house-prices-advanced-regression-techniques',
))

# Raw Kaggle train / test splits.
train_df = pd.read_csv(DATA_DIR / 'train.csv')
test_df = pd.read_csv(DATA_DIR / 'test.csv')

In [None]:
def _add_sale_features(frame):
    """Add the mkt_cyc, house_age and house_age_range columns to ``frame`` in place.

    - mkt_cyc: YrSold bucketed into left-closed intervals
      [2005, 2008), [2008, 2010), [2010, 2012).
    - house_age: YrSold minus YearBuilt.
    - house_age_range: house_age bucketed into right-closed intervals
      (-1, 5], (5, 20], (20, 50], (50, 80], (80, 100], (100, inf).
    """
    frame['mkt_cyc'] = pd.cut(
        frame['YrSold'],
        bins=[2005, 2008, 2010, 2012],
        labels=['Peak and Initial Decline', 'Crash and Bottom', 'Early Recovery'],
        right=False,  # include the left edge of every bin
    )
    frame['house_age'] = frame['YrSold'] - frame['YearBuilt']
    # NOTE(review): a house_age of -1 or lower falls outside the first bin and
    # becomes NaN here -- confirm whether such rows exist.
    frame['house_age_range'] = pd.cut(
        frame['house_age'],
        bins=[-1, 5, 20, 50, 80, 100, float('inf')],
        labels=['New Construction', 'Recent Builds', 'Contemporary', 'Mid-Century', 'Pre-War', 'Historic'],
    )


# Same engineered columns on both splits.
_add_sale_features(train_df)
_add_sale_features(test_df)

In [None]:
# Preview the first rows of the test frame (switch to the commented line to preview train instead).
# train_df.head()
test_df.head()

In [None]:

# Keep only the int64 / float64 columns of the training frame
numeric_features = train_df.select_dtypes(include=['int64', 'float64'])

# Show which columns were selected
print(numeric_features.columns)

In [None]:

# Keep only the int64 / float64 columns of the test frame
numeric_features_test = test_df.select_dtypes(include=['int64', 'float64'])

# Show which columns were selected
print(numeric_features_test.columns)

In [None]:
# Summary statistics (describe()) of the numeric training columns
numeric_summary = numeric_features.describe()

# Print the summary table
print(numeric_summary)



In [None]:
# Summary statistics (describe()) of the numeric test columns
numeric_summary_test = numeric_features_test.describe()

# Print the summary table
print(numeric_summary_test)





In [None]:
# Count missing values per numeric training column
missing_values = numeric_features.isnull().sum()

# Print only the columns that have at least one missing value
print(missing_values[missing_values > 0])

In [None]:
# Count missing values per numeric test column
missing_values_test = numeric_features_test.isnull().sum()

# Print only the columns that have at least one missing value
print(missing_values_test[missing_values_test > 0])

In [None]:
def _fill_with_group_median(values):
    """Replace the NaNs of one Neighborhood group with that group's median."""
    return values.fillna(values.median())


# Impute the numeric training columns that have gaps, one Neighborhood at a time.
for gap_col in ('LotFrontage', 'MasVnrArea', 'GarageYrBlt'):
    train_df[gap_col] = train_df.groupby('Neighborhood')[gap_col].transform(_fill_with_group_median)

# Re-select the numeric columns and report any that still contain NaNs.
numeric_features = train_df.select_dtypes(include=['int64', 'float64'])
missing_values = numeric_features.isna().sum()
print(missing_values.loc[missing_values > 0])


In [None]:
# Numeric test columns whose NaNs are replaced by the column median.
# NOTE(review): the medians come from the test split itself, while the training
# split is imputed per Neighborhood -- confirm this asymmetry is intended.
features_to_fill = [
    'LotFrontage', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
    'GarageYrBlt', 'GarageCars', 'GarageArea',
]

fill_values = test_df[features_to_fill].median()
for feature in features_to_fill:
    test_df[feature] = test_df[feature].fillna(fill_values[feature])

# Re-select the numeric columns and report any that still contain NaNs.
numeric_features_test = test_df.select_dtypes(include=['int64', 'float64'])
missing_values_test = numeric_features_test.isna().sum()
print(missing_values_test.loc[missing_values_test > 0])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Interactive explorer: one scatter of SalePrice per numeric training feature,
# selectable with a slider.

# Step 1: numeric columns, minus the target and the row identifier
numeric_features = train_df.select_dtypes(include=['int64', 'float64'])
numeric_features_excluded = numeric_features.drop(columns=['SalePrice', 'Id'], errors='ignore')
feature_names = list(numeric_features_excluded.columns)

# Step 2: one trace per feature; only the first one starts visible
fig = go.Figure()
for position, feature in enumerate(feature_names):
    fig.add_trace(go.Scatter(
        x=train_df[feature],
        y=train_df['SalePrice'],
        mode='markers',
        name=feature,
        visible=(position == 0),
    ))

# Step 3: each slider step shows exactly one trace and retitles the figure
slider_steps = [
    {
        'label': feature,
        'method': 'update',
        'args': [
            {'visible': [i == j for j in range(len(feature_names))]},
            {'title': f'{feature} vs SalePrice', 'xaxis': {'title': feature}},
        ],
    }
    for i, feature in enumerate(feature_names)
]

# Step 4: the initial title / x-axis label match the trace that is visible
# (the previous static title had no feature name and wrongly claimed that
# LotArea was excluded).
first_feature = feature_names[0] if feature_names else ''
fig.update_layout(
    sliders=[{
        'active': 0,
        'pad': {"t": 50},
        'steps': slider_steps
    }],
    title=f'{first_feature} vs SalePrice' if feature_names else 'No numeric features to plot',
    xaxis_title=first_feature,
    yaxis_title='SalePrice'
)

fig.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Interactive explorer for the test split.
# test_df has no SalePrice column, so plotting its features against
# train_df['SalePrice'] (as this cell used to) paired unrelated rows.
# Each numeric test feature is shown as a histogram instead.

# Step 1: numeric columns, minus the row identifier (and the target, if present)
numeric_features_test = test_df.select_dtypes(include=['int64', 'float64'])
numeric_features_excluded = numeric_features_test.drop(columns=['SalePrice', 'Id'], errors='ignore')
feature_names = list(numeric_features_excluded.columns)

# Step 2: one histogram per feature; only the first one starts visible
fig = go.Figure()
for position, feature in enumerate(feature_names):
    fig.add_trace(go.Histogram(x=test_df[feature], name=feature, visible=(position == 0)))

# Step 3: each slider step shows exactly one trace and retitles the figure
slider_steps = [
    {
        'label': feature,
        'method': 'update',
        'args': [
            {'visible': [i == j for j in range(len(feature_names))]},
            {'title': f'Test-set distribution of {feature}', 'xaxis': {'title': feature}},
        ],
    }
    for i, feature in enumerate(feature_names)
]

# Step 4: the initial title / x-axis label match the trace that is visible
first_feature = feature_names[0] if feature_names else ''
fig.update_layout(
    sliders=[{
        'active': 0,
        'pad': {"t": 50},
        'steps': slider_steps
    }],
    title=f'Test-set distribution of {first_feature}' if feature_names else 'No numeric features to plot',
    xaxis_title=first_feature,
    yaxis_title='Count'
)

fig.show()

In [None]:
# Outliers
# LotFrontage >300
# LotArea > 100000
# MasVnrArea > 1500
# BsmtFinSF1 > 5000
# TotalBsmtSF > 6000
# 1stFlrSF > 4500
# GrLivArea > 5500
# OpenPorchSF > 500
# EnclosedPorch > 500
# 3SsnPorch > 400
# ScreenPorch > 400
# MiscVal > 80000





import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output

def find_outliers_for_all_numeric_features(df, multiplier=3, lower_q=0.03, upper_q=0.97):
    """Collect, per numeric feature, the rows lying far outside a quantile band.

    For every int64/float64 column (except Id, MSSubClass, OverallQual and
    OverallCond) the band is ``[q_low - multiplier * spread, q_high + multiplier * spread]``
    where ``q_low`` / ``q_high`` are the ``lower_q`` / ``upper_q`` quantiles and
    ``spread = q_high - q_low``. Note that the defaults are the 3rd and 97th
    percentiles, i.e. a much wider band than the classic 25th/75th IQR rule;
    pass ``lower_q=0.25, upper_q=0.75`` for the textbook version.

    Parameters:
    - df (pd.DataFrame): Frame to scan.
    - multiplier (float): How many spreads beyond the quantiles a value may lie.
    - lower_q (float): Lower quantile in [0, 1]; must be below ``upper_q``.
    - upper_q (float): Upper quantile in [0, 1].

    Returns:
    - dict: feature name -> DataFrame of the outlying rows (all columns kept),
      sorted by that feature in descending order. Features without outliers
      map to an empty DataFrame.

    Raises:
    - ValueError: if the quantiles are not ordered within [0, 1].
    """
    if not 0 <= lower_q < upper_q <= 1:
        raise ValueError("lower_q and upper_q must satisfy 0 <= lower_q < upper_q <= 1")

    # Only numeric columns are candidates; identifiers and coded ratings are skipped.
    numeric_features = df.select_dtypes(include=['int64', 'float64'])
    columns_to_exclude = ['Id', 'MSSubClass', 'OverallQual', 'OverallCond']
    numeric_features = numeric_features.drop(columns=columns_to_exclude, errors='ignore')

    outliers_dict = {}
    for feature in numeric_features.columns:
        column = df[feature]
        q_low = column.quantile(lower_q)
        q_high = column.quantile(upper_q)
        spread = q_high - q_low

        lower_bound = q_low - multiplier * spread
        upper_bound = q_high + multiplier * spread

        # NaN compares False on both sides, so missing values are never flagged.
        outliers = df[(column < lower_bound) | (column > upper_bound)]
        outliers_dict[feature] = outliers.sort_values(by=feature, ascending=False)

    return outliers_dict

# Outlier rows per numeric training feature.
outliers_dict = find_outliers_for_all_numeric_features(train_df, multiplier=1.5)

# Feature selector
dropdown = widgets.Dropdown(
    options=outliers_dict.keys(),
    description='Feature:',
    disabled=False,
)


def _highlight_feature(feature):
    """Return the outlier table of ``feature`` with that column shaded yellow."""
    # NOTE(review): recent pandas releases deprecate Styler.applymap in favour
    # of Styler.map -- confirm against the installed version.
    return outliers_dict[feature].style.applymap(
        lambda x: 'background-color: yellow', subset=[feature]
    )


def display_outliers(change):
    """Dropdown callback: redraw the selector and the table of the chosen feature."""
    clear_output(wait=True)
    display(dropdown)
    display(_highlight_feature(change['new']))


# React to selection changes, then render the selector and the first table.
dropdown.observe(display_outliers, names='value')
display(dropdown)

initial_feature = next(iter(outliers_dict))
styled_initial_df = _highlight_feature(initial_feature)
display(styled_initial_df)

In [None]:
# Drop training rows that exceed any of the hand-picked upper limits.

# Column -> largest value still considered plausible
outlier_thresholds = {
    'LotFrontage': 300,
    'LotArea': 100000,
    'MasVnrArea': 1500,
    'BsmtFinSF1': 5000,
    'TotalBsmtSF': 6000,
    '1stFlrSF': 4500,
    'GrLivArea': 5500,
    'OpenPorchSF': 500,
    'EnclosedPorch': 500,
    '3SsnPorch': 400,
    'ScreenPorch': 400,
    'MiscVal': 80000,
}

# A row is an outlier as soon as one column is above its limit.
outlier_conditions = pd.Series(False, index=train_df.index)
for column_name, ceiling in outlier_thresholds.items():
    outlier_conditions = outlier_conditions | (train_df[column_name] > ceiling)

num_outliers = int(outlier_conditions.sum())
print(f"Number of outliers detected: {num_outliers}")

# Keep the remaining rows and renumber them from 0.
train_df_cleaned = train_df.loc[~outlier_conditions].reset_index(drop=True)

print(f"New training dataset size: {train_df_cleaned.shape}")

# (Optional) Save the cleaned dataset for future use
# train_df_cleaned.to_csv('/path/to/save/cleaned_train.csv', index=False)

In [None]:
# Flag (and drop) test rows that exceed the hand-picked upper limits.
# NOTE(review): removing rows from the test split means no prediction can be
# produced for them -- confirm this frame is only used for exploration and not
# for building a submission.

# Same limits as for the training split, plus a sanity cap on GarageYrBlt.
outlier_conditions = (
    (test_df['LotFrontage'] > 300) |
    (test_df['LotArea'] > 100000) |
    (test_df['MasVnrArea'] > 1500) |
    (test_df['BsmtFinSF1'] > 5000) |
    (test_df['TotalBsmtSF'] > 6000) |
    (test_df['1stFlrSF'] > 4500) |
    (test_df['GrLivArea'] > 5500) |
    (test_df['OpenPorchSF'] > 500) |
    (test_df['EnclosedPorch'] > 500) |
    (test_df['3SsnPorch'] > 400) |
    (test_df['ScreenPorch'] > 400) |
    (test_df['GarageYrBlt'] > 2020) |
    (test_df['MiscVal'] > 80000)
)

# Report how many test rows were flagged
num_outliers = test_df[outlier_conditions].shape[0]
print(f"Number of outliers detected in the test set: {num_outliers}")

# Remove the flagged rows from the test dataset
test_df_cleaned = test_df[~outlier_conditions].reset_index(drop=True)

# Report the new shape of the test dataset (the message used to say "training")
print(f"New test dataset size: {test_df_cleaned.shape}")

# (Optional) Save the cleaned dataset for future use
# train_df_cleaned.to_csv('/path/to/save/cleaned_train.csv', index=False)

In [None]:
# Top-20 columns by missing-value count, for both cleaned splits.
# A notebook cell only renders its last bare expression, so both tables are
# passed to display() explicitly (the train table used to be silently dropped).
display(pd.DataFrame(train_df_cleaned.isnull().sum().sort_values(ascending=False)).head(20))

display(pd.DataFrame(test_df_cleaned.isnull().sum().sort_values(ascending=False)).head(20))


In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import pandas as pd

# Explorer: value counts (NaN included) of every categorical training column.

# Object / category columns of the cleaned training frame
categorical_features = train_df_cleaned.select_dtypes(include=['object', 'category']).columns.tolist()

# Missing-value count per categorical column
null_counts_dict = {
    feature: train_df_cleaned[feature].isnull().sum()
    for feature in categorical_features
}

# Columns with the most missing values come first in the selector
sorted_categorical_features = sorted(categorical_features, key=lambda x: null_counts_dict[x], reverse=True)

# Value counts per column, most frequent value first
unique_counts_dict = {
    feature: train_df_cleaned[feature].value_counts(dropna=False).sort_values(ascending=False)
    for feature in sorted_categorical_features
}

# Feature selector
categorical_dropdown_counts = widgets.Dropdown(
    options=sorted_categorical_features,
    description='Feature:',
    disabled=False,
)


def display_unique_counts(change, _dropdown=categorical_dropdown_counts, _counts=unique_counts_dict):
    """Dropdown callback: show the value counts of the selected feature.

    ``_dropdown`` and ``_counts`` are bound at definition time so this
    callback keeps using this cell's widget and data even after another cell
    re-assigns the global names.
    """
    feature = change['new']
    clear_output(wait=True)
    display(_dropdown)

    counts_series = _counts.get(feature, pd.Series(dtype=int))

    # Label missing values as 'NaN'. The index is cast to object first
    # because a categorical index rejects labels outside its categories.
    labels = counts_series.index.astype(object)
    counts_series = counts_series.set_axis(labels.where(labels.notna(), 'NaN'))

    counts_df = counts_series.reset_index()
    counts_df.columns = [feature, 'Count']
    display(counts_df)


# React to selection changes and render the selector
categorical_dropdown_counts.observe(display_unique_counts, names='value')
display(categorical_dropdown_counts)

# Render the table of the initially selected feature
if sorted_categorical_features:
    display_unique_counts({'new': categorical_dropdown_counts.value})

In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import pandas as pd

# Explorer: value counts (NaN included) of every categorical test column.

# Object / category columns of the cleaned test frame
categorical_features = test_df_cleaned.select_dtypes(include=['object', 'category']).columns.tolist()

# Missing-value count per categorical column
null_counts_dict = {
    feature: test_df_cleaned[feature].isnull().sum()
    for feature in categorical_features
}

# Columns with the most missing values come first in the selector
sorted_categorical_features = sorted(categorical_features, key=lambda x: null_counts_dict[x], reverse=True)

# Value counts per column, most frequent value first
unique_counts_dict = {
    feature: test_df_cleaned[feature].value_counts(dropna=False).sort_values(ascending=False)
    for feature in sorted_categorical_features
}

# Feature selector
categorical_dropdown_counts = widgets.Dropdown(
    options=sorted_categorical_features,
    description='Feature:',
    disabled=False,
)


def display_unique_counts(change, _dropdown=categorical_dropdown_counts, _counts=unique_counts_dict):
    """Dropdown callback: show the value counts of the selected feature.

    ``_dropdown`` and ``_counts`` are bound at definition time so this
    callback keeps using this cell's widget and data even after another cell
    re-assigns the global names.
    """
    feature = change['new']
    clear_output(wait=True)
    display(_dropdown)

    counts_series = _counts.get(feature, pd.Series(dtype=int))

    # Label missing values as 'NaN'. The index is cast to object first
    # because a categorical index rejects labels outside its categories.
    labels = counts_series.index.astype(object)
    counts_series = counts_series.set_axis(labels.where(labels.notna(), 'NaN'))

    counts_df = counts_series.reset_index()
    counts_df.columns = [feature, 'Count']
    display(counts_df)


# React to selection changes and render the selector
categorical_dropdown_counts.observe(display_unique_counts, names='value')
display(categorical_dropdown_counts)

# Render the table of the initially selected feature
if sorted_categorical_features:
    display_unique_counts({'new': categorical_dropdown_counts.value})

In [None]:
# Integer codes for the ordinal categoricals; a missing value maps to 0.
quality_scale = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, np.nan: 0}
basement_finish_scale = {'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6, np.nan: 0}

ordinal_mappings = {
    'ExterQual': quality_scale,
    'ExterCond': quality_scale,
    'BsmtQual': quality_scale,
    'BsmtCond': quality_scale,
    'BsmtExposure': {'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4, np.nan: 0},
    'BsmtFinType1': basement_finish_scale,
    'BsmtFinType2': basement_finish_scale,
    'HeatingQC': quality_scale,
    'KitchenQual': quality_scale,
    'Functional': {
        'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4,
        'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8,
        np.nan: 0,
    },
    'FireplaceQu': quality_scale,
    'GarageFinish': {'Unf': 1, 'RFn': 2, 'Fin': 3, np.nan: 0},
    'GarageQual': quality_scale,
    'GarageCond': quality_scale,
    'PoolQC': quality_scale,
    'Fence': {'MnWw': 1, 'MnPrv': 2, 'GdWo': 3, 'GdPrv': 4, np.nan: 0},
}

# Work on a copy so the cleaned training frame stays untouched.
df = train_df_cleaned.copy()

# Any label that is absent from its mapping (or missing) ends up as 0.
for col, mapping in ordinal_mappings.items():
    encoded = df[col].map(mapping)
    df[col] = encoded.fillna(0).astype(int)



In [None]:
# Ordinal-encode the cleaned TEST frame.
# The result is stored in `test_encoded`, not `df`: `df` holds the encoded
# training frame from the previous cell and is what the later cells
# (`train_df = df`, `train_df['SalePrice']`) rely on. Re-assigning `df` here
# replaced it with test data, which has no SalePrice column.

# Integer codes for the ordinal categoricals; a missing value maps to 0.
ordinal_mappings = {
    'ExterQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, np.nan: 0},
    'ExterCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, np.nan: 0},
    'BsmtQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, np.nan: 0},
    'BsmtCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, np.nan: 0},
    'BsmtExposure': {'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4, np.nan: 0},
    'BsmtFinType1': {'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6, np.nan: 0},
    'BsmtFinType2': {'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6, np.nan: 0},
    'HeatingQC': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, np.nan: 0},
    'KitchenQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, np.nan: 0},
    'Functional': {
        'Sal': 1,
        'Sev': 2,
        'Maj2': 3,
        'Maj1': 4,
        'Mod': 5,
        'Min2': 6,
        'Min1': 7,
        'Typ': 8,
        np.nan: 0
    },
    'FireplaceQu': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, np.nan: 0},
    'GarageFinish': {'Unf': 1, 'RFn': 2, 'Fin': 3, np.nan: 0},
    'GarageQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, np.nan: 0},
    'GarageCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, np.nan: 0},
    'PoolQC': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, np.nan: 0},
    'Fence': {'MnWw': 1, 'MnPrv': 2, 'GdWo': 3, 'GdPrv': 4, np.nan: 0}
}

# Work on a copy so the cleaned test frame stays untouched.
test_encoded = test_df_cleaned.copy()
# Any label that is absent from its mapping (or missing) ends up as 0.
for col, mapping in ordinal_mappings.items():
    test_encoded[col] = test_encoded[col].map(mapping).fillna(0).astype(int)



# List of nominal categorical variables
nominal_vars = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape',
    'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'Foundation', 'Heating', 'CentralAir',
    'Electrical', 'SaleType', 'SaleCondition', 'PavedDrive',
    'MiscFeature','MasVnrType','GarageType']

# Fill NaN values in the test frame's nominal variables with 'None' before one-hot encoding
test_encoded[nominal_vars] = test_encoded[nominal_vars].fillna('None')





In [None]:
# Unordered (nominal) categoricals that will be one-hot encoded.
nominal_vars = (
    'MSSubClass MSZoning Street Alley LotShape '
    'LandContour Utilities LotConfig LandSlope '
    'Neighborhood Condition1 Condition2 BldgType '
    'HouseStyle RoofStyle RoofMatl Exterior1st '
    'Exterior2nd Foundation Heating CentralAir '
    'Electrical SaleType SaleCondition PavedDrive '
    'MiscFeature MasVnrType GarageType'
).split()

# A missing nominal value becomes the explicit label 'None', so it gets its
# own dummy column instead of being lost.
nominal_block = df[nominal_vars]
df[nominal_vars] = nominal_block.fillna('None')



In [None]:
# Preview df before the nominal columns are one-hot encoded (done in the following cell).
df.head()

In [None]:
# One-hot encode the nominal variables; drop_first=True drops one level per
# variable to avoid the dummy variable trap.
df = pd.get_dummies(df, columns=nominal_vars, drop_first=True)


In [None]:
# Preview df after the nominal columns have been one-hot encoded.
df.head()

In [None]:
# Identify remaining object / category columns that are not yet encoded
remaining_categorical = df.select_dtypes(include=['object', 'category']).columns.tolist()

# If there are any remaining categorical columns, encode them
if remaining_categorical:
    print("Encoding the following remaining categorical columns:")
    for col in remaining_categorical:
        print(f"- {col}")

        # A Categorical column only accepts declared categories, so 'None'
        # must be registered before it can be used as a fill value. The
        # membership test keeps the cell re-runnable (add_categories raises
        # when the category already exists). isinstance on the dtype replaces
        # the deprecated pd.api.types.is_categorical_dtype.
        column_dtype = df[col].dtype
        if isinstance(column_dtype, pd.CategoricalDtype) and 'None' not in column_dtype.categories:
            df[col] = df[col].cat.add_categories(['None'])

    # Fill NaN values with 'None' to handle missing data
    df[remaining_categorical] = df[remaining_categorical].fillna('None')

    # One-hot encode the remaining categorical variables
    df = pd.get_dummies(df, columns=remaining_categorical, drop_first=True)
else:
    print("No remaining categorical columns to encode.")

In [None]:
# Modified Code to Display Unique Values of Encoded Categorical Variables

import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output

# Function to extract original categorical features and their unique categories from encoded DataFrame
def get_encoded_categories(df, nominal_vars):
    """
    Recover, for each nominal variable, the category suffixes of its dummy columns.

    A column belongs to variable ``var`` when its name starts with ``var`` followed
    by an underscore; the category is whatever follows that prefix.

    Parameters:
    - df (pd.DataFrame): The one-hot encoded DataFrame.
    - nominal_vars (list): Names of the variables that were one-hot encoded.

    Returns:
    - dict: variable name -> list of category suffixes, in column order
      (an empty list when no column matches).
    """
    column_names = list(df.columns)
    return {
        var: [name[len(var) + 1:] for name in column_names if name.startswith(var + "_")]
        for var in nominal_vars
    }

# Variables that were one-hot encoded. The last two are the engineered
# categoricals; the bucketed column is 'house_age_range' ('house_age' itself
# is numeric, and its prefix wrongly matched the 'house_age_range_*' dummies).
nominal_vars = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape',
    'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'Foundation', 'Heating', 'CentralAir',
    'Electrical', 'SaleType', 'SaleCondition', 'PavedDrive',
    'MiscFeature','MasVnrType','GarageType','house_age_range','mkt_cyc'
]

# Get the mapping of original categorical features to their categories
encoded_categories = get_encoded_categories(df, nominal_vars)

# Feature selector
encoded_dropdown = widgets.Dropdown(
    options=list(encoded_categories.keys()),
    description='Feature:',
    disabled=False,
)

def display_encoded_categories(change):
    """Dropdown callback: list the dummy columns generated for the selected feature."""
    feature = change['new']
    clear_output(wait=True)
    display(encoded_dropdown)

    categories = encoded_categories.get(feature, [])
    categories_df = pd.DataFrame({
        'Encoded Column': [f"{feature}_{cat}" for cat in categories],
        'Original Category': categories
    })

    display(categories_df)

# React to selection changes
encoded_dropdown.observe(display_encoded_categories, names='value')

# Display the dropdown menu
display(encoded_dropdown)

# Display the categories of the initially selected feature
if encoded_categories:
    initial_feature = list(encoded_categories.keys())[0]
    display_encoded_categories({'new': initial_feature})

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# From here on `train_df` refers to the fully encoded frame.
train_df = df

# Step 1: Compute the correlation matrix
correlation_matrix = train_df.corr()

# Step 2: Set the threshold for high correlation
threshold = 0.8

# Step 3: Keep |correlation| above the threshold, excluding only the diagonal.
# (Filtering on `abs() < 1` also hid perfectly collinear pairs of different
# features, which are exactly the ones that matter most here.)
off_diagonal = ~np.eye(len(correlation_matrix), dtype=bool)
strong = (correlation_matrix.abs() > threshold) & off_diagonal
high_corr_matrix = correlation_matrix.where(strong)

# Step 4: Drop rows and columns that are entirely NaN after filtering
high_corr_matrix = high_corr_matrix.dropna(how='all').dropna(axis=1, how='all')

# Step 5: Visualize the filtered correlation matrix (skipped when nothing qualifies)
if high_corr_matrix.empty:
    print(f"No feature pairs with |correlation| > {threshold}")
else:
    plt.figure(figsize=(12, 8))
    sns.heatmap(high_corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
    plt.title('Highly Correlated Features (|Correlation| > 0.8)')
    plt.show()

# Step 6: List each highly correlated pair once. The strict upper triangle
# keeps (A, B) and drops its mirror (B, A).
upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
high_corr_pairs = correlation_matrix.where(strong & upper_triangle).stack().reset_index()
high_corr_pairs.columns = ['Feature1', 'Feature2', 'Correlation']
high_corr_pairs = high_corr_pairs.sort_values(by='Correlation', ascending=False)

# Display the highly correlated pairs
print(high_corr_pairs)


In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(df):
    """
    Calculate the Variance Inflation Factor (VIF) of every column of ``df``.

    Parameters:
    - df (pd.DataFrame): Feature frame; every column must be convertible to
      float (numeric or boolean dummy columns).

    Returns:
    - pd.DataFrame: columns 'Feature' and 'VIF', sorted by VIF in descending order.
    """
    # Cast once to a float matrix: a frame mixing bool dummies with numeric
    # columns otherwise yields an object array from `.values`. Building it
    # here also avoids re-materialising the array for every column.
    design = df.astype(float).to_numpy()

    vif_data = pd.DataFrame()
    vif_data['Feature'] = df.columns
    vif_data['VIF'] = [
        variance_inflation_factor(design, i) for i in range(design.shape[1])
    ]
    return vif_data.sort_values(by='VIF', ascending=False)




In [None]:
# Define the target variable
y_train= train_df['SalePrice']
# index=False: without it the row index is written as an extra unnamed column,
# which comes back as a second column when the file is re-read.
y_train.to_csv('/Users/issacsmacbookpro/.cursor-tutor/projects/Housing Price Prediction/house-prices-advanced-regression-techniques/y_train.csv', index=False)

# Drop the target and a hand-picked set of columns from the features
# NOTE(review): 'Id' is still part of X_train at this point -- confirm whether
# the row identifier is meant to be used as a feature.
X_train= train_df.drop(columns=['SalePrice', 'TotalBsmtSF', 'GrLivArea','BldgType_Duplex','Exterior1st_CBlock','Exterior2nd_CBlock', 'MSSubClass_90'])


In [None]:
# VIF of every column, highest first.
# NOTE(review): train_df still contains SalePrice here, so the target takes
# part in the VIF computation -- confirm whether X_train was intended.
vif = calculate_vif(train_df).sort_values(by='VIF', ascending=False)

# Display VIF values
print(vif)

# Export VIF values to CSV once (the same file used to be written twice)
vif.to_csv('vif_values.csv', index=False)  # Specify the path if needed

# Drop a helper 'constant' column if one was added to the features
if 'constant' in X_train.columns:
    X_train = X_train.drop(columns=['constant'])



In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

def remove_high_vif_features(df, threshold=10, exclude_features=None):
    """
    Drop features one at a time until no removable feature has a VIF above ``threshold``.

    Each round recomputes every VIF, picks the highest-VIF feature that is not
    listed in ``exclude_features`` and drops it when its VIF exceeds the
    threshold. The loop ends when that candidate is at or below the threshold,
    or when every remaining feature is excluded.

    Parameters:
    - df (pd.DataFrame): Feature frame.
    - threshold (float): Largest VIF a removable feature may keep.
    - exclude_features (list | None): Features that must never be dropped.

    Returns:
    - pd.DataFrame: The frame without the dropped columns.
    """
    protected = [] if exclude_features is None else exclude_features

    iteration = 1
    while True:
        ranking = (
            calculate_vif(df)
            .sort_values('VIF', ascending=False)
            .reset_index(drop=True)
        )

        # Highest-ranked feature that may be removed
        removable = [name not in protected for name in ranking['Feature']]
        candidates = ranking[removable]
        if candidates.empty:
            # Only protected features are left
            break
        top = candidates.iloc[0]
        feature_to_remove = top['Feature']
        max_vif = top['VIF']

        print(f"Iteration {iteration}:")
        print(ranking.head(10))
        print(f"Max VIF: {max_vif} (Feature to remove: {feature_to_remove})\n")

        if not max_vif > threshold:
            print("All VIF values are below the threshold or only excluded features have high VIF.\n")
            break

        print(f"Removing '{feature_to_remove}' with VIF: {max_vif}\n")
        df = df.drop(columns=[feature_to_remove])
        iteration += 1
    return df







In [None]:
# 1) Columns whose label occurs more than once: keep the first occurrence.
duplicate_columns = X_train.columns[X_train.columns.duplicated()]
if len(duplicate_columns) == 0:
    print("No duplicate columns found.")
else:
    print(f"Duplicate columns found: {duplicate_columns.tolist()}")
    X_train = X_train.loc[:, ~X_train.columns.duplicated()]

# 2) Near-duplicates: a column is dropped when its absolute correlation with
#    an earlier column exceeds 0.9999 (strict upper triangle = earlier columns).
corr_matrix = X_train.corr().abs()
strictly_upper = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(strictly_upper)
to_drop = [name for name, hit in (upper > 0.9999).any().items() if hit]
if not to_drop:
    print("No near-duplicate columns found.")
else:
    print(f"Removing near-duplicate columns: {to_drop}")
    X_train = X_train.drop(columns=to_drop)

# 3) One-hot encode whatever categorical columns are still present.
X_train = pd.get_dummies(X_train, drop_first=True)
print(f"After one-hot encoding, number of features: {X_train.shape[1]}")

# 4) Iterative VIF pruning; YrSold and house_age are never removed.
X_train_reduced = remove_high_vif_features(
    X_train,
    threshold=10,
    exclude_features=['YrSold','house_age'],
)
print(f"Shape after VIF reduction: {X_train_reduced.shape}")


final_vif = calculate_vif(X_train_reduced)
print("Final VIF Scores:\n", final_vif)


In [None]:
# Get the columns of the original and reduced DataFrames
original_columns = X_train.columns
reduced_columns = X_train_reduced.columns

# Find the columns removed by the VIF reduction
removed_columns = set(original_columns) - set(reduced_columns)

# Display the removed columns
print("Columns removed from X_train to X_train_reduced:")
print(removed_columns)

# Absolute path: the leading '/' was missing, so the file was written relative
# to the working directory instead of the folder it is later read from.
X_train_reduced.to_csv('/Users/issacsmacbookpro/.cursor-tutor/projects/Housing Price Prediction/house-prices-advanced-regression-techniques/X_train_reduced.csv', index=False)

In [None]:
# Reload the persisted model inputs.
x_train=pd.read_csv('/Users/issacsmacbookpro/.cursor-tutor/projects/Housing Price Prediction/house-prices-advanced-regression-techniques/X_train_reduced.csv')
# Keep only the target column: y_train.csv may also carry the saved row index
# as an extra unnamed column, which must not become part of the target.
y_train=pd.read_csv('/Users/issacsmacbookpro/.cursor-tutor/projects/Housing Price Prediction/house-prices-advanced-regression-techniques/y_train.csv')['SalePrice']
test_df = pd.read_csv('/Users/issacsmacbookpro/.cursor-tutor/projects/Housing Price Prediction/house-prices-advanced-regression-techniques/test.csv')


## Using train_test_split to split the data into training and validation sets (80% train, 20% validation)

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets (80% train, 20% validation)
# NOTE(review): y_train is rebound to its training part here while x_train keeps
# the full feature frame, so re-running this cell without re-running the load
# cell passes inputs of different lengths -- consider distinct names.
X_train, X_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)


In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Initialize the XGBoost Regressor
# early_stopping_rounds is an estimator parameter: newer XGBoost releases no
# longer accept it as an argument of fit().
xgboost_model = xgb.XGBRegressor(
    n_estimators=1000,   # Number of trees (boosting rounds)
    learning_rate=0.05,  # Learning rate
    max_depth=6,         # Maximum depth of each tree
    subsample=0.8,       # Subsample ratio of the training instance
    colsample_bytree=0.8,  # Subsample ratio of columns when constructing each tree
    random_state=42,
    early_stopping_rounds=50,  # Stop if validation performance does not improve for 50 rounds
)

# Train the model
xgboost_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],  # Validation set monitored for early stopping
    verbose=True  # Print training progress
)


In [None]:
# Predict on the validation set
y_val_pred = xgboost_model.predict(X_val)

# Calculate RMSE as the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error is deprecated and absent from recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f"Validation RMSE: {rmse:.4f}")


In [None]:
# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import mean_squared_error
# import numpy as np

# # Define the models
# models = {
#     'XGBoost': xgb.XGBRegressor(),
#     'LightGBM': lgb.LGBMRegressor(),
#     'CatBoost': catboost.CatBoostRegressor(),
#     'Random Forest': RandomForestRegressor(),
#     'Ridge Regression': Ridge(),
#     'Lasso Regression': Lasso()
# }

# # Evaluate each model with cross-validation
# for name, model in models.items():
#     scores = cross_val_score(model, x_train, y_train, cv=5, scoring='neg_mean_squared_error')
#     rmse_scores = np.sqrt(-scores)
#     print(f"{name}: Mean RMSE = {rmse_scores.mean():.4f}, Std RMSE = {rmse_scores.std():.4f}")
