In [None]:
import sys
import warnings

# The project-local import below is `src.data_preprocessing`, which resolves only
# when the directory that CONTAINS `src/` is importable. The path setup therefore
# has to run before that import (it used to run after it, where it had no effect).
# '..' is assumed to be the project root, consistent with the '../data/...' paths
# used below; '../src' is kept for anything that imports the helpers un-prefixed.
sys.path.append('..')
sys.path.append('../src')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sweetviz as sv
from scipy.stats import chi2_contingency, mstats, stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

from src.data_preprocessing import (
    calculate_vif,
    check_missing_values,
    convert_to_numeric,
    detect_outliers_iqr,
    detect_outliers_zscore,
    misc_val_mapping,
)

# NOTE(review): this silences every warning, including pandas deprecation notices.
warnings.filterwarnings('ignore')

# Raw Kaggle-style splits. read_csv already returns a fresh frame, so no .copy() is needed.
test_data = pd.read_csv('../data/raw/test.csv')
train_data = pd.read_csv('../data/raw/train.csv')

In [None]:
# Print the column dtypes and non-null counts of the raw test set.
test_data.info()

In [None]:
# Project helper from src.data_preprocessing, run here on the raw test set.
# Presumably reports missing values per column — verify against the helper.
check_missing_values(test_data)

In [None]:
# Ordinal scale shared by all quality/condition columns: a better grade gets a
# larger number, and 'None' (feature absent) sits at the bottom with 0.
qual_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}

# Columns that are graded on the Ex/Gd/TA/Fa/Po scale.
categorical_columns = [
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
    'PoolQC', 'FireplaceQu', 'GarageQual', 'GarageCond', 'KitchenQual',
]

# Encode the training frame first, then the test frame, with the same mapping.
train_data, test_data = [
    convert_to_numeric(frame, categorical_columns, qual_mapping)
    for frame in (train_data, test_data)
]

# Eyeball the first rows of the encoded columns (train, then test).
for encoded_frame in (train_data, test_data):
    print(encoded_frame[categorical_columns].head())

In [None]:
## Handling missing values for test_data
#
# Every fill is written back with an explicit assignment. The previous
# `test_data[col].fillna(..., inplace=True)` form is chained assignment: it is
# deprecated in pandas 2.x (the warning is hidden by the global warnings filter)
# and does not modify `test_data` at all once Copy-on-Write is active.

# 1. Quality columns were already converted to numbers by convert_to_numeric in
#    the previous cell, so "feature absent" must be the numeric 0 (the value the
#    mapping assigns to 'None'), not the string 'None' — a string here would break
#    the arithmetic on these columns later in the notebook.
absent_quality_cols = ['PoolQC', 'FireplaceQu', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond']
test_data[absent_quality_cols] = test_data[absent_quality_cols].fillna(0)

# 2. Text columns where a missing value means the house does not have the feature.
absent_text_cols = ['MiscFeature', 'Alley', 'Fence',
                    'GarageType', 'GarageFinish',
                    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
test_data[absent_text_cols] = test_data[absent_text_cols].fillna('None')

# 3. Columns filled with their most frequent value.
mode_filled_cols = ['MasVnrType', 'Electrical', 'MSZoning', 'Utilities', 'Functional',
                    'Exterior1st', 'Exterior2nd', 'KitchenQual', 'SaleType']
for col in mode_filled_cols:
    test_data[col] = test_data[col].fillna(test_data[col].mode()[0])

# 4. LotFrontage: median of the same neighbourhood; if a neighbourhood has no
#    known frontage at all, fall back to the overall median so nothing stays NaN.
neighborhood_median_frontage = test_data.groupby('Neighborhood')['LotFrontage'].transform('median')
test_data['LotFrontage'] = (
    test_data['LotFrontage']
    .fillna(neighborhood_median_frontage)
    .fillna(test_data['LotFrontage'].median())
)

# 5. Numeric columns where missing means "none": no garage year, no veneer area.
zero_filled_cols = ['GarageYrBlt', 'MasVnrArea']
test_data[zero_filled_cols] = test_data[zero_filled_cols].fillna(0)

# 6. Remaining numeric columns are filled with the column mean.
numerical_cols = ['BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                  'TotalBsmtSF', 'GarageCars', 'GarageArea']
test_data[numerical_cols] = test_data[numerical_cols].fillna(test_data[numerical_cols].mean())

# Report how many missing cells are left and only claim success when there are none.
remaining_missing = int(test_data.isnull().sum().sum())
print(remaining_missing)
if remaining_missing == 0:
    print("Missing values in test_data have been successfully filled!")
else:
    print(f"Warning: {remaining_missing} missing values remain in test_data.")

check_missing_values(test_data)

In [None]:
# Feature engineering, part 1: neighbourhood price level and garage encodings.

# 1. Neighbourhood price level: mean SalePrice per neighbourhood, learned on the
#    training set and looked up for every test row.
neighborhood_avg_price = train_data.groupby('Neighborhood')['SalePrice'].mean()
test_data['Neighborhood_avg_price'] = test_data['Neighborhood'].map(neighborhood_avg_price)

# 2. GarageType -> integer code.
garage_type_mapping = {
    'Basment': 0,
    'CarPort': 1,
    '2Types': 2,
    'BuiltIn': 3,
    'Detchd': 4,
    'Attchd': 5,
}
# Any value that is not a key above becomes NaN here. That includes the
# no-garage placeholder, which is why no fill is applied before the lookup:
# a filled-in 0 is not a key either and would map to NaN just the same.
test_data['GarageType_num'] = test_data['GarageType'].map(garage_type_mapping)

# 3. GarageFinish -> integer code (unfinished < rough finished < finished).
garage_finish_mapping = {'Unf': 1, 'RFn': 2, 'Fin': 3}

# Anything outside the mapping (including missing / no garage) is scored 0.
# Train is encoded first, then test.
for frame in (train_data, test_data):
    frame['GarageFinish_num'] = frame['GarageFinish'].map(garage_finish_mapping).fillna(0)

# 4. GarageYrBlt: a missing garage build year is stored as 0.
test_data['GarageYrBlt'] = test_data['GarageYrBlt'].fillna(0)

# 5. Garage_Feature: weighted sum of six garage columns. The weights are picked by
#    an exhaustive grid search that maximises the Pearson correlation between the
#    weighted sum and SalePrice on the training set.

# Candidate weights per input column.
garage_cars_weights = np.arange(0.4, 0.61, 0.05)
garage_area_weights = np.arange(0.4, 0.61, 0.05)
garage_finish_weights = np.arange(0.1, 0.21, 0.05)
garage_yrblt_weights = np.arange(0.05, 0.11, 0.05)
garage_qual_weights = np.arange(0.05, 0.11, 0.05)
garage_cond_weights = np.arange(0.05, 0.11, 0.05)

# Input columns and the matching keys of `best_weights`, in the same order.
garage_inputs = ['GarageCars', 'GarageArea', 'GarageFinish_num',
                 'GarageYrBlt', 'GarageQual', 'GarageCond']
garage_weight_names = ['GarageCars_weight', 'GarageArea_weight', 'GarageFinish_weight',
                       'GarageYrBlt_weight', 'GarageQual_weight', 'GarageCond_weight']

# Every weight combination, one row each. indexing='ij' + reshape enumerates them
# in the same order as nested loops would (first column varies slowest), so ties
# are still resolved in favour of the first combination encountered.
garage_weight_grid = np.stack(
    np.meshgrid(garage_cars_weights, garage_area_weights, garage_finish_weights,
                garage_yrblt_weights, garage_qual_weights, garage_cond_weights,
                indexing='ij'),
    axis=-1,
).reshape(-1, len(garage_inputs))

# Training inputs with missing values set to 0, mirroring how the test set
# encodes "no garage" (GarageYrBlt = 0, quality = 0).
# NOTE(review): previously rows with NaN were silently dropped from the
# correlation instead; confirm 0-filling matches how the training features are built.
train_garage_matrix = train_data[garage_inputs].fillna(0).to_numpy(dtype=float)
train_sale_price = train_data['SalePrice'].to_numpy(dtype=float)

best_correlation = -np.inf
best_weights = {}

for candidate_weights in garage_weight_grid:
    # Weighted sum for this combination; computed locally so train_data is not
    # overwritten several hundred times.
    candidate_feature = train_garage_matrix @ candidate_weights
    correlation = np.corrcoef(candidate_feature, train_sale_price)[0, 1]
    if correlation > best_correlation:
        best_correlation = correlation
        best_weights = dict(zip(garage_weight_names, candidate_weights))

if not best_weights:
    raise ValueError("Garage grid search found no weight combination with a finite correlation.")

# Keep the winning feature on the training frame (it used to hold whatever the
# last grid combination produced, not the best one).
train_data['Garage_Feature'] = train_garage_matrix @ np.array(
    [best_weights[name] for name in garage_weight_names]
)

# Apply the winning weights to the test set.
test_data['Garage_Feature'] = sum(
    best_weights[name] * test_data[col]
    for name, col in zip(garage_weight_names, garage_inputs)
)

# 6. Living_Area_per_Room: above-ground living area (GrLivArea) divided by the
#    number of above-ground rooms (TotRmsAbvGrd).
test_data['Living_Area_per_Room'] = test_data['GrLivArea'] / test_data['TotRmsAbvGrd']

# 7. Mean-impute whatever is still missing in the engineered features so far
#    (e.g. GarageType_num is NaN for garage types its mapping does not cover).
#    Assigned back explicitly; a chained `inplace=True` fill is not reliable.
#    Garage_Capacity_per_Square_Meter is no longer built and filled here: it was
#    computed twice from GarageArea only to be overwritten by the definition below.
columns_to_fill = ['GarageType_num', 'GarageFinish_num', 'Garage_Feature', 'Living_Area_per_Room']
for col in columns_to_fill:
    test_data[col] = test_data[col].fillna(test_data[col].mean())

# 8. GrLivArea_per_Room: the same ratio as Living_Area_per_Room, kept under its
#    own column name because later cells transform both columns.
test_data['GrLivArea_per_Room'] = test_data['GrLivArea'] / test_data['TotRmsAbvGrd']

# Shared denominator of the three "*_to_TotalArea" style ratios below.
# NOTE(review): TotRmsAbvGrd is a room count added to two areas — confirm this
# is the intended "total area" before changing it (training features must match).
area_denominator = test_data['GrLivArea'] + test_data['TotRmsAbvGrd'] + test_data['1stFlrSF']

# 9. MasVnr_Area_to_TotalArea: masonry veneer area relative to the denominator above.
test_data['MasVnr_Area_to_TotalArea'] = test_data['MasVnrArea'] / area_denominator

# 10. Age_at_Remodel: years between construction and the recorded remodel.
test_data['Age_at_Remodel'] = test_data['YearRemodAdd'] - test_data['YearBuilt']

# 11. BsmtFinSF_to_TotalArea: type-1 finished basement area relative to the denominator.
test_data['BsmtFinSF_to_TotalArea'] = test_data['BsmtFinSF1'] / area_denominator

# 12. Lot_Frontage_to_Area: lot frontage relative to the denominator.
test_data['Lot_Frontage_to_Area'] = test_data['LotFrontage'] / area_denominator

# 13. TotalOutdoorArea: WoodDeckSF + 2ndFlrSF + OpenPorchSF.
# NOTE(review): 2ndFlrSF is included in an "outdoor" total — confirm intent.
test_data['TotalOutdoorArea'] = test_data['WoodDeckSF'] + test_data['2ndFlrSF'] + test_data['OpenPorchSF']

# 14. Overall_Quality: sum of OverallQual and the encoded exterior, basement and
#     kitchen quality grades.
test_data['Overall_Quality'] = (
    test_data['OverallQual'] + test_data['ExterQual'] + test_data['BsmtQual'] + test_data['KitchenQual']
)

# 15. Garage_Capacity_per_Square_Meter: garage car capacity per unit of
#     above-ground living area (GarageCars / GrLivArea).
test_data['Garage_Capacity_per_Square_Meter'] = test_data['GarageCars'] / test_data['GrLivArea']

# 16. FullBath_to_Bedrooms: full baths divided by TotRmsAbvGrd.
# NOTE(review): despite the name, the divisor is total rooms, not bedrooms.
test_data['FullBath_to_Bedrooms'] = test_data['FullBath'] / test_data['TotRmsAbvGrd']

# 17. Fireplace_Impact: number of fireplaces times the encoded fireplace quality.
test_data['Fireplace_Impact'] = test_data['Fireplaces'] * test_data['FireplaceQu']

# 18. BsmtQual_to_BsmtFinSF: encoded basement quality times BsmtFinSF1
#     (a product, despite the "to" in the name).
test_data['BsmtQual_to_BsmtFinSF'] = test_data['BsmtQual'] * test_data['BsmtFinSF1']

# 19. Overall_Quality_Impact: Overall_Quality scaled by the living area.
test_data['Overall_Quality_Impact'] = test_data['Overall_Quality'] * test_data['GrLivArea']

# Ordinal scores and interaction features.
# In every `.map(...)` below, a value that is not a key of the mapping becomes NaN.

# BsmtExposure score: larger number = more exposure.
# The 'None' placeholder filled in for missing BsmtExposure is not a key here.
bsmt_exposure_mapping = {'No': 0, 'Mn': 1, 'Av': 2, 'Gd': 3}
test_data['BsmtExposure_Score'] = test_data['BsmtExposure'].map(bsmt_exposure_mapping)

# BsmtExposure x BsmtFinType1 interaction (finish type scored 5 = GLQ ... 0 = Unf).
bsmt_fin_type1_scores = {'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 2, 'LwQ': 1, 'Unf': 0}
test_data['BsmtExposure_BsmtFinType1_Interaction'] = (
    test_data['BsmtExposure_Score'] * test_data['BsmtFinType1'].map(bsmt_fin_type1_scores)
)

# Functional score: 5 = typical, decreasing with the severity of deductions.
functional_mapping = {'Typ': 5, 'Min1': 4, 'Min2': 3, 'Mod': 2, 'Maj1': 1, 'Maj2': 0}
test_data['Functional_Score'] = test_data['Functional'].map(functional_mapping)

# Functional score x OverallQual interaction.
test_data['Functional_OverallQuality_Interaction'] = (
    test_data['Functional_Score'] * test_data['OverallQual']
)

# FireplaceQu is already numeric at this point; the score is a plain copy of it.
test_data['FireplaceQu_Score'] = test_data['FireplaceQu']

# FireplaceQu x OverallQual interaction.
test_data['FireplaceQu_OverallQuality_Interaction'] = (
    test_data['FireplaceQu'] * test_data['OverallQual']
)

# PavedDrive score: 2 = paved, 1 = partially paved, 0 = not paved.
paved_drive_mapping = {'Y': 2, 'P': 1, 'N': 0}
test_data['PavedDrive_Score'] = test_data['PavedDrive'].map(paved_drive_mapping)

# PavedDrive x LotFrontage interaction.
test_data['PavedDrive_LotFrontage_Interaction'] = (
    test_data['PavedDrive_Score'] * test_data['LotFrontage']
)

# PavedDrive x GarageType interaction. This uses its own garage-type scores,
# separate from GarageType_num.
garage_type_interaction_scores = {
    '2Types': 5, 'Attchd': 4, 'Detchd': 3, 'Basment': 2, 'BuiltIn': 1, 'None': 0
}
test_data['PavedDrive_GarageType_Interaction'] = (
    test_data['PavedDrive_Score'] * test_data['GarageType'].map(garage_type_interaction_scores)
)

# SaleType score.
saletype_mapping = {'WD': 5, 'New': 4, 'COD': 3, 'ConLD': 2, 'ConLI': 1, 'CWD': 0}
test_data['SaleType_Score'] = test_data['SaleType'].map(saletype_mapping)

# SaleCondition score.
salecondition_mapping = {
    'Normal': 5, 'Abnorml': 4, 'AdjLand': 3, 'Alloca': 2, 'Family': 1, 'Partial': 0
}
test_data['SaleCondition_Score'] = test_data['SaleCondition'].map(salecondition_mapping)

# SaleCondition x SaleType interaction.
test_data['SaleCondition_SaleType_Interaction'] = (
    test_data['SaleCondition_Score'] * test_data['SaleType_Score']
)

# Price-level encodings: mean training SalePrice per category, looked up for test rows.

# CentralAir and Electrical.
central_air_avg_price = train_data.groupby('CentralAir')['SalePrice'].mean()
electrical_avg_price = train_data.groupby('Electrical')['SalePrice'].mean()
test_data['CentralAir_AvgPrice'] = test_data['CentralAir'].map(central_air_avg_price)
test_data['Electrical_AvgPrice'] = test_data['Electrical'].map(electrical_avg_price)

# Displaying the result to verify
print(test_data[['CentralAir_AvgPrice', 'Electrical_AvgPrice']].head())

# CentralAir as a 0/1 flag.
test_data['CentralAir_numerical'] = test_data['CentralAir'].map({'Y': 1, 'N': 0})

# Electrical system score.
electrical_mapping = {
    'SBrkr': 5,
    'FuseA': 4,
    'FuseF': 3,
    'FuseP': 2,
    'Mix': 1
}
test_data['Electrical_numerical'] = test_data['Electrical'].map(electrical_mapping)

# Fence price level. Missing Fence in the test set was filled with 'None', so
# the training averages must be grouped with the same placeholder; otherwise
# rows without a fence find no matching key and get NaN.
fence_price = train_data.groupby(train_data['Fence'].fillna('None'))['SalePrice'].mean()
test_data['Fence_AvgPrice'] = test_data['Fence'].map(fence_price)

# Fence score (hand-assigned integer per fence category).
fence_mapping = {
    'None': 2,
    'GdPrv': 1,
    'MnPrv': 3,
    'GdWo': 4,
    'MnWw': 5
}
test_data['Fence_Points'] = test_data['Fence'].map(fence_mapping)

# Frequency of values in the 'MiscFeature' column.
misc_feature_counts = test_data['MiscFeature'].value_counts()

# MiscFeature price level, grouped with the same 'None' placeholder as the test set.
misc_feature_price = train_data.groupby(train_data['MiscFeature'].fillna('None'))['SalePrice'].mean()

# Price level per MiscVal amount. Test amounts never seen in training map to NaN.
misc_val_price = train_data.groupby('MiscVal')['SalePrice'].mean()

test_data['MiscFeature_AvgPrice'] = test_data['MiscFeature'].map(misc_feature_price)
test_data['MiscVal_AvgPrice'] = test_data['MiscVal'].map(misc_val_price)

# MiscVal score from the project helper.
test_data['MiscVal_Points'] = test_data['MiscVal'].apply(misc_val_mapping)

# MiscFeature score.
# NOTE(review): this step was missing — 'MiscFeature_Points' was read below
# without ever being created, so the cell raised KeyError on a fresh kernel.
# As a stand-in, categories are ranked by their mean training SalePrice
# (1 = highest). Replace this with the mapping used to build the training
# features if one exists, so train and test stay consistent.
misc_feature_points = misc_feature_price.rank(ascending=False, method='dense').astype(int)
test_data['MiscFeature_Points'] = test_data['MiscFeature'].map(misc_feature_points)

# Combine the MiscFeature and MiscVal scores into a single feature.
test_data['Combined_Misc_Points'] = test_data['MiscFeature_Points'] + test_data['MiscVal_Points']

# Print a success message after all feature engineering steps are completed
print("All feature engineering steps have been successfully completed!")

In [None]:
# Features to check for outliers
features_to_check = ['GrLivArea', 'TotRmsAbvGrd', 'GarageArea', '1stFlrSF', 'TotalBsmtSF']

# Outlier detection with the two project helpers (IQR and z-score, going by
# their names — see src.data_preprocessing for the exact rules).
outliers_iqr_test = detect_outliers_iqr(test_data, features_to_check)
outliers_zscore_test = detect_outliers_zscore(test_data, features_to_check)

# VIF analysis on the continuous features; a constant column is added first
# before the frame is handed to the project helper.
continuous_features = ['GrLivArea', 'TotRmsAbvGrd', 'GarageArea', '1stFlrSF', 'TotalBsmtSF',
                       'Overall_Quality', 'YearBuilt']
X_test = test_data[continuous_features]
X_const_test = add_constant(X_test)
vif_results_test = calculate_vif(X_const_test, continuous_features)

# Report outlier and VIF results
print("Outliers (IQR method) in Test Data:")
print(outliers_iqr_test)
print("\nOutliers (Z-score method) in Test Data:")
print(outliers_zscore_test)

print("\nVIF Results in Test Data:")
print(vif_results_test)

# VIF bar plot, largest VIF first
sns.barplot(x="VIF", y="features", data=vif_results_test.sort_values(by="VIF", ascending=False))
plt.title("Variance Inflation Factor (VIF) for Features in Test Data")
plt.show()

# 1. Log-transform the new features with log1p, i.e. log(1 + x).
#    Caution: this overwrites the columns, so re-running the cell applies the
#    transform again on already-transformed values.
new_log_transform_columns = ['GrLivArea_per_Room', 'Living_Area_per_Room', 'Neighborhood_avg_price']

for col in new_log_transform_columns:
    test_data[col] = np.log1p(test_data[col])

# 2. Winsorize the same features: clip the lowest 1% and the highest 1% of values.
new_winsorize_columns = ['GrLivArea_per_Room', 'Living_Area_per_Room', 'Neighborhood_avg_price']

for col in new_winsorize_columns:
    test_data[col] = mstats.winsorize(test_data[col], limits=[0.01, 0.01])

# Summary statistics of the transformed features. The two lists overlap, so
# they are de-duplicated (order kept) to avoid describing every column twice.
transformed_columns = list(dict.fromkeys(new_log_transform_columns + new_winsorize_columns))
print("Summary statistics of newly transformed and winsorized features in Test Data:")
print(test_data[transformed_columns].describe())


In [None]:
# Value counts of a handful of categorical / encoded columns, one report per column.
categorical_columns = ['ExterQual', 'FireplaceQu', 'GarageFinish', 'BsmtQual',
                       'KitchenQual', 'PavedDrive', 'Foundation']
for column_name in categorical_columns:
    level_counts = test_data[column_name].value_counts()
    # Header, the counts, then two blank lines as a separator before the next report.
    print(f"Distribution of {column_name}:\n{level_counts}\n\n")

In [None]:
# Destination of the processed test set.
processed_data_path = '../data/processed/processed_test_data.csv'

# Put the columns in alphabetical order, then write the frame without the row index.
alphabetical_columns = sorted(test_data.columns)
test_data = test_data.loc[:, alphabetical_columns]
test_data.to_csv(processed_data_path, index=False)

# Tell the reader where the file went.
print(f"Processed test data with sorted columns has been saved to: {processed_data_path}")