In [None]:
import pandas as pd
import numpy as np
import sweetviz as sv
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency, mstats, stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import sys

# The helper functions are imported from the project's `src` package. The module search
# path has to be extended BEFORE that import executes; appending afterwards has no effect
# on it. `from src.… import …` needs the directory that CONTAINS `src` (the repository
# root, '..') on the path; '../src' is kept as well so plain `import data_preprocessing`
# style imports keep working. The membership test keeps re-runs from growing sys.path.
for _extra_path in ('..', '../src'):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

from src.data_preprocessing import convert_to_numeric, check_missing_values, misc_val_mapping, detect_outliers_iqr, detect_outliers_zscore, calculate_vif

# Silence all warnings to keep cell output compact.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn — consider narrowing.
warnings.filterwarnings('ignore')

# Keep the raw frame untouched and do all cleaning / feature engineering on a copy.
train_data_raw = pd.read_csv('../data/raw/train.csv')
train_data = train_data_raw.copy()

In [None]:
# Build a Sweetviz profile of the data as loaded (before any cleaning below)
# and write it out as an HTML report.
report = sv.analyze(source=train_data)
report.show_html(filepath="../reports/sweetviz_eda_report.html")

In [None]:
# Column names, non-null counts and dtypes of the working frame.
pd.DataFrame.info(train_data)

In [None]:
# OverallQual is kept as a plain integer column (a dtype cast only; no re-encoding happens here).
train_data = train_data.astype({'OverallQual': int})

# Quick look at the converted column
print(train_data.loc[:, ['OverallQual']].head())

In [None]:
# Per-column count of missing entries, restricted to columns that have at least one
missing_values = train_data.isnull().sum().loc[lambda counts: counts > 0]

# The same counts expressed as a percentage of all rows
missing_percentage = (missing_values / len(train_data)) * 100

# Side-by-side table (count + percentage), worst-affected columns first
missing_df = pd.concat(
    [missing_values, missing_percentage],
    axis=1,
    keys=['Missing Values', 'Percentage'],
).sort_values(by='Percentage', ascending=False)

print(missing_df)

In [None]:
# Missing-value imputation.
#
# All fills are written back with `train_data[col] = ...` instead of
# `train_data[col].fillna(..., inplace=True)`: the in-place form operates on a column
# selection (chained assignment), which is deprecated in pandas 2.x and does not update
# the parent frame once Copy-on-Write is active.

# 1. Categorical features where a missing entry means "the house does not have this
#    feature" (no pool, no alley access, no fence, no fireplace, no garage, no basement,
#    no masonry veneer, no miscellaneous feature) -> explicit 'None' category.
#    MasVnrType is filled with 'None' rather than the column mode: a missing veneer type
#    is paired with MasVnrArea = 0 below, and depending on the pandas version the literal
#    string 'None' in the CSV may itself be parsed as NaN, in which case the mode would be
#    a real veneer type and would be assigned to houses without veneer.
garage_cols = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
bsmt_cols = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
absent_feature_cols = (
    ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu']
    + garage_cols
    + bsmt_cols
)
train_data[absent_feature_cols] = train_data[absent_feature_cols].fillna('None')

# 2. Numeric features where "missing" means "not present" -> 0
train_data['GarageYrBlt'] = train_data['GarageYrBlt'].fillna(0)
train_data['MasVnrArea'] = train_data['MasVnrArea'].fillna(0)

# 3. LotFrontage -> median of the same neighborhood; if a neighborhood has no observed
#    value at all, fall back to the overall median so no NaN is left behind.
neighborhood_median_frontage = train_data.groupby('Neighborhood')['LotFrontage'].transform('median')
train_data['LotFrontage'] = (
    train_data['LotFrontage']
    .fillna(neighborhood_median_frontage)
    .fillna(train_data['LotFrontage'].median())
)

# 4. Electrical -> most frequent category
train_data['Electrical'] = train_data['Electrical'].fillna(train_data['Electrical'].mode()[0])

# Report how many missing values are left and only claim success when there are none
remaining_missing = train_data.isnull().sum().sum()
print(remaining_missing)

if remaining_missing == 0:
    print("Missing values have been successfully filled!")
else:
    still_missing = train_data.columns[train_data.isnull().any()].tolist()
    print(f"Columns that still contain missing values: {still_missing}")

In [None]:
# Ordinal scale shared by all quality/condition columns: Ex=5 ... Po=1,
# with 'None' (feature absent) mapped to 0.
qual_mapping = dict(zip(['Ex', 'Gd', 'TA', 'Fa', 'Po', 'None'], range(5, -1, -1)))

# Columns that use that scale
categorical_columns = [
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
    'PoolQC', 'FireplaceQu', 'GarageQual', 'GarageCond', 'KitchenQual',
]

# Project helper from src.data_preprocessing; the returned frame replaces train_data
train_data = convert_to_numeric(train_data, categorical_columns, qual_mapping)

# Preview the converted columns
train_data.loc[:, categorical_columns].head()

In [None]:
# Mean SalePrice per neighborhood, most expensive first
neighborhood_avg_price = (
    train_data
    .groupby('Neighborhood')['SalePrice']
    .mean()
    .sort_values(ascending=False)
)

print("Average House Prices by Neighborhood:")
print(neighborhood_avg_price)

# Bar chart of the same series; x labels are rotated so the neighborhood names stay readable
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    x=neighborhood_avg_price.index,
    y=neighborhood_avg_price.values,
    palette='viridis',
    ax=ax,
)
ax.set_title('Average House Prices by Neighborhood')
ax.set_xlabel('Neighborhood')
ax.set_ylabel('Average SalePrice')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Mean SalePrice of each neighborhood (index: Neighborhood, unsorted)
neighborhood_avg_price = train_data['SalePrice'].groupby(train_data['Neighborhood']).mean()

# Attach that mean to every row as a new feature.
# NOTE(review): the feature is derived from the target on the same rows it is attached to;
# if it is used for modelling, confirm this does not leak SalePrice into validation folds.
train_data['Neighborhood_avg_price'] = train_data['Neighborhood'].map(neighborhood_avg_price)

In [None]:
# ---------------------------------------------------------------------------
# Garage features: ordinal encodings plus one weighted composite feature.
# ---------------------------------------------------------------------------

# 1. Rough association check: correlate the category codes of GarageType / GarageFinish
#    (and of SalePrice, which is also converted to codes here) with each other.
garage_columns = ['GarageType', 'GarageFinish', 'SalePrice']
garage_code_correlation = (
    train_data[garage_columns]
    .apply(lambda x: x.astype('category').cat.codes)
    .corr()
)

print("Correlation of GarageType and GarageFinish with SalePrice:")
print(garage_code_correlation.loc['SalePrice', ['GarageType', 'GarageFinish']])

# 2. Ordinal encoding of GarageType.
#    Houses without a garage carry the category 'None' after imputation. The mapping now
#    covers that category explicitly (-1) so those rows no longer become NaN and are not
#    later zero-filled into the same code as 'Basment'. Codes of real garage types are unchanged.
garage_type_mapping = {
    'None': -1,      # No garage
    'Basment': 0,    # Basement garage
    'CarPort': 1,    # Car port
    '2Types': 2,     # More than one type of garage
    'BuiltIn': 3,    # Built-in garage
    'Detchd': 4,     # Detached garage
    'Attchd': 5      # Attached garage
}
train_data['GarageType_num'] = train_data['GarageType'].fillna('None').map(garage_type_mapping)

# 3. Ordinal encoding of GarageFinish; "no garage" maps to 0 as the cell always intended.
garage_finish_mapping = {
    'None': 0,  # No garage
    'Unf': 1,   # Unfinished
    'RFn': 2,   # Rough finished
    'Fin': 3    # Finished
}
train_data['GarageFinish_num'] = train_data['GarageFinish'].fillna('None').map(garage_finish_mapping)

print("\nFirst 5 rows with GarageType and GarageFinish features:")
print(train_data[['GarageType', 'GarageFinish', 'GarageType_num', 'GarageFinish_num']].head())

# 4. GarageYrBlt: 0 stands for "no garage" (harmless if it was already filled earlier)
train_data['GarageYrBlt'] = train_data['GarageYrBlt'].fillna(0)

print("\nNull values in GarageCars, GarageArea, and GarageYrBlt:")
print(train_data[['GarageCars', 'GarageArea', 'GarageYrBlt']].isnull().sum())

# 5. Individual correlations of the garage features with SalePrice
correlation_features = ['GarageCars', 'GarageArea', 'GarageYrBlt', 'GarageType_num', 'GarageFinish_num', 'GarageQual', 'GarageCond']
correlations = train_data[correlation_features].corrwith(train_data['SalePrice'])

print("\nGarage Features Correlation with SalePrice:")
print(correlations)

# 6. Grid search over the weights of a linear combination of garage features, keeping
#    the combination whose weighted sum correlates best with SalePrice.
#    Each entry: (input column, key used in best_weights, candidate weights).
#    NOTE(review): the inputs are on very different scales (e.g. GarageArea vs GarageCars),
#    so the raw weights are not comparable importances — consider standardising first.
garage_weight_spec = [
    ('GarageCars', 'GarageCars_weight', np.arange(0.4, 0.61, 0.05)),        # 0.40 .. 0.60
    ('GarageArea', 'GarageArea_weight', np.arange(0.4, 0.61, 0.05)),        # 0.40 .. 0.60
    ('GarageFinish_num', 'GarageFinish_weight', np.arange(0.1, 0.21, 0.05)),  # 0.10 .. 0.20
    ('GarageYrBlt', 'GarageYrBlt_weight', np.arange(0.05, 0.11, 0.05)),     # 0.05, 0.10
    ('GarageQual', 'GarageQual_weight', np.arange(0.05, 0.11, 0.05)),       # 0.05, 0.10
    ('GarageCond', 'GarageCond_weight', np.arange(0.05, 0.11, 0.05)),       # 0.05, 0.10
]
garage_feature_cols = [column for column, _, _ in garage_weight_spec]
garage_weight_labels = [label for _, label, _ in garage_weight_spec]

# All weight combinations as rows of a (n_combinations, n_features) array. indexing='ij'
# keeps the same enumeration order as nested loops over the grids in the order listed.
weight_combinations = np.stack(
    np.meshgrid(*[grid for _, _, grid in garage_weight_spec], indexing='ij'),
    axis=-1,
).reshape(-1, len(garage_weight_spec))

# Evaluate every combination at once: one candidate feature per column. This avoids
# rewriting a train_data column on every iteration of the search.
garage_inputs = train_data[garage_feature_cols].astype(float)
candidate_features = pd.DataFrame(
    garage_inputs.to_numpy() @ weight_combinations.T,
    index=train_data.index,
)
candidate_correlations = candidate_features.corrwith(train_data['SalePrice'])

best_combination = candidate_correlations.idxmax()  # first maximum, like a strict '>' scan
best_correlation = float(candidate_correlations.loc[best_combination])
best_weight_vector = weight_combinations[best_combination]
best_weights = {
    label: float(weight) for label, weight in zip(garage_weight_labels, best_weight_vector)
}

print(f"Best correlation: {best_correlation}")
print("Best weights:", best_weights)

# 7. Build the final Garage_Feature from the weights the search actually selected
#    (instead of hard-coded numbers that can drift away from the search result).
train_data['Garage_Feature'] = garage_inputs.to_numpy() @ best_weight_vector

garage_feature_correlation = train_data['Garage_Feature'].corr(train_data['SalePrice'])

print(f"\nGarage_Feature Correlation with SalePrice: {garage_feature_correlation}")


In [None]:
# Show the four columns used for the ratio features that follow
train_data.loc[:, ['GrLivArea', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea']]

In [None]:
# Missing-value count for each of the four ratio inputs
train_data.loc[:, ['GrLivArea', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea']].isna().sum()

In [None]:
# 1. Living_Area_per_Room feature: above-ground living area divided by rooms above ground
train_data['Living_Area_per_Room'] = train_data['GrLivArea'].div(train_data['TotRmsAbvGrd'])

# 2. Garage_Capacity_per_Square_Meter feature: garage car capacity divided by garage area
#    (rows where both values are 0 evaluate to NaN)
train_data['Garage_Capacity_per_Square_Meter'] = train_data['GarageCars'].div(train_data['GarageArea'])

In [None]:
# Engineered columns in which any remaining NaN is replaced by 0
columns_to_fill = [
    'GarageType_num',
    'GarageFinish_num',
    'Garage_Feature',
    'Living_Area_per_Room',
    'Garage_Capacity_per_Square_Meter',
]

for column_name in columns_to_fill:
    train_data[column_name] = train_data[column_name].fillna(0)

# Verify: every count below should be 0
train_data.loc[:, columns_to_fill].isna().sum()


In [None]:
# Frequency of each encoded garage type
pd.Series.value_counts(train_data['GarageType_num'])

In [None]:
# Engineered ratio / interaction features.
#
# Changes against the previous version of this cell (resulting columns are identical):
#   * 'Garage_Capacity_per_Square_Meter' used to be assigned twice with two different
#     formulas; only the last assignment (GarageCars / GrLivArea) ever survived, so the
#     overwritten one was removed.
#   * the denominator shared by three ratio features is computed once.

gr_liv_area = train_data['GrLivArea']
rooms_above_ground = train_data['TotRmsAbvGrd']

# Denominator shared by the three "*_to_*Area" ratios below.
# NOTE(review): this adds a room COUNT (TotRmsAbvGrd) to two areas — confirm whether a
# different column was intended as the third term.
area_denominator = gr_liv_area + rooms_above_ground + train_data['1stFlrSF']

# 1. Living_Area_per_Room: above-ground living area per above-ground room
train_data['Living_Area_per_Room'] = gr_liv_area / rooms_above_ground

# 2. GrLivArea_per_Room: same formula as Living_Area_per_Room; both names are kept
#    because later cells reference both columns.
train_data['GrLivArea_per_Room'] = gr_liv_area / rooms_above_ground

# 3. MasVnr_Area_to_TotalArea: masonry veneer area relative to the shared denominator
train_data['MasVnr_Area_to_TotalArea'] = train_data['MasVnrArea'] / area_denominator

# 4. Age_at_Remodel: years between construction and the recorded remodel year
train_data['Age_at_Remodel'] = train_data['YearRemodAdd'] - train_data['YearBuilt']

# 5. BsmtFinSF_to_TotalArea: BsmtFinSF1 relative to the shared denominator
train_data['BsmtFinSF_to_TotalArea'] = train_data['BsmtFinSF1'] / area_denominator

# 6. Lot_Frontage_to_Area: lot frontage relative to the shared denominator
train_data['Lot_Frontage_to_Area'] = train_data['LotFrontage'] / area_denominator

# 7. TotalOutdoorArea: WoodDeckSF + 2ndFlrSF + OpenPorchSF
# NOTE(review): 2ndFlrSF is an indoor floor area — confirm it belongs in an "outdoor" total.
train_data['TotalOutdoorArea'] = train_data['WoodDeckSF'] + train_data['2ndFlrSF'] + train_data['OpenPorchSF']

# 8. Overall_Quality: sum of OverallQual and the numeric ExterQual / BsmtQual / KitchenQual scores
train_data['Overall_Quality'] = (
    train_data['OverallQual']
    + train_data['ExterQual']
    + train_data['BsmtQual']
    + train_data['KitchenQual']
)

# 9. Garage_Capacity_per_Square_Meter: garage car capacity relative to the living area.
#    This replaces any earlier GarageCars / GarageArea definition of the same column.
train_data['Garage_Capacity_per_Square_Meter'] = train_data['GarageCars'] / gr_liv_area

# 10. FullBath_to_Bedrooms: full baths divided by TotRmsAbvGrd.
# NOTE(review): the name says "bedrooms" but the divisor is the total above-ground room count.
train_data['FullBath_to_Bedrooms'] = train_data['FullBath'] / rooms_above_ground

# 11. Fireplace_Impact: number of fireplaces times the numeric fireplace quality
train_data['Fireplace_Impact'] = train_data['Fireplaces'] * train_data['FireplaceQu']

# 12. BsmtQual_to_BsmtFinSF: basement quality TIMES finished basement area
#     (a product/interaction, despite the "_to_" in the name)
train_data['BsmtQual_to_BsmtFinSF'] = train_data['BsmtQual'] * train_data['BsmtFinSF1']

# 13. Overall_Quality_Impact: summed quality score times the living area
train_data['Overall_Quality_Impact'] = train_data['Overall_Quality'] * gr_liv_area

In [None]:
# Ordinal scores for several categorical columns plus pairwise interaction features.
# `Series.map` yields NaN for every category that is missing from its mapping, so each
# mapping has to cover all categories present in the data.

# 1. BsmtExposure score (higher = more exposure). 'None' (no basement, produced by the
#    imputation step) is scored 0 so those rows no longer turn into NaN.
bsmt_exposure_mapping = {
    'None': 0,      # No basement
    'No': 0,        # No exposure
    'Mn': 1,        # Minimum exposure
    'Av': 2,        # Average exposure
    'Gd': 3         # Good exposure
}
train_data['BsmtExposure_Score'] = train_data['BsmtExposure'].map(bsmt_exposure_mapping)

# 2. BsmtExposure x BsmtFinType1 interaction ('None' = no basement -> 0)
bsmt_fin_type_mapping = {
    'GLQ': 5,
    'ALQ': 4,
    'BLQ': 3,
    'Rec': 2,
    'LwQ': 1,
    'Unf': 0,
    'None': 0
}
train_data['BsmtExposure_BsmtFinType1_Interaction'] = train_data['BsmtExposure_Score'] * train_data['BsmtFinType1'].map(bsmt_fin_type_mapping)

# 3. Functional score (higher = fewer deductions)
functional_mapping = {
    'Typ': 5,      # Typical
    'Min1': 4,     # Minor Deductions
    'Min2': 3,     # More Deductions
    'Mod': 2,      # Moderate Deductions
    'Maj1': 1,     # Major Deductions
    'Maj2': 0      # Severe Deductions
}
train_data['Functional_Score'] = train_data['Functional'].map(functional_mapping)

# 4. Functional score x OverallQual
train_data['Functional_OverallQuality_Interaction'] = train_data['Functional_Score'] * train_data['OverallQual']

# 5. FireplaceQu is used as-is (it is multiplied with numbers elsewhere in the notebook,
#    so it is expected to be numeric already at this point)
train_data['FireplaceQu_Score'] = train_data['FireplaceQu']
train_data['FireplaceQu_OverallQuality_Interaction'] = train_data['FireplaceQu'] * train_data['OverallQual']

# 6. PavedDrive score (higher = more paved)
paved_drive_mapping = {
    'Y': 2,       # Fully paved
    'P': 1,       # Partially paved
    'N': 0        # Not paved
}
train_data['PavedDrive_Score'] = train_data['PavedDrive'].map(paved_drive_mapping)

# 7. PavedDrive x LotFrontage
train_data['PavedDrive_LotFrontage_Interaction'] = train_data['PavedDrive_Score'] * train_data['LotFrontage']

# 8. PavedDrive x GarageType
# NOTE(review): this mapping has no entry for 'CarPort' (which the garage cell's mapping
# lists), so any such rows become NaN here; its ranks also differ from that mapping —
# decide on one scale.
garage_type_mapping = {
    '2Types': 5,
    'Attchd': 4,
    'Detchd': 3,
    'Basment': 2,
    'BuiltIn': 1,
    'None': 0
}
train_data['PavedDrive_GarageType_Interaction'] = train_data['PavedDrive_Score'] * train_data['GarageType'].map(garage_type_mapping)

# 9. SaleType score
# NOTE(review): only the six sale types listed here get a score; any other SaleType value
# in the data becomes NaN — verify coverage against train_data['SaleType'].unique().
saletype_mapping = {
    'WD': 5,
    'New': 4,
    'COD': 3,
    'ConLD': 2,
    'ConLI': 1,
    'CWD': 0
}
train_data['SaleType_Score'] = train_data['SaleType'].map(saletype_mapping)

# 10. SaleCondition score
salecondition_mapping = {
    'Normal': 5,
    'Abnorml': 4,
    'AdjLand': 3,
    'Alloca': 2,
    'Family': 1,
    'Partial': 0
}
train_data['SaleCondition_Score'] = train_data['SaleCondition'].map(salecondition_mapping)

# 11. SaleCondition x SaleType
train_data['SaleCondition_SaleType_Interaction'] = train_data['SaleCondition_Score'] * train_data['SaleType_Score']

# 12. Heating score
heating_mapping = {
    'GasA': 6,
    'GasW': 5,
    'Grav': 4,
    'Wall': 3,
    'OthW': 2,
    'Floor': 1
}
train_data['Heating_Score'] = train_data['Heating'].map(heating_mapping)

# 13./14. Heating x ExterCond and Heating x ExterQual.
# ExterCond / ExterQual went through convert_to_numeric earlier in the notebook and are
# added to integer columns elsewhere, i.e. they are numeric by now. Mapping a numeric column
# with string keys returns NaN for every row, which made both interaction columns entirely
# NaN. `replace` converts any remaining string labels and leaves numeric values untouched,
# so the scores are correct in either state; values that are neither become NaN.
exter_cond_mapping = {
    'Ex': 5,
    'Gd': 4,
    'TA': 3,
    'Fa': 2,
    'Po': 1
}
exter_cond_score = pd.to_numeric(train_data['ExterCond'].replace(exter_cond_mapping), errors='coerce')
train_data['Heating_ExterCond_Interaction'] = train_data['Heating_Score'] * exter_cond_score

exter_qual_mapping = {
    'Ex': 5,
    'Gd': 4,
    'TA': 3,
    'Fa': 2,
    'Po': 1
}
exter_qual_score = pd.to_numeric(train_data['ExterQual'].replace(exter_qual_mapping), errors='coerce')
train_data['Heating_ExterQual_Interaction'] = train_data['Heating_Score'] * exter_qual_score

In [None]:
# NOTE(review): this cell recomputes scores/interactions that the preceding cell also
# computes, using the same formulas. Because it runs later, ITS results are the ones that
# end up in train_data, so the mappings here must cover every category as well. Consider
# deleting one of the two cells.

# BsmtExposure score; 'None' (no basement, produced by the imputation step) is scored 0
# instead of being left unmapped (which would yield NaN).
bsmt_exposure_mapping = {
    'None': 0,      # No basement
    'No': 0,        # No exposure
    'Mn': 1,        # Minimum exposure
    'Av': 2,        # Average exposure
    'Gd': 3         # Good exposure
}
train_data['BsmtExposure_Score'] = train_data['BsmtExposure'].map(bsmt_exposure_mapping)

# BsmtExposure x BsmtFinType1 ('None' = no basement -> 0)
train_data['BsmtExposure_BsmtFinType1_Interaction'] = train_data['BsmtExposure_Score'] * train_data['BsmtFinType1'].map({
    'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 2, 'LwQ': 1, 'Unf': 0, 'None': 0
})


# Functional score (higher = fewer deductions)
functional_mapping = {
    'Typ': 5,      # Typical
    'Min1': 4,     # Minor Deductions
    'Min2': 3,     # More Deductions
    'Mod': 2,      # Moderate Deductions
    'Maj1': 1,     # Major Deductions
    'Maj2': 0      # Severe Deductions
}
train_data['Functional_Score'] = train_data['Functional'].map(functional_mapping)

# Functional score x OverallQual
train_data['Functional_OverallQuality_Interaction'] = train_data['Functional_Score'] * train_data['OverallQual']


# FireplaceQu_Score x OverallQual (FireplaceQu_Score must already exist in train_data)
train_data['FireplaceQu_OverallQuality_Interaction'] = train_data['FireplaceQu_Score'] * train_data['OverallQual']

# PavedDrive score
paved_drive_mapping = {
    'Y': 2,       # Yes
    'P': 1,       # Partial
    'N': 0        # No
}
train_data['PavedDrive_Score'] = train_data['PavedDrive'].map(paved_drive_mapping)

# PavedDrive x LotFrontage and PavedDrive x GarageType
# NOTE(review): the GarageType mapping below has no entry for 'CarPort' (listed in the
# garage cell's mapping); such rows become NaN.
train_data['PavedDrive_LotFrontage_Interaction'] = train_data['PavedDrive_Score'] * train_data['LotFrontage']
train_data['PavedDrive_GarageType_Interaction'] = train_data['PavedDrive_Score'] * train_data['GarageType'].map({
    '2Types': 5, 'Attchd': 4, 'Detchd': 3, 'Basment': 2, 'BuiltIn': 1, 'None': 0
})


# SaleType score
# NOTE(review): only the six sale types listed get a score; any other value becomes NaN.
saletype_mapping = {
    'WD': 5,
    'New': 4,
    'COD': 3,
    'ConLD': 2,
    'ConLI': 1,
    'CWD': 0
}
train_data['SaleType_Score'] = train_data['SaleType'].map(saletype_mapping)


# SaleCondition score
salecondition_mapping = {
    'Normal': 5,   # Normal Sale
    'Abnorml': 4,  # Abnormal Sale
    'AdjLand': 3,  # Adjoining Land
    'Alloca': 2,   # Allocation Sale
    'Family': 1,   # Family Sale
    'Partial': 0   # Partial Sale
}
train_data['SaleCondition_Score'] = train_data['SaleCondition'].map(salecondition_mapping)

# SaleCondition x SaleType
train_data['SaleCondition_SaleType_Interaction'] = train_data['SaleCondition_Score'] * train_data['SaleType_Score']

In [None]:
# Relationship of CentralAir and Electrical with SalePrice, then numeric encodings.

# Average SalePrice per category
central_air_avg_price = train_data.groupby('CentralAir')['SalePrice'].mean()
electrical_avg_price = train_data.groupby('Electrical')['SalePrice'].mean()

# Chi-square test of independence.
# The test works on a contingency table of two CATEGORICAL variables. Cross-tabulating
# against the raw, continuous SalePrice creates one column per distinct price with almost
# all cells at 0 or 1, which makes the resulting p-value meaningless. SalePrice is
# therefore binned into quartiles first.
# NOTE(review): categories with very few houses can still give small expected counts —
# check the expected frequencies before relying on the Electrical p-value.
sale_price_bins = pd.qcut(train_data['SalePrice'], q=4, duplicates='drop')

# CentralAir vs. SalePrice quartile
central_air_crosstab = pd.crosstab(train_data['CentralAir'], sale_price_bins)
chi2_central_air, p_central_air, _, _ = chi2_contingency(central_air_crosstab)

# Electrical vs. SalePrice quartile
electrical_crosstab = pd.crosstab(train_data['Electrical'], sale_price_bins)
chi2_electrical, p_electrical, _, _ = chi2_contingency(electrical_crosstab)

# Category means followed by the two p-values
print(central_air_avg_price, electrical_avg_price, p_central_air, p_electrical)

# CentralAir -> binary flag (Y -> 1, N -> 0)
train_data['CentralAir_numerical'] = train_data['CentralAir'].map({'Y': 1, 'N': 0})

# Electrical -> ordinal score
electrical_mapping = {
    'SBrkr': 5,
    'FuseA': 4,
    'FuseF': 3,
    'FuseP': 2,
    'Mix': 1
}
train_data['Electrical_numerical'] = train_data['Electrical'].map(electrical_mapping)

# Preview of the original and encoded columns
print(train_data[['CentralAir', 'CentralAir_numerical', 'Electrical', 'Electrical_numerical']].head())

In [None]:
# Columns in which every single row is missing
all_missing_mask = train_data.isna().all(axis=0)
empty_columns = train_data.columns[all_missing_mask]

print("Columns with all missing values:")
print(empty_columns)

In [None]:
# Frequency of each Fence category. The result is printed explicitly: a bare expression
# in the middle of a cell is evaluated but never displayed.
print(train_data['Fence'].value_counts())

# Average SalePrice per Fence category
fence_price = train_data.groupby('Fence')['SalePrice'].mean()
print(fence_price)

# Fence scores (values unchanged).
# NOTE(review): the earlier per-line rationales contradicted the scores (e.g. a category
# described as "lower prices" had the highest score while one described as "higher
# prices" had the lowest). Confirm the intended direction of this scale against the
# `fence_price` table printed above and reorder if necessary.
fence_mapping = {
    'None': 2,   # No fence
    'GdPrv': 1,  # Good privacy
    'MnPrv': 3,  # Minimum privacy
    'GdWo': 4,   # Good wood
    'MnWw': 5    # Minimum wood/wire
}

# Add the score as a new column; the original 'Fence' column is left as it is
train_data['Fence_Points'] = train_data['Fence'].map(fence_mapping)

# Preview of the category next to its score
train_data[['Fence', 'Fence_Points']].head()

In [None]:
# Exploratory tables: category frequencies, mean SalePrice per MiscFeature category,
# and mean SalePrice per distinct MiscVal value
misc_feature_counts = train_data['MiscFeature'].value_counts()
misc_feature_price = train_data['SalePrice'].groupby(train_data['MiscFeature']).mean()
misc_val_price = train_data['SalePrice'].groupby(train_data['MiscVal']).mean()

for summary_table in (misc_feature_counts, misc_feature_price, misc_val_price):
    print(summary_table)

# Score assigned to each MiscFeature category
misc_feature_mapping = {
    'None': 0,    # No additional feature
    'Shed': 1,    # Shed present
    'Gar2': 5,    # Second garage
    'Othr': 2,    # Other miscellaneous features
    'TenC': 10    # Tennis court
}
train_data['MiscFeature_Points'] = train_data['MiscFeature'].map(misc_feature_mapping)

# Score for MiscVal, computed per value by the project helper misc_val_mapping
train_data['MiscVal_Points'] = train_data['MiscVal'].apply(misc_val_mapping)

# Combined score = category score + value score
train_data['Combined_Misc_Points'] = train_data['MiscFeature_Points'].add(train_data['MiscVal_Points'])

# Preview of the inputs next to the derived scores
misc_preview_columns = ['MiscFeature', 'MiscFeature_Points', 'MiscVal', 'MiscVal_Points', 'Combined_Misc_Points']
train_data.loc[:, misc_preview_columns].head()

In [None]:
# Features screened for outliers; the VIF analysis uses these plus two more
features_to_check = ['GrLivArea', 'TotRmsAbvGrd', 'GarageArea', '1stFlrSF', 'TotalBsmtSF']
continuous_features = [*features_to_check, 'Overall_Quality', 'YearBuilt']

# Outlier detection with the two project helpers (IQR rule and z-score rule)
outliers_iqr = detect_outliers_iqr(train_data, features_to_check)
outliers_zscore = detect_outliers_zscore(train_data, features_to_check)

# VIF: design matrix with an added constant column, handed to the project helper
X = add_constant(train_data[continuous_features])
vif_results = calculate_vif(X, continuous_features)

# Text output
print("Outliers (IQR method):", outliers_iqr)
print("\nOutliers (Z-score method):", outliers_zscore)
print("\nVIF Results:\n", vif_results)

# Horizontal bar chart of the VIF table, largest value first
vif_sorted = vif_results.sort_values(by="VIF", ascending=False)
sns.barplot(data=vif_sorted, x="VIF", y="features")
plt.title("Variance Inflation Factor (VIF)")
plt.show()

In [None]:
# NOTE: this cell transforms the columns in place, so it is not idempotent — running it a
# second time without re-running the cells that create these columns applies log1p twice.

# 1. Log-transform the engineered features with log1p, i.e. log(1 + x)
new_log_transform_columns = ['GrLivArea_per_Room', 'Living_Area_per_Room', 'Neighborhood_avg_price']

for col in new_log_transform_columns:
    train_data[col] = np.log1p(train_data[col])

# 2. Winsorize the same features: limits=[0.01, 0.01] caps the lowest 1% and the highest 1%
new_winsorize_columns = ['GrLivArea_per_Room', 'Living_Area_per_Room', 'Neighborhood_avg_price']

for col in new_winsorize_columns:
    train_data[col] = mstats.winsorize(train_data[col], limits=[0.01, 0.01])

# Summary statistics of every transformed column, each listed once. Concatenating the two
# lists directly would select every column twice because the lists overlap.
summary_columns = list(dict.fromkeys(new_log_transform_columns + new_winsorize_columns))

print("Summary statistics of newly transformed and winsorized features:")
print(train_data[summary_columns].describe())

In [None]:
# Print the value distribution of a handful of columns, one block per column
categorical_columns = ['ExterQual', 'FireplaceQu', 'GarageFinish', 'BsmtQual', 'KitchenQual', 'PavedDrive', 'Foundation']
for col in categorical_columns:
    # header, the counts, then the same blank-line spacing as a separate print("\n")
    print(f"Distribution of {col}:", train_data[col].value_counts(), "\n", sep="\n")

In [None]:
# Destination of the processed training data
processed_data_path = '../data/processed/processed_train_data.csv'

# Reorder the columns alphabetically, then write the frame without its index
alphabetical_columns = sorted(train_data.columns)
train_data = train_data.loc[:, alphabetical_columns]
train_data.to_csv(processed_data_path, index=False)

print(f"Processed train data with sorted columns has been saved to: {processed_data_path}")

In [None]:
# Reload the file that was just written to verify it round-trips. read_csv already returns
# a fresh frame, so no extra copy is needed.
processed_train_data = pd.read_csv(processed_data_path)

# Printed explicitly: a bare expression that is not the last line of a cell is not displayed.
print(processed_train_data.head())

# Sweetviz profile of the processed data, written next to the raw-data report
report = sv.analyze(processed_train_data)
report.show_html("../reports/processed_train_data_sweetviz_eda_report.html")