In [None]:
# Step 1: Install Kaggle API
!pip install -q kaggle

# Step 2: Create kaggle.json with your credentials
import json
import os

kaggle_token = {
    "username": "himanshusoni001",
    "key": "7888d0ba07df9b4b51271fe3c97fac80"
}

# Write the token to the correct location
os.makedirs('/root/.kaggle', exist_ok=True)
with open('/root/.kaggle/kaggle.json', 'w') as f:
    json.dump(kaggle_token, f)

# Set permission
!chmod 600 /root/.kaggle/kaggle.json

: 

In [None]:
# Step 3: Download dataset using Kaggle API
# Fetches the competition archive into the current working directory. Relies on
# the credentials file written by the setup cell.
# NOTE(review): Kaggle presumably requires the competition rules to be accepted
# for this account before the download succeeds - confirm if the call fails.
!kaggle competitions download -c house-prices-advanced-regression-techniques

# Step 4: Unzip the downloaded dataset
# -o overwrites already-extracted files without prompting, so the cell can be
# re-run safely.
!unzip -o house-prices-advanced-regression-techniques.zip

In [None]:
# Step 5: Load and use the dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training file; the path is relative, so train.csv must be in the
# current working directory (where the unzip step extracts it).
data = pd.read_csv('train.csv')

# Optional: Preview data (first five rows)
data.head()

In [None]:
# Preview the last 10 rows of the training data
data.tail(10)

# **DATA AUDIT AND AVAILABILITY CHECK**

In [None]:
# 1. Shape of the dataset, printed as a (rows, columns) tuple
print("\nShape of dataset:", data.shape)


In [None]:
# 2. Data types of each column (the pandas dtype held by every column of `data`)
print("\nData types of each column:")
print(data.dtypes)

In [None]:
# 3. Missing values per column: number of NaN entries in each column
missing_per_column = data.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)

In [None]:
# 4. Data Information
# DataFrame.info() prints the index range, each column's non-null count and
# dtype, and the frame's memory usage.
data.info()

In [None]:
# 5. Check for negative or invalid values
# Every numeric column is scanned once; offending rows are printed in red,
# clean columns are reported in green (ANSI colour codes).
numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
print(numeric_cols)
print("\nNegative values:")
for col in numeric_cols:
    negative_mask = data[col].lt(0)
    if not negative_mask.any():
        print(f"\033[92mColumn '{col}' has no negative values.\033[0m")
        continue
    print(f"\033[91mColumn '{col}' has negative values.\033[0m")
    # Show only the offending rows, restricted to the column being checked.
    print(data.loc[negative_mask, [col]])

In [None]:
# 6. Duplicate rows
# duplicated() flags rows that repeat an earlier row across all columns, so the
# sum is the number of redundant rows.
print("\nNumber of duplicate rows:", data.duplicated().sum())

# **EXPLORATORY DATA ANALYSIS**

In [None]:
# Summary statistics for numeric columns
# describe() with default arguments reports count, mean, std, min, quartiles
# and max for the numeric columns of the frame.
print("\nSummary statistics:")
print(data.describe())

In [None]:
# Summary statistics for categorical columns:
# print the frequency of every category for each object-typed column.
for col in data.columns:
    if data[col].dtype != 'object':
        continue  # numeric columns are summarised by describe() instead
    print(f"\n{col}:")
    print(data[col].value_counts())

In [None]:
# Mean sale price per neighborhood, ordered from cheapest to most expensive
sale_price_by_neighborhood = data.groupby('Neighborhood')['SalePrice']
average_price_by_neighborhood = sale_price_by_neighborhood.mean().sort_values()

# The series is sorted ascending, so the two ends hold the extremes:
# first five = most affordable, last five = most expensive.
affordable_neighborhoods = average_price_by_neighborhood.iloc[:5]
expensive_neighborhoods = average_price_by_neighborhood.iloc[-5:]

# Mean above-ground living area and mean sale price for each building type
average_size_and_price = data.groupby('BldgType')[['GrLivArea', 'SalePrice']].mean()

# Mean sale price for each overall-quality rating, ordered by price
average_price_by_quality = data.groupby('OverallQual')['SalePrice'].mean().sort_values()

# Print every table under its heading, in the order they were computed.
report_sections = [
    ("Average Price by Neighborhood:", average_price_by_neighborhood),
    ("\nMost Affordable Neighborhoods:", affordable_neighborhoods),
    ("\nMost Expensive Neighborhoods:", expensive_neighborhoods),
    ("\nAverage Living Area and Price by Building Type:", average_size_and_price),
    ("\nAverage Price by Overall Quality:", average_price_by_quality),
]
for heading, table in report_sections:
    print(heading)
    print(table)

In [None]:
# UNIVARIATE VISUALISATION
# One 20-bin histogram with a KDE overlay per numeric column. `numeric_cols`
# is the list built in the negative-value audit cell.
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data[col], kde=True, bins=20, color='skyblue', ax=ax)
    ax.set_title(f'Histogram & KDE of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Correlation heat map
# Pairwise correlations between all numeric columns, annotated cell by cell.
# NOTE(review): with annot=True every cell gets a label; with many numeric
# columns the labels may overlap at this figure size - check the rendering.
numeric_data = data.select_dtypes(include=[np.number])
correlation_matrix = numeric_data.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# BIVARIATE VISUALISATION
# Average price by Neighborhood: one point per neighborhood at its mean
# SalePrice (pointplot's default estimator), drawn without error bars.
fig, ax = plt.subplots(figsize=(15, 6))
sns.pointplot(data=data, x='Neighborhood', y='SalePrice', errorbar=None, ax=ax)
# Neighborhood names are long; rotate them so they do not overlap.
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Average Sale Price by Neighborhood')
ax.set_ylabel('Sale Price')
ax.set_xlabel('Neighborhood')
fig.tight_layout()
plt.show()

In [None]:
# Average price by Overall Quality: one point per quality rating at its mean
# SalePrice (pointplot's default estimator), drawn without error bars.
fig, ax = plt.subplots(figsize=(10, 6))
sns.pointplot(data=data, x='OverallQual', y='SalePrice', errorbar=None, ax=ax)
ax.set_title('Average Sale Price by Overall Quality')
ax.set_ylabel('Sale Price')
ax.set_xlabel('Overall Quality')
fig.tight_layout()
plt.show()

In [None]:
# Average price by Building Type: one point per building type at its mean
# SalePrice (pointplot's default estimator), drawn without error bars.
fig, ax = plt.subplots(figsize=(10, 6))
sns.pointplot(data=data, x='BldgType', y='SalePrice', errorbar=None, ax=ax)
ax.set_title('Average Sale Price by Building Type')
ax.set_ylabel('Sale Price')
ax.set_xlabel('Building Type')
fig.tight_layout()
plt.show()

# **DATA CLEANING**

In [None]:
# Handle missing values
# Step A: percentage of missing entries per column; anything above 50% is
# considered too sparse to impute and is dropped.
missing_pct = data.isnull().sum().div(len(data)).mul(100)
high_missing = missing_pct.loc[missing_pct.gt(50)].index.tolist()
print(f"Columns with >50% missing values: {high_missing}")

# Step B: work on a copy without the Id column and the sparse columns.
# errors='ignore' keeps the cell runnable even if a listed column is absent.
columns_to_drop = ['Id'] + high_missing
datan = data.drop(columns=columns_to_drop, errors='ignore').copy()

# Step C: numeric columns with gaps are filled with the column median.
numeric_null_flags = datan.select_dtypes(include=[np.number]).isnull().any()
numeric_cols_with_missing = numeric_null_flags[numeric_null_flags].index.tolist()
for col in numeric_cols_with_missing:
    column_median = datan[col].median()
    datan.loc[:, col] = datan[col].fillna(column_median)

# Step D: categorical columns with gaps. For the features listed below a
# missing entry is treated as "feature absent" (e.g., no garage, no basement)
# and becomes the literal category 'None'; every other column takes its mode.
# NOTE(review): any listed column that was already removed by the >50% rule
# never reaches this loop, so its entry here has no effect.
absence_is_meaningful = {
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'FireplaceQu', 'Fence', 'MiscFeature',
}
categorical_null_flags = datan.select_dtypes(include=['object']).isnull().any()
categorical_cols_with_missing = categorical_null_flags[categorical_null_flags].index.tolist()
for col in categorical_cols_with_missing:
    fill_value = 'None' if col in absence_is_meaningful else datan[col].mode()[0]
    datan.loc[:, col] = datan[col].fillna(fill_value)

print(f"\nShape after cleaning: {datan.shape}")
print(f"Remaining missing values: {datan.isnull().sum().sum()}")
datan.head()

In [None]:
# OUTLIER DETECTION
# Tukey's rule: a value is an outlier when it lies more than 1.5 * IQR below
# the first quartile or above the third quartile of its column.
numeric_col = datan.select_dtypes(include=['int64', 'float64']).columns
outlier_indices = {}

for col in numeric_col:
    Q1, Q3 = datan[col].quantile(0.25), datan[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # Rows whose value falls outside the fences; their index labels are kept
    # per column so they can be inspected later.
    is_outlier = datan[col].lt(lower_bound) | datan[col].gt(upper_bound)
    outliers = datan[is_outlier]
    outlier_indices[col] = outliers.index.tolist()

    if not outliers.empty:
        print(f"{col}: {len(outliers)} outliers")

print(f"\nTotal columns checked: {len(numeric_col)}")

# Note: Outliers found but not removing them as they may be legitimate high-value properties
# For house price prediction, extreme values can be real luxury homes

# **FEATURE ENGINEERING**

In [None]:
# CREATE NEW FEATURES FROM EXISTING COLUMNS
# All features are derived from original columns only, so re-running the cell
# overwrites them with the same values.

# 1. Total Square Footage: basement + first floor + second floor
datan['TotalSF'] = datan['TotalBsmtSF'].add(datan['1stFlrSF']).add(datan['2ndFlrSF'])

# 2-3. Ages measured at the time of sale.
# NOTE(review): either value would be negative if a sale year precedes the
# recorded build/remodel year - verify against the data.
datan['PropertyAge'] = datan['YrSold'].sub(datan['YearBuilt'])
datan['RemodAge'] = datan['YrSold'].sub(datan['YearRemodAdd'])

# 4-7. Binary presence flags: 1 when the source measurement is positive, else 0
presence_flags = {
    'HasGarage': 'GarageArea',
    'HasBasement': 'TotalBsmtSF',
    'HasFireplace': 'Fireplaces',
    'Has2ndFloor': '2ndFlrSF',
}
for flag_name, source_col in presence_flags.items():
    datan[flag_name] = datan[source_col].gt(0).astype(int)

# 8. Quality Score (interaction feature): overall quality x overall condition
datan['QualityScore'] = datan['OverallQual'].mul(datan['OverallCond'])

# 9. Total Bathrooms: half baths count as 0.5, basement baths included
datan['TotalBath'] = (
    datan['FullBath']
    .add(datan['HalfBath'].mul(0.5))
    .add(datan['BsmtFullBath'])
    .add(datan['BsmtHalfBath'].mul(0.5))
)

# 10. Total Porch Area: open + enclosed + three-season + screen porch
datan['TotalPorchSF'] = (
    datan['OpenPorchSF']
    .add(datan['EnclosedPorch'])
    .add(datan['3SsnPorch'])
    .add(datan['ScreenPorch'])
)

print("New features created:")
print("TotalSF, PropertyAge, RemodAge, HasGarage, HasBasement, HasFireplace,")
print("Has2ndFloor, QualityScore, TotalBath, TotalPorchSF")
print(f"\nUpdated shape: {datan.shape}")
preview_columns = ['TotalSF', 'PropertyAge', 'RemodAge', 'HasGarage', 'HasBasement', 'QualityScore', 'TotalBath']
datan[preview_columns].head()

In [None]:
# One-hot encode categorical variables
# All object-typed columns currently present in the cleaned frame
categorical_cols = datan.select_dtypes(include=['object']).columns.tolist()

# Only a hand-picked subset is one-hot encoded, to keep the number of dummy
# columns manageable; the remaining categoricals stay as they are here.
important_categoricals = [
    'Neighborhood', 'BldgType', 'HouseStyle', 'ExterQual', 'ExterCond',
    'Foundation', 'HeatingQC', 'CentralAir', 'KitchenQual',
    'GarageType', 'GarageFinish', 'PavedDrive', 'SaleCondition',
]

# Keep the hand-picked order, but skip any name that is not an object column
# of `datan` (for example because it was dropped during cleaning).
available_categoricals = set(categorical_cols)
categorical_to_encode = [name for name in important_categoricals if name in available_categoricals]

# drop_first=True omits one level per feature to avoid perfectly collinear dummies.
datam = pd.get_dummies(datan, columns=categorical_to_encode, drop_first=True)

print(f"Shape after one-hot encoding: {datam.shape}")
print(f"Encoded categorical columns: {categorical_to_encode}")
datam.head()

In [None]:
# Preview of the one-hot encoded frame (repeats the preview that ends the previous cell)
datam.head()

In [None]:
# Using LabelEncoder to transform remaining categorical variables to numeric variables
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder; it is
# applied per feature column here, which assigns integer codes in sorted label
# order - confirm this ordering is acceptable for the downstream models.

from sklearn.preprocessing import LabelEncoder

# Target and feature matrix
y = datam['SalePrice']
X = datam.drop(columns='SalePrice')

# Object columns that were not one-hot encoded still need a numeric form
cat_f = X.select_dtypes(include='object').columns
print(f"Remaining categorical columns to encode: {list(cat_f)}")

# One fitted encoder per column is kept so the codes can be mapped back later
X_encoded = X.copy()
label_encoders = {col: LabelEncoder() for col in cat_f}
for col, le in label_encoders.items():
    X_encoded[col] = le.fit_transform(X_encoded[col])

print(f"\nFinal feature shape: {X_encoded.shape}")
print(f"Target variable shape: {y.shape}")
X_encoded.head()

In [None]:
from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_regression

# Calculate mutual information scores for all features.
# mutual_info_regression adds small random noise to continuous features, so
# without a fixed random_state the scores (and the ranking below) change on
# every run. The seed is pinned to 42, the same value the Random Forest cell
# uses, to make the ranking reproducible.
# NOTE(review): with the default discrete_features='auto' a dense X is treated
# as entirely continuous, including dummy and label-encoded columns - consider
# passing an explicit mask if the ranking of those columns matters.
mi_score_func = partial(mutual_info_regression, random_state=42)
selector = SelectKBest(score_func=mi_score_func, k='all')
selector.fit(X_encoded, y)

# k='all' keeps every feature; the selector is used only to collect the scores.
mi_scores = pd.DataFrame({
    'Feature': X_encoded.columns,
    'MI Score': selector.scores_
}).sort_values(by='MI Score', ascending=False)

print("Top 20 Features by Mutual Information Score:")
print(mi_scores.head(20))

In [None]:
from sklearn.feature_selection import f_regression

# Univariate linear F-test of every feature against SalePrice.
# k='all' keeps all features; the selector only serves to collect the scores.
# (SelectKBest is imported in the mutual-information cell.)
f_selector = SelectKBest(score_func=f_regression, k='all').fit(X_encoded, y)

# One row per feature with its F statistic and p-value, strongest first
f_scores = pd.DataFrame({
    'Feature': X_encoded.columns,
    'F_Score': f_selector.scores_,
    'p_value': f_selector.pvalues_,
})
f_scores = f_scores.sort_values(by='F_Score', ascending=False)

print("Top 20 Features by F-Score (Linear Relation with SalePrice):")
print(f_scores.head(20))

In [None]:
# Train Random Forest model to get feature importance
from sklearn.ensemble import RandomForestRegressor

# Fixed seed for a reproducible forest; n_jobs=-1 uses every available core.
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_encoded, y)

# Importance of each feature, labelled with its column name; keep the 20 largest
importances = pd.Series(rf.feature_importances_, index=X_encoded.columns)
top_features = importances.sort_values(ascending=False).iloc[:20]

print("Top 20 Features by Random Forest Importance:")
print(top_features)

# Horizontal bar chart; the y axis is inverted so the most important feature is on top
fig, ax = plt.subplots(figsize=(10, 8))
top_features.plot.barh(ax=ax)
ax.set_xlabel('Feature Importance')
ax.set_title('Top 20 Most Important Features')
ax.invert_yaxis()
fig.tight_layout()
plt.show()