<a href="https://colab.research.google.com/github/idebroy/ml-ds/blob/main/first-shot-knn-regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Authenticate with Kaggle. Some Kaggle data sources are private, so run this
# cell before any cell that downloads data through kagglehub.
import kagglehub
kagglehub.login()


In [None]:
# Download the 'house-prices-advanced-regression-techniques' competition files
# through kagglehub and keep the value returned by the download call.
# NOTE: this notebook environment differs from Kaggle's Python environment, so
# some libraries used by the notebook may be missing.
# NOTE(review): the cell that loads train.csv / test.csv further down reads
# hard-coded '/kaggle/input/...' paths and does not use this variable - confirm
# both locations agree in the environment the notebook is run in.

house_prices_advanced_regression_techniques_path = kagglehub.competition_download('house-prices-advanced-regression-techniques')

print('Data source import complete.')


In [None]:
# This Python 3 environment is defined by the kaggle/python Docker image
# (https://github.com/kaggle/docker-python) and ships with the usual analytics
# libraries; the two core ones are loaded here.

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "/kaggle/input" directory.
# Print the full path of every file found anywhere beneath it.

import os

input_tree = os.walk('/kaggle/input')
for file_path in (os.path.join(folder, name) for folder, _, names in input_tree for name in names):
    print(file_path)

# Up to 20GB can be written to the current directory (/kaggle/working/); it is preserved as output when a version is created with "Save & Run All".
# Temporary files can also go to /kaggle/temp/, but they are not saved outside of the current session.

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning models and metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, classification_report,
                             precision_score, recall_score, f1_score,
                             confusion_matrix)

import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
def load_csv(file_path):
  """Read a CSV file into a DataFrame, reporting failures instead of raising.

  Args:
    file_path: Location of the CSV file, handed straight to ``pd.read_csv``.

  Returns:
    The loaded DataFrame, or ``None`` when the file does not exist or any
    other exception is raised while reading. A message is printed in both
    failure cases.
  """
  frame = None
  try:
    frame = pd.read_csv(file_path)
  except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
  except Exception as exc:
    print(f"An error occurred while loading the file: {exc}")
  return frame

# Load the competition's train and test splits from the Kaggle input directory.
# load_csv prints its own error message and returns None on failure, so the
# "connected" confirmation is only printed when a frame was actually loaded.
trainFilePath = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
df = load_csv(trainFilePath)
if df is not None:
  print(f"Kaggle \"connected to:\" {trainFilePath}")

testFilePath = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
df_test = load_csv(testFilePath)
if df_test is not None:
  print(f"Kaggle \"connected to:\" {testFilePath}")

# Summarise the training frame (column names, non-null counts, dtypes).
# DataFrame.info() prints the report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" to the output.
if df is not None:
  df.info()

Kaggle "connected to:" /kaggle/input/house-prices-advanced-regression-techniques/train.csv
Kaggle "connected to:" /kaggle/input/house-prices-advanced-regression-techniques/test.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Con

In [None]:
def impute_categorical_nan(data, column):
  """
  Imputes missing values in a categorical column with the most frequent value.

  The column is reassigned on ``data`` itself, so the caller's DataFrame is
  modified even when the return value is ignored. When several values tie for
  most frequent, the first entry returned by ``Series.mode()`` is used. A
  column with no non-missing values has no mode; it is left untouched instead
  of raising.

  Args:
    data: Pandas DataFrame.
    column: Name of the categorical column with missing values.

  Returns:
    The same Pandas DataFrame object, with the column's missing values filled.
  """

  # mode() ignores NaN, so it is empty when every entry is missing;
  # indexing it with [0] would raise in that case.
  modes = data[column].mode()
  if modes.empty:
    return data

  # Find the most frequent value
  most_frequent_value = modes.iloc[0]
  print(most_frequent_value)

  # Fill missing values with the most frequent value
  data[column] = data[column].fillna(most_frequent_value)

  return data

def impute_numerical_nan(data, column):
  """
  Replaces the missing entries of a numerical column with that column's mean.

  Args:
    data: Pandas DataFrame; the column is reassigned on this object.
    column: Name of the numerical column to fill.
  Returns:
    The same Pandas DataFrame, with the column's NaNs replaced by its mean.
  """
  values = data[column]
  data[column] = values.fillna(values.mean())
  return data

def impute_nan(data, column):
  """
  Fills the gaps in one column, picking the strategy from the column's dtype.

  Columns selected by ``select_dtypes(include=['number'])`` are handed to
  impute_numerical_nan; every other column goes to impute_categorical_nan.

  Args:
    data: Pandas DataFrame.
    column: Name of the column with missing values.

  Returns:
    Whatever the chosen imputation helper returns for (data, column).
  """
  numeric_columns = data.select_dtypes(include=['number']).columns
  imputer = impute_numerical_nan if column in numeric_columns else impute_categorical_nan
  return imputer(data, column)

def find_cat_correlation(data, cat_cols, target_col):
  """
  Finds the categorical column most highly correlated (using Cramér's V) with the target column.

  For every candidate column a contingency table against the target is built
  and V = sqrt((chi2 / n) / (min(rows, cols) - 1)) is computed. A table with a
  single row or column (for example a constant candidate column), or with no
  observations, carries no association and is scored 0.0; without that guard
  the formula divides by zero and the resulting NaN can win the comparison.

  chi2_contingency is called with its defaults, so SciPy's continuity
  correction applies to 2x2 tables.

  Args:
      data: Pandas DataFrame.
      cat_cols: List of categorical columns.
      target_col: Name of the target column (must be categorical).

  Returns:
      Name of the most highly correlated categorical column. On ties the
      column that comes first in cat_cols is returned.

  Raises:
      ValueError: If cat_cols holds no column other than target_col.
  """
  from scipy.stats import chi2_contingency

  correlations = {}

  for col in cat_cols:
    if col == target_col:
      continue

    # Counts of every (target level, column level) combination.
    contingency_table = pd.crosstab(data[target_col], data[col])
    n = contingency_table.to_numpy().sum()
    min_dim = min(contingency_table.shape) - 1

    # Degenerate table: Cramér's V is undefined (division by zero), treat as
    # "no association" so it can never be selected over a real score.
    if n == 0 or min_dim <= 0:
      correlations[col] = 0.0
      continue

    chi2, _, _, _ = chi2_contingency(contingency_table)
    phi2 = chi2 / n
    correlations[col] = float(np.sqrt(phi2 / min_dim))

  if not correlations:
    raise ValueError("cat_cols must contain at least one column other than target_col")

  most_correlated_column = max(correlations, key=correlations.get)
  return most_correlated_column

In [None]:
# Column-by-column overview of the training frame: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [None]:
# Columns to drop based on the previous audit, with the reason for each.
raw_cols_to_drop = [
    'Id',           # Identifier
    'PoolQC',       # >99% missing
    'MiscFeature',  # >96% missing
    'Alley',        # >93% missing
    'Fence',        # >80% missing
    'Street',       # Almost all same value
    'LotFrontage',  # Numerical with missing values; dropped here rather than imputed
    'GarageYrBlt',
]

# One-hot style names of the form '<column>_<level>'.
# NOTE(review): names like these are produced by pd.get_dummies and do not
# appear to exist in the raw frame, so at this point they are simply ignored
# by the drop below - confirm whether they were meant to be removed after
# encoding instead.
encoded_cols_to_drop = [
    'MSZoning_RH', 'LotShape_IR3', 'Neighborhood_Blueste', 'Neighborhood_SawyerW',
    'Condition2_Feedr', 'Condition2_Norm', 'Condition2_PosA', 'Condition2_RRAe',
    'Condition2_RRAn', 'Condition2_RRNn', 'HouseStyle_SFoyer', 'RoofStyle_Mansard',
    'RoofStyle_Shed', 'RoofMatl_Membran', 'Exterior1st_AsphShn', 'Exterior1st_BrkComm',
    'Exterior1st_CBlock', 'Exterior1st_MetalSd', 'Exterior1st_Stone', 'Exterior2nd_AsphShn',
    'Exterior2nd_CBlock', 'Exterior2nd_CmentBd', 'Exterior2nd_Stucco', 'ExterCond_Po',
    'ExterCond_TA', 'BsmtFinType1_Unf', 'BsmtFinType2_Unf', 'Heating_GasW',
    'HeatingQC_Po', 'Electrical_FuseP', 'Electrical_Mix', 'FireplaceQu_Gd',
    'GarageCond_Gd', 'SaleType_ConLI', 'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth',
]

# Later cells reuse this combined list (same entries, same order as before).
cols_to_drop = raw_cols_to_drop + encoded_cols_to_drop

# Drop whichever of those columns exist; labels that are absent are skipped.
# drop() returns a new frame, so df itself is left as loaded.
df_cleaned = df.drop(columns=cols_to_drop, errors='ignore')

# Impute remaining missing values, one column at a time, in column order.
columns_with_gaps = [name for name in df_cleaned.columns if df_cleaned[name].isnull().any()]
for name in columns_with_gaps:
  df_cleaned = impute_nan(df_cleaned, name)


# Confirm removal and imputation
print("Remaining columns after drop:", df_cleaned.columns.tolist())
print("Shape of cleaned dataframe:", df_cleaned.shape)
print(f"nan in train df : {df_cleaned.isna().sum().sum()}")

BrkFace
TA
TA
No
Unf
Unf
SBrkr
Gd
Attchd
Unf
TA
TA
Remaining columns after drop: ['MSSubClass', 'MSZoning', 'LotArea', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPo

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Prepare data: one-hot encode the categorical columns (first level of each
# column is dropped) and separate the target from the features.
df_model = pd.get_dummies(df_cleaned, drop_first=True)
X = df_model.drop('SalePrice', axis=1)
y = df_model['SalePrice']

# Hold out part of the training data for a quick sanity check of the baseline.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Baseline: KNN regressor with scikit-learn's default hyper-parameters.
# NOTE(review): distances are computed on the raw feature values, so columns
# with large magnitudes (e.g. LotArea) outweigh the 0/1 dummy columns.
# Consider standardising the features before KNN; this is left unchanged here
# because the submission cell would need the same transformation.
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

# Evaluate the baseline on the held-out rows with regression metrics.
y_pred = knn.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


# Hyper-parameters that change the predictions. 'algorithm' and 'leaf_size'
# are deliberately not searched: they only select how the neighbour lookup is
# carried out (speed / memory), so including them multiplied the number of
# fits by 12 without changing the scores.
param_grid = {
    'n_neighbors': [3, 4, 5, 6, 7, 8, 9, 10],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}

# Grid search with 10-fold cross-validation, scored by negative MSE
# (scikit-learn maximises scores, hence the sign).
grid_search = GridSearchCV(knn, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# best_params_knn is consumed by the submission cell as keyword arguments of
# KNeighborsRegressor; parameters not listed there keep their defaults.
best_params_knn = grid_search.best_params_
best_mse = -grid_search.best_score_ # GridSearchCV returns negative MSE for maximization

print(f"Best Parameters: {best_params_knn}")
print(f"Best Mean Squared Error (from CV): {best_mse}")

              precision    recall  f1-score   support

       35311       0.00      0.00      0.00         1
       37900       0.00      0.00      0.00         0
       40000       0.00      0.00      0.00         1
       55000       0.00      0.00      0.00         0
       55993       0.00      0.00      0.00         1
       58500       0.00      0.00      0.00         0
       60000       0.00      0.00      0.00         1
       62383       0.00      0.00      0.00         0
       64500       0.00      0.00      0.00         1
       66500       0.00      0.00      0.00         1
       67000       0.00      0.00      0.00         2
       68400       0.00      0.00      0.00         1
       68500       0.00      0.00      0.00         1
       72500       0.00      0.00      0.00         0
       73000       0.00      0.00      0.00         0
       75000       0.00      0.00      0.00         1
       75500       0.00      0.00      0.00         1
       76000       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best Parameters: {'algorithm': 'auto', 'leaf_size': 20, 'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'uniform'}


In [None]:
# Build the Kaggle submission: preprocess the test set consistently with the
# training set, refit KNN with the tuned hyper-parameters (best_params_knn)
# on all training rows, and write the predictions as Id,SalePrice rows.

import pandas as pd

# Drop the same columns that were dropped from the training set. drop()
# returns a new frame, so df_test stays exactly as loaded and this cell can be
# re-run safely.
df_test_cleaned = df_test.drop(columns=cols_to_drop, errors='ignore')

# Impute missing test values with statistics of the TRAINING data (mean for
# numeric columns, most frequent value otherwise), so train and test rows are
# filled with the same values. Numeric columns are identified the same way
# impute_nan does it.
numeric_train_cols = set(df.select_dtypes(include=['number']).columns)
test_cols_with_gaps = [c for c in df_test_cleaned.columns if df_test_cleaned[c].isnull().any()]
for col in test_cols_with_gaps:
  if col not in df.columns:
    # No training column to derive a fill value from.
    print(f"Warning: Column '{col}' found in test set but not in training. Skipping imputation.")
    continue
  if col in numeric_train_cols:
    fill_value = df[col].mean()
  else:
    train_modes = df[col].mode()
    if train_modes.empty:
      continue
    fill_value = train_modes.iloc[0]
  df_test_cleaned[col] = df_test_cleaned[col].fillna(fill_value)

# One-hot encode WITHOUT drop_first: the level dropped as baseline must be the
# one dropped in training, not whichever level happens to sort first in the
# test data. Reindexing to the training columns then discards the baseline
# and unseen levels, and adds all-zero columns for levels absent from test.
X_test_model = pd.get_dummies(df_test_cleaned)
train_cols = X.columns
X_test_aligned = X_test_model.reindex(columns=train_cols, fill_value=0)

# Fail early with a clear message rather than inside the estimator.
assert not X_test_aligned.isna().any().any(), "test features still contain missing values"

# Initialize the KNN Regressor with the best found parameters
best_knn = KNeighborsRegressor(**best_params_knn)

# Fit on the full training data (not just the X_train split) for the submission.
best_knn.fit(X, y)

# Make predictions on the processed test data
predictions = best_knn.predict(X_test_aligned)

# Create the submission DataFrame in 'id : price' format
submission_df = pd.DataFrame({'Id': df_test['Id'], 'SalePrice': predictions})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission.csv', index=False)

print("Submission file created successfully!")
print(submission_df.head())
