In [114]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from category_encoders import TargetEncoder
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler
# Ordinal Encoding for 'size'
import re

In [115]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [116]:
# read dist_from_city_center.csv 
dist = pd.read_csv('dist_from_city_centre.csv')
dist.count()

# merge train and dist data frames on location
train_dataset = pd.merge(train_df, dist, on='location', how='left')
train_dataset.head()
test_dataset = pd.merge(test_df, dist, on='location', how='left')
train_dataset.info()
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10656 entries, 0 to 10655
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              10656 non-null  int64  
 1   area_type       10656 non-null  object 
 2   availability    10656 non-null  object 
 3   location        10655 non-null  object 
 4   size            10642 non-null  object 
 5   society         6228 non-null   object 
 6   total_sqft      10656 non-null  object 
 7   bath            10591 non-null  float64
 8   balcony         10152 non-null  float64
 9   price           10656 non-null  float64
 10  dist_from_city  9630 non-null   float64
dtypes: float64(4), int64(1), object(6)
memory usage: 915.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2664 entries, 0 to 2663
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              2664 non-null   int64  


In [117]:
# lets add avg rent from avg rent csv
avg_rent = pd.read_csv('avg_rent.csv')

# Lets merge train and avg_rent on location

train_dataset = pd.merge(train_dataset, avg_rent, on='location', how='left')
test_dataset = pd.merge(test_dataset, avg_rent, on='location', how='left')
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10656 entries, 0 to 10655
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              10656 non-null  int64  
 1   area_type       10656 non-null  object 
 2   availability    10656 non-null  object 
 3   location        10655 non-null  object 
 4   size            10642 non-null  object 
 5   society         6228 non-null   object 
 6   total_sqft      10656 non-null  object 
 7   bath            10591 non-null  float64
 8   balcony         10152 non-null  float64
 9   price           10656 non-null  float64
 10  dist_from_city  9630 non-null   float64
 11  avg_2bhk_rent   3665 non-null   float64
dtypes: float64(5), int64(1), object(6)
memory usage: 999.1+ KB


In [118]:
# analyse total_sqft
train_dataset['total_sqft'].describe()

# convert to sqft

# Sq Meter to sqft convert
def sqmt_to_sqft(x):
    # if x is string and contains Sq. Meter
    if type(x) == str and 'Sq. Meter' in x:
        return float(x.split('Sq. Meter')[0]) * 10.7639
    return x

# Sq Yards to sqft convert
def sqyd_to_sqft(x):
    if type(x) == str and 'Sq. Yards' in x:
        return float(x.split('Sq. Yards')[0]) * 9
    return x

# Perch to sqft convert
def perch_to_sqft(x):
    if type(x) == str and 'Perch' in x:
        return float(x.split('Perch')[0]) * 272.25
    return x

# Acres to sqft convert
def acres_to_sqft(x):
    if type(x) == str and 'Acres' in x:
        return float(x.split('Acres')[0]) * 43560
    return x

# Cents to sqft convert
def cents_to_sqft(x):
    if type(x) == str and 'Cents' in x:
        return float(x.split('Cents')[0]) * 435.6
    return x

# Guntha to sqft convert
def guntha_to_sqft(x):
    if type(x) == str and 'Guntha' in x:
        return float(x.split('Guntha')[0]) * 1089
    return x

# Grounds to sqft convert
def grounds_to_sqft(x):
    if type(x) == str and 'Grounds' in x:
        return float(x.split('Grounds')[0]) * 2400
    return x

# convert to sqft
train_dataset['total_sqft'] = train_dataset['total_sqft'].apply(sqmt_to_sqft)
train_dataset['total_sqft'] = train_dataset['total_sqft'].apply(sqyd_to_sqft)
train_dataset['total_sqft'] = train_dataset['total_sqft'].apply(perch_to_sqft)
train_dataset['total_sqft'] = train_dataset['total_sqft'].apply(acres_to_sqft)
train_dataset['total_sqft'] = train_dataset['total_sqft'].apply(cents_to_sqft)
train_dataset['total_sqft'] = train_dataset['total_sqft'].apply(guntha_to_sqft)
train_dataset['total_sqft'] = train_dataset['total_sqft'].apply(grounds_to_sqft)

train_dataset['total_sqft'].describe()

train_dataset['total_sqft_min'] = train_dataset['total_sqft'].str.split('-').str[0]
train_dataset['total_sqft_max'] = train_dataset['total_sqft'].str.split('-').str[1] 

# fill max with min if max is nan
train_dataset['total_sqft_max'] = train_dataset['total_sqft_max'].fillna(train_dataset['total_sqft_min'])

# get avg of max and min
train_dataset['total_sqft'] = (train_dataset['total_sqft_min'].astype(float) + train_dataset['total_sqft_max'].astype(float))/2

train_dataset['total_sqft'].describe()

count    10616.000000
mean      1559.791263
std       1273.981228
min          1.000000
25%       1100.000000
50%       1275.000000
75%       1680.000000
max      52272.000000
Name: total_sqft, dtype: float64

In [119]:
# analyse total_sqft
test_dataset['total_sqft'].describe()

# convert to sqft

# Sq Meter to sqft convert
def sqmt_to_sqft(x):
    # if x is string and contains Sq. Meter
    if type(x) == str and 'Sq. Meter' in x:
        return float(x.split('Sq. Meter')[0]) * 10.7639
    return x

# Sq Yards to sqft convert
def sqyd_to_sqft(x):
    if type(x) == str and 'Sq. Yards' in x:
        return float(x.split('Sq. Yards')[0]) * 9
    return x

# Perch to sqft convert
def perch_to_sqft(x):
    if type(x) == str and 'Perch' in x:
        return float(x.split('Perch')[0]) * 272.25
    return x

# Acres to sqft convert
def acres_to_sqft(x):
    if type(x) == str and 'Acres' in x:
        return float(x.split('Acres')[0]) * 43560
    return x

# Cents to sqft convert
def cents_to_sqft(x):
    if type(x) == str and 'Cents' in x:
        return float(x.split('Cents')[0]) * 435.6
    return x

# Guntha to sqft convert
def guntha_to_sqft(x):
    if type(x) == str and 'Guntha' in x:
        return float(x.split('Guntha')[0]) * 1089
    return x

# Grounds to sqft convert
def grounds_to_sqft(x):
    if type(x) == str and 'Grounds' in x:
        return float(x.split('Grounds')[0]) * 2400
    return x

# convert to sqft
test_dataset['total_sqft'] = test_dataset['total_sqft'].apply(sqmt_to_sqft)
test_dataset['total_sqft'] = test_dataset['total_sqft'].apply(sqyd_to_sqft)
test_dataset['total_sqft'] = test_dataset['total_sqft'].apply(perch_to_sqft)
test_dataset['total_sqft'] = test_dataset['total_sqft'].apply(acres_to_sqft)
test_dataset['total_sqft'] = test_dataset['total_sqft'].apply(cents_to_sqft)
test_dataset['total_sqft'] = test_dataset['total_sqft'].apply(guntha_to_sqft)
test_dataset['total_sqft'] = test_dataset['total_sqft'].apply(grounds_to_sqft)

test_dataset['total_sqft'].describe()

test_dataset['total_sqft_min'] = test_dataset['total_sqft'].str.split('-').str[0]
test_dataset['total_sqft_max'] = test_dataset['total_sqft'].str.split('-').str[1] 

# fill max with min if max is nan
test_dataset['total_sqft_max'] = test_dataset['total_sqft_max'].fillna(test_dataset['total_sqft_min'])

# get avg of max and min
test_dataset['total_sqft'] = (test_dataset['total_sqft_min'].astype(float) + test_dataset['total_sqft_max'].astype(float))/2

test_dataset['total_sqft'].describe()

count     2658.000000
mean      1558.969409
std       1084.960472
min         60.000000
25%       1115.000000
50%       1290.000000
75%       1683.250000
max      30400.000000
Name: total_sqft, dtype: float64

In [120]:
def preprocess_data(df):

    location_mode = df['location'].mode()[0]
    df['location'].fillna(location_mode, inplace=True)

    # Impute 'size' with a default value ('2 BHK')
    df['size'].fillna('2 BHK', inplace=True)

    # Extract numerical value from 'size'
    def extract_bhk(value):
        match = re.search(r'\d+', str(value))
        return int(match.group()) if match else None

    df['size'] = df['size'].apply(extract_bhk)

    df['society'].fillna('Unknown', inplace=True)

    bath_median = df['bath'].median()
    df['bath'].fillna(bath_median, inplace=True)

    balcony_median = df['balcony'].median()
    df['balcony'].fillna(balcony_median, inplace=True)
    
    # Preprocessing for `dist_from_city`
    dist_imputer = SimpleImputer(strategy="median")
    df['dist_from_city'] = dist_imputer.fit_transform(df[['dist_from_city']])

    # Feature Engineering: Binning `dist_from_city`
    binner = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
    df['dist_from_city_bin'] = binner.fit_transform(df[['dist_from_city']]).astype(int)

    # Preprocessing for `avg_2bhk_rent`
    # Add a missing indicator
    df['avg_2bhk_rent_missing'] = df['avg_2bhk_rent'].isna().astype(int)

    # Impute missing values in `avg_2bhk_rent` with the mean
    rent_imputer = SimpleImputer(strategy="mean")
    df['avg_2bhk_rent'] = rent_imputer.fit_transform(df[['avg_2bhk_rent']])

    # Normalize `avg_2bhk_rent`
    scaler = StandardScaler()
    df['avg_2bhk_rent_scaled'] = scaler.fit_transform(df[['avg_2bhk_rent']])


    return df

In [121]:
# Preprocess the train and test data
train_dataset = preprocess_data(train_dataset)
test_dataset = preprocess_data(test_dataset)



In [122]:
print("Missing values after imputation and encoding:\n", train_dataset.isnull().sum())
print("Missing values after imputation and encoding:\n", test_dataset.isnull().sum())

Missing values after imputation and encoding:
 ID                        0
area_type                 0
availability              0
location                  0
size                      0
society                   0
total_sqft               40
bath                      0
balcony                   0
price                     0
dist_from_city            0
avg_2bhk_rent             0
total_sqft_min           40
total_sqft_max           40
dist_from_city_bin        0
avg_2bhk_rent_missing     0
avg_2bhk_rent_scaled      0
dtype: int64
Missing values after imputation and encoding:
 ID                       0
area_type                0
availability             0
location                 0
size                     0
society                  0
total_sqft               6
bath                     0
balcony                  0
dist_from_city           0
avg_2bhk_rent            0
total_sqft_min           6
total_sqft_max           6
dist_from_city_bin       0
avg_2bhk_rent_missing    0
avg_2bhk_ren

In [123]:
def test_Encoding_Data(df):
# Custom Parsing for 'total_sqft'
    def parse_total_sqft(value):
        if '-' in str(value):
            values = value.split('-')
            return (float(values[0]) + float(values[1])) / 2
        try:
            return float(value)
        except ValueError:
            return None

    df['total_sqft_min'] = df['total_sqft_min'].apply(parse_total_sqft)
    total_sqft_median = df['total_sqft_min'].median()
    df['total_sqft_min'].fillna(total_sqft_median, inplace=True)

    df['total_sqft'] = df['total_sqft'].apply(parse_total_sqft)
    total_sqft_median = df['total_sqft'].median()
    df['total_sqft'].fillna(total_sqft_median, inplace=True)
    
    df['total_sqft_max'] = df['total_sqft_max'].apply(parse_total_sqft)
    total_sqft_median = df['total_sqft_max'].median()
    df['total_sqft_max'].fillna(total_sqft_median, inplace=True)
    # One-Hot Encoding for 'area_type'
    df = pd.get_dummies(df, columns=['area_type'], prefix='area_type')

    # Label Encoding for 'availability'
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    df['availability'] = le.fit_transform(df['availability'])

    # Target Encoding for 'location' and 'society'
    if target_encoder:
        df[['location', 'society']] = target_encoder.transform(df[['location', 'society']])

    return df

# Create and fit the TargetEncoder on the train data
target_encoder = TargetEncoder(cols=['location', 'society'])
train_df[['location', 'society']] = target_encoder.fit_transform(train_df[['location', 'society']], train_df['price'])

In [124]:
train_dataset = test_Encoding_Data(train_dataset)
test_dataset = test_Encoding_Data(test_dataset)

In [125]:
train_dataset.to_csv('train_dataset1.csv', index=False)
test_dataset.to_csv('test_dataset1.csv', index=False)

In [126]:
print("Missing values after imputation and encoding:\n", train_dataset.isnull().sum())
print("Missing values after imputation and encoding:\n", test_dataset.isnull().sum())

#train_dataset.to_csv('trainnew.csv', index=False)
#test_dataset.to_csv('testnew.csv', index=False)

Missing values after imputation and encoding:
 ID                                0
availability                      0
location                          0
size                              0
society                           0
total_sqft                        0
bath                              0
balcony                           0
price                             0
dist_from_city                    0
avg_2bhk_rent                     0
total_sqft_min                    0
total_sqft_max                    0
dist_from_city_bin                0
avg_2bhk_rent_missing             0
avg_2bhk_rent_scaled              0
area_type_Built-up  Area          0
area_type_Carpet  Area            0
area_type_Plot  Area              0
area_type_Super built-up  Area    0
dtype: int64
Missing values after imputation and encoding:
 ID                                0
availability                      0
location                          0
size                              0
society                      

In [127]:
# Ensure the test data has the same features as the train data
#X_train = train_dataset.drop(columns=['price'])
#y_train = train_dataset['price']
X_test = test_dataset.drop(columns=['ID'])
X = train_dataset.drop(columns=['ID', 'price'])
y = train_dataset['price']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [128]:
# Define the model
model = GradientBoostingRegressor(random_state=42)

# Define the hyperparameters distribution
param_dist = {
    'n_estimators': [100, 200, 500, 1000],  # Number of boosting stages (trees)
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Learning rate
    'max_depth': [3, 5, 7, 10],              # Max depth of individual trees
    'subsample': [0.8, 0.9, 1.0],            # Fraction of samples used for fitting each tree
    'min_samples_split': [2, 5, 10],         # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],           # Minimum number of samples required to be at a leaf node
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=50, cv=5, 
                                   scoring='neg_mean_squared_error', n_jobs=-1, random_state=42)

# Fit RandomizedSearchCV to the training data
random_search.fit(X_train, y_train)

# Best hyperparameters found
print("Best hyperparameters found: ", random_search.best_params_)

# Best model with the best hyperparameters
best_model = random_search.best_estimator_

# Evaluate the best model
print("Best model score: ", best_model.score(X_valid, y_valid))


Best hyperparameters found:  {'subsample': 0.8, 'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 7, 'learning_rate': 0.01}
Best model score:  0.8141739338490053


In [129]:
# Predict the price for the train data
predictions = best_model.predict(X_valid)

In [130]:
rmse_valid = np.sqrt(mean_squared_error(y_valid, predictions))
print(f"Validation RMSE: {rmse_valid}")

Validation RMSE: 60.76825219807747


In [131]:
# Predict the price for the train data
test_predictions = best_model.predict(X_test)

In [132]:
# Add the predictions to the test DataFrame
test_dataset['price'] = test_predictions

In [133]:
submission = test_dataset[['ID', 'price']]
submission.to_csv('submission.csv', index=False)