## preprocessing of data 

In [117]:
import pandas as pd
import numpy as np

In [118]:
df=pd.read_csv("House Price Prediction Dataset.csv")
df.head(5)

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
0,1,1360,5,4,3,1970,Downtown,Excellent,No,149919
1,2,4272,5,4,3,1958,Downtown,Excellent,No,424998
2,3,3592,2,2,3,1938,Downtown,Good,No,266746
3,4,966,4,2,2,1902,Suburban,Fair,Yes,244020
4,5,4926,1,4,2,1975,Downtown,Fair,Yes,636056


In [119]:
#checking null values in each column
df.isnull().sum()

Id           0
Area         0
Bedrooms     0
Bathrooms    0
Floors       0
YearBuilt    0
Location     0
Condition    0
Garage       0
Price        0
dtype: int64

In [120]:
df.dtypes

Id            int64
Area          int64
Bedrooms      int64
Bathrooms     int64
Floors        int64
YearBuilt     int64
Location     object
Condition    object
Garage       object
Price         int64
dtype: object

In [121]:
df.columns

Index(['Id', 'Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt',
       'Location', 'Condition', 'Garage', 'Price'],
      dtype='object')

## handling cateorical columns

#### 1. Handling Location Column

In [122]:
df['Location'].unique()

array(['Downtown', 'Suburban', 'Urban', 'Rural'], dtype=object)

In [123]:
df['Location'].value_counts()

Location
Downtown    558
Urban       485
Suburban    483
Rural       474
Name: count, dtype: int64

##### If you’re using linear regression or distance-based models (like KNN), this can mislead the model, because numbers like 1, 2, 3, 4 imply an order and distance between categories, which may not be real.

##### For tree-based models (Random Forest, XGBoost), label encoding usually works fine because the tree just splits on values.


### we are  going to use xgboost regressor ,so we are going to use the label encodig

In [124]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df['Location']=le.fit_transform(df['Location'])
df['Location']

0       0
1       0
2       0
3       2
4       0
       ..
1995    2
1996    2
1997    1
1998    3
1999    2
Name: Location, Length: 2000, dtype: int64

In [125]:
# we can also use replace mothod to encode the values
# Direct mapping
df['Location'] = df['Location'].replace({
    'Downtown': 1,
    'Urban': 2,
    'Suburban': 3,
    'Rural': 4
})


df['Location']

0       0
1       0
2       0
3       2
4       0
       ..
1995    2
1996    2
1997    1
1998    3
1999    2
Name: Location, Length: 2000, dtype: int64

## 2.Handling condition column 

In [126]:
df['Condition'].unique()

array(['Excellent', 'Good', 'Fair', 'Poor'], dtype=object)

In [127]:
# Direct mapping
df['Condition'] = df['Condition'].replace({
    'Excellent': 1,
    'Good': 2,
    'Fair': 3,
    'Poor': 4
})


df['Condition']

  df['Condition'] = df['Condition'].replace({


0       1
1       1
2       2
3       3
4       3
       ..
1995    4
1996    4
1997    4
1998    1
1999    3
Name: Condition, Length: 2000, dtype: int64

#### 3.Handling Garage column

In [128]:
df['Garage'].unique()

array(['No', 'Yes'], dtype=object)

In [129]:
# Direct mapping
df['Garage'] = df['Garage'].replace({
    'No': 0,
    'Yes': 1
})

df['Garage']

  df['Garage'] = df['Garage'].replace({


0       0
1       0
2       0
3       1
4       1
       ..
1995    0
1996    1
1997    0
1998    1
1999    0
Name: Garage, Length: 2000, dtype: int64

In [130]:
df.head(5)

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
0,1,1360,5,4,3,1970,0,1,0,149919
1,2,4272,5,4,3,1958,0,1,0,424998
2,3,3592,2,2,3,1938,0,2,0,266746
3,4,966,4,2,2,1902,2,3,1,244020
4,5,4926,1,4,2,1975,0,3,1,636056


In [131]:
df['YearBuilt'].min()

np.int64(1900)

In [132]:
df['YearBuilt'].max()

np.int64(2023)

In [133]:
df=df.drop('Id',axis=1)
df

Unnamed: 0,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
0,1360,5,4,3,1970,0,1,0,149919
1,4272,5,4,3,1958,0,1,0,424998
2,3592,2,2,3,1938,0,2,0,266746
3,966,4,2,2,1902,2,3,1,244020
4,4926,1,4,2,1975,0,3,1,636056
...,...,...,...,...,...,...,...,...,...
1995,4994,5,4,3,1923,2,4,0,295620
1996,3046,5,2,1,2019,2,4,1,580929
1997,1062,5,1,2,1903,1,4,0,476925
1998,4062,3,1,2,1936,3,1,1,161119


#### selecting x and y for training purpose

In [134]:
df_y=df['Price']
df_y

0       149919
1       424998
2       266746
3       244020
4       636056
         ...  
1995    295620
1996    580929
1997    476925
1998    161119
1999    482525
Name: Price, Length: 2000, dtype: int64

In [135]:
df_x=df.drop(columns=['Price'])
df_x

Unnamed: 0,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage
0,1360,5,4,3,1970,0,1,0
1,4272,5,4,3,1958,0,1,0
2,3592,2,2,3,1938,0,2,0
3,966,4,2,2,1902,2,3,1
4,4926,1,4,2,1975,0,3,1
...,...,...,...,...,...,...,...,...
1995,4994,5,4,3,1923,2,4,0
1996,3046,5,2,1,2019,2,4,1
1997,1062,5,1,2,1903,1,4,0
1998,4062,3,1,2,1936,3,1,1


In [144]:
# splitting the data 

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(df_x,df_y)

In [147]:
x_train

Unnamed: 0,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage
344,4570,3,3,2,1915,2,3,0
512,2605,5,4,2,2015,0,1,0
475,2356,3,2,2,1937,1,3,0
1193,1889,4,1,1,1984,0,2,1
1218,2530,4,3,3,1988,3,1,0
...,...,...,...,...,...,...,...,...
392,4607,1,4,3,2004,3,3,1
1844,4906,5,2,2,2003,3,2,1
1677,3871,3,4,2,2000,1,3,0
1345,4526,5,1,1,2009,3,3,1


In [148]:
y_train

344     279602
512     752858
475     891741
1193    311538
1218    902119
         ...  
392     506089
1844    483595
1677    659019
1345    614562
348     558928
Name: Price, Length: 1500, dtype: int64

In [149]:
x_test

Unnamed: 0,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage
1912,2138,2,3,3,2015,3,1,1
607,1858,1,2,2,1971,1,4,0
1835,4225,2,2,2,2013,0,2,0
1625,3791,4,2,1,2012,2,4,0
1120,1434,1,4,1,2003,3,2,1
...,...,...,...,...,...,...,...,...
336,1512,4,4,3,1922,3,3,0
83,2778,2,2,1,1947,0,3,1
1250,2662,5,2,1,1974,1,4,1
719,742,1,2,3,1979,3,1,1


In [150]:
y_test

1912    405113
607     811299
1835    314531
1625    682592
1120    210353
         ...  
336      80924
83       92192
1250    269854
719     830343
1811    881776
Name: Price, Length: 500, dtype: int64

## Model Training

In [154]:
from xgboost import XGBRegressor


model=XGBRegressor(n_estimators=50 ,learning_rate=0.1, max_depth=6)
model.fit(x_train,y_train)

In [157]:
model.predict([[2138,2,3,3,2015,3,1,1]])

array([383765.4], dtype=float32)

In [160]:
y_pred=model.predict(x_test)

In [161]:
from sklearn.metrics import r2_score

r2_score(y_test,y_pred)

-0.15091419219970703