# Imports

In [5]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Creating DataFrame

In [6]:
# Toy housing data set: ten rows, one list per column.
# None marks a missing observation that is handled in the cells further down.
housing_data = {
    # Room and bathroom counts.
    'No_of_rooms': [2, 3, 4, 3, 2, None, 4, 3, None, 2],
    'No_of_Bathrooms': [1, 2, 3, 2, 1, 2, None, 2, 3, None],
    # Floor area.
    'Area_in_Sqfit': [800, 1200, 1500, None, 900, 1000, 2000, 1800, None, 750],
    # Categorical neighbourhood type.
    'Location': [
        'Urban', 'Suburban', 'Urban', 'Rural', 'Urban',
        'Suburban', 'Rural', 'Urban', 'Suburban', None,
    ],
    # Prediction target.
    'Price': [
        500000, 650000, 850000, 400000, 520000,
        600000, 1000000, 950000, 700000, None,
    ],
}

In [7]:
# Build the working frame from the column dictionary; None entries in the
# numeric columns show up as NaN in the frame.
df = pd.DataFrame(data=housing_data)

In [8]:
# Display the raw frame (ten rows) to see where the missing entries are.
df

Unnamed: 0,No_of_rooms,No_of_Bathrooms,Area_in_Sqfit,Location,Price
0,2.0,1.0,800.0,Urban,500000.0
1,3.0,2.0,1200.0,Suburban,650000.0
2,4.0,3.0,1500.0,Urban,850000.0
3,3.0,2.0,,Rural,400000.0
4,2.0,1.0,900.0,Urban,520000.0
5,,2.0,1000.0,Suburban,600000.0
6,4.0,,2000.0,Rural,1000000.0
7,3.0,2.0,1800.0,Urban,950000.0
8,,3.0,,Suburban,700000.0
9,2.0,,750.0,,


# Handling Missing Values

In [9]:
# Count the missing entries per column before deciding how to fill them.
missing_mask = df.isna()
missing_mask.sum()

No_of_rooms        2
No_of_Bathrooms    2
Area_in_Sqfit      2
Location           1
Price              1
dtype: int64

In [11]:
# Work out one fill value per feature column, then patch the gaps.
# NOTE(review): these statistics are computed on the full frame, before the
# train/test split further down, so held-out rows influence the fill values.
# Consider imputing after the split using training rows only.
rooms_fill = math.ceil(df['No_of_rooms'].mean())    # mean rounded up to a whole room
bathrooms_fill = df['No_of_Bathrooms'].median()
# NOTE(review): every observed area is distinct, so mode() returns all of them
# in sorted order and [0] is simply the smallest area — confirm this is intended.
area_fill = df['Area_in_Sqfit'].mode()[0]

df['No_of_rooms'] = df['No_of_rooms'].fillna(rooms_fill)
df['No_of_Bathrooms'] = df['No_of_Bathrooms'].fillna(bathrooms_fill)
df['Area_in_Sqfit'] = df['Area_in_Sqfit'].fillna(area_fill)

In [12]:
# Fill the missing target with the highest observed price.
# NOTE(review): this invents a label for the row that has no Price; dropping
# rows without a target (df.dropna(subset=['Price'])) is the usual
# alternative — confirm the max-fill is intended.
highest_price = df['Price'].max()
df['Price'] = df['Price'].fillna(highest_price)

# Encoding

In [13]:
# One-hot encode every object-typed (text) column. A row whose category is
# missing gets no indicator set (see the last row of the output).
obect_cols = df.select_dtypes(include=[object]).columns
df = pd.get_dummies(data=df, columns=obect_cols)
df

Unnamed: 0,No_of_rooms,No_of_Bathrooms,Area_in_Sqfit,Price,Location_Rural,Location_Suburban,Location_Urban
0,2.0,1.0,800.0,500000.0,False,False,True
1,3.0,2.0,1200.0,650000.0,False,True,False
2,4.0,3.0,1500.0,850000.0,False,False,True
3,3.0,2.0,750.0,400000.0,True,False,False
4,2.0,1.0,900.0,520000.0,False,False,True
5,3.0,2.0,1000.0,600000.0,False,True,False
6,4.0,2.0,2000.0,1000000.0,True,False,False
7,3.0,2.0,1800.0,950000.0,False,False,True
8,3.0,3.0,750.0,700000.0,False,True,False
9,2.0,2.0,750.0,1000000.0,False,False,False


In [15]:
# Turn the True/False indicator columns produced by get_dummies into 0/1 integers.
bool_cols = df.select_dtypes(include=bool).columns
df = df.astype({col: int for col in bool_cols})
df

Unnamed: 0,No_of_rooms,No_of_Bathrooms,Area_in_Sqfit,Price,Location_Rural,Location_Suburban,Location_Urban
0,2.0,1.0,800.0,500000.0,0,0,1
1,3.0,2.0,1200.0,650000.0,0,1,0
2,4.0,3.0,1500.0,850000.0,0,0,1
3,3.0,2.0,750.0,400000.0,1,0,0
4,2.0,1.0,900.0,520000.0,0,0,1
5,3.0,2.0,1000.0,600000.0,0,1,0
6,4.0,2.0,2000.0,1000000.0,1,0,0
7,3.0,2.0,1800.0,950000.0,0,0,1
8,3.0,3.0,750.0,700000.0,0,1,0
9,2.0,2.0,750.0,1000000.0,0,0,0


# Splitting

In [16]:
# Separate the predictors (x) from the prediction target (y).
target_col = 'Price'
x = df.drop(columns=[target_col])
y = df[target_col]

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Normalization

In [18]:
# Min-max scaler that maps each feature onto the [0, 1] range (the default range,
# spelled out here for readability).
normalizer = MinMaxScaler(feature_range=(0, 1))

In [19]:
# Fit the scaler on the training features only and reuse those min/max
# statistics for the test features, so the test rows do not influence scaling.
# By default the scaler returns plain NumPy arrays, so each frame is rebuilt
# with the original column names AND the original row index. Without `index=`
# the rows would be relabelled 0..n-1 and would no longer share labels with
# y_train / y_test in any index-aligned pandas operation (concat, join,
# residuals = y_test - predictions as a Series, ...).
x_train = pd.DataFrame(
    normalizer.fit_transform(x_train),
    columns=x_train.columns,
    index=x_train.index,
)
x_test = pd.DataFrame(
    normalizer.transform(x_test),
    columns=x_test.columns,
    index=x_test.index,
)

# Training

In [20]:
# Decision-tree regressor with a fixed seed. The tree permutes the candidate
# features at every split, so when several splits are equally good the chosen
# one (and therefore the fitted tree and its error) can differ between runs
# unless random_state is pinned. 42 matches the seed used for the split.
model = DecisionTreeRegressor(random_state=42)

In [21]:
# Train the tree on the scaled training features and their prices.
model.fit(X=x_train, y=y_train)

# Evaluation

In [22]:
# Predict prices for the held-out (scaled) test features.
y_pred = model.predict(X=x_test)

In [23]:
# Mean squared error between the true and predicted test prices
# (expressed in squared price units).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

np.float64(46250000000.0)