# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
# Silence every warning for the rest of the notebook. This keeps cell output tidy
# but also hides deprecation notices raised by the plotting/data libraries.
import warnings
warnings.filterwarnings('ignore')
# NOTE(review): the blanket 'ignore' filter above already covers FutureWarning
# (it is a Warning subclass), so the re-import and filter below are redundant.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Data Importing

In [None]:
# Read the raw listings file from the Kaggle input directory and display it.
csv_path = "/kaggle/input/pakistan-house-price-prediction/Entities.csv"
data = pd.read_csv(csv_path)
data

# Data Describe

In [None]:
# Preview the first five rows of the raw frame.
data.head(5)

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
data.info()

In [None]:
# Summary statistics of the raw frame (pandas' default column selection).
data.describe()

# Data Preprocessing & Cleaning

In [None]:
# Work on an independent copy so the raw `data` frame stays untouched by the cleaning steps.
df = data.copy(deep=True)
df

In [None]:
# Column names, dtypes and non-null counts of the working copy.
df.info()

In [None]:
# Drop the 'Unnamed: 0' column (presumably an index column saved into the CSV — confirm).
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# List the columns that remain after the drop.
df.columns

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Heatmap of the boolean missing-value mask: one row per record, one column per feature.
null_cmap = sns.cubehelix_palette(as_cmap=True)
sns.heatmap(df.isnull(), cmap=null_cmap)

# filling the null values

In [None]:
# Impute missing agencies by drawing at random from the observed agency values.
# - A seeded generator makes the imputation reproducible across kernel restarts.
# - Writing through the boolean mask does not depend on df having a 0..n-1 index
#   (fillna with a default-indexed Series aligns by label and could leave NaNs).
# - Only as many values as there are gaps are drawn.
agency_missing = df['agency'].isna()
fill_list = df['agency'].dropna()
if agency_missing.any():
    agency_rng = np.random.default_rng(42)
    df.loc[agency_missing, 'agency'] = agency_rng.choice(
        fill_list.to_numpy(), size=int(agency_missing.sum())
    )

In [None]:
# Impute missing agents by drawing at random from the observed agent values.
# - A seeded generator makes the imputation reproducible across kernel restarts.
# - Writing through the boolean mask does not depend on df having a 0..n-1 index
#   (fillna with a default-indexed Series aligns by label and could leave NaNs).
# - Only as many values as there are gaps are drawn.
agent_missing = df['agent'].isna()
fill_list = df['agent'].dropna()
if agent_missing.any():
    agent_rng = np.random.default_rng(42)
    df.loc[agent_missing, 'agent'] = agent_rng.choice(
        fill_list.to_numpy(), size=int(agent_missing.sum())
    )

In [None]:
# Re-count missing values per column after the imputation cells above.
df.isnull().sum()

Now there are no null values left.

In [None]:
# Same missing-value heatmap as before, redrawn after imputation.
filled_cmap = sns.cubehelix_palette(as_cmap=True)
sns.heatmap(df.isnull(), cmap=filled_cmap)

# Data Visualization

In [None]:
import plotly.graph_objects as go

# Derive the slice labels and sizes from the data instead of hard-coding them,
# so the chart cannot drift from the dataset after cleaning or a data refresh.
# value_counts() sorts descending, so the largest category comes first.
property_counts = df['property_type'].value_counts()
labels = property_counts.index.tolist()
values = property_counts.tolist()

# Use `hole` to create a donut-like pie chart
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.show()

House is the most frequent property type.

In [None]:
# Bar chart of price per property type (seaborn's default estimator with error bars).
fig_ptype, ax_ptype = plt.subplots(figsize=(10, 5))
sns.barplot(data=df, x='property_type', y='price', ax=ax_ptype)

The most expensive property type is the farm house.

In [None]:
# Share of listings per province.
fig = px.pie(
    df,
    names='province_name',
    title='province_name',
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.show()

Punjab is the province most in demand (it has the largest share of listings).

In [None]:
# Bar chart of price per province.
sns.barplot(data=df, x='province_name', y='price')

The most expensive province is Sindh.

In [None]:
# Distribution of the number of bathrooms, with a kernel density estimate overlaid.
sns.histplot(data=df, x="baths", kde=True)

In [None]:
# Share of listings per purpose.
fig = px.pie(
    df,
    names='purpose',
    title='purpose',
    color_discrete_sequence=px.colors.sequential.Aggrnyl,
)
fig.show()

The share of homes listed for sale is much higher than the share listed for rent.

In [None]:
# Distribution of the number of bedrooms, with a kernel density estimate overlaid.
sns.histplot(data=df, x="bedrooms", kde=True)

In [None]:
# Scatter of price against bedroom count, one point per listing.
sns.scatterplot(data=df, x="bedrooms", y="price")

As the number of bedrooms increases, the price of the house tends to increase.

In [None]:
# Scatter of price against area on a random subset of listings.
# random_state pins the subset so the figure is identical on every run;
# min() keeps the cell working on frames with fewer than 2000 rows.
px.scatter(df.sample(n=min(2000, len(df)), random_state=42),
           title='Total_Area vs price.',
           x='Total_Area',
           y='price',
           )

As the area of the house increases, the price tends to increase.

In [None]:
# Scatter of price against area on a random subset of listings, coloured by purpose.
# random_state pins the subset so the figure is identical on every run;
# min() keeps the cell working on frames with fewer than 2000 rows.
px.scatter(df.sample(n=min(2000, len(df)), random_state=42),
           title='Total_Area vs price.',
           x='Total_Area',
           y='price',
           color='purpose')


This graph compares how price varies with area for properties listed for sale and properties listed for rent.

In [None]:
# Listing counts per property type, split by province.
fig_ptype_prov, ax_ptype_prov = plt.subplots(figsize=(12, 5))
sns.countplot(data=df, x="property_type", hue="province_name", ax=ax_ptype_prov)

The province with the most listings is Punjab, and the most common property type is the house.

In [None]:
# Listing counts per purpose, split by province.
fig_purpose_prov, ax_purpose_prov = plt.subplots(figsize=(12, 5))
sns.countplot(data=df, x="purpose", hue="province_name", ax=ax_purpose_prov)

Punjab comes first in homes for sale, followed by Sindh.

Islamabad comes first in homes for rent, followed by Sindh.

In [None]:
# Listing counts per city.
plt.figure(figsize=(10, 5))
city_values = df["city"]
sns.countplot(x=city_values, palette="Set2")

The city most in demand is Karachi.

In [None]:
# Listing counts per purpose, split by city.
fig_purpose_city, ax_purpose_city = plt.subplots(figsize=(12, 5))
sns.countplot(data=df, x="purpose", hue="city", ax=ax_purpose_city)

Karachi comes first in homes for sale, followed by Lahore.

Islamabad comes first in homes for rent, followed by Karachi.

In [None]:
# Bar chart of price per city.
sns.barplot(data=df, x='city', y='price')

The most expensive city for renting or buying houses is Lahore, the capital of Punjab province.

In [None]:
import folium
from folium.plugins import FastMarkerCluster

# Pull the coordinates out as arrays and centre the map on their mean.
latitudes = df['latitude'].to_numpy()
longitudes = df['longitude'].to_numpy()
la_mean, lo_mean = latitudes.mean(), longitudes.mean()
locations = list(zip(latitudes, longitudes))

# Clustered markers: one (lat, lon) pair per listing.
m = folium.Map(location=[la_mean, lo_mean], zoom_start=11.5)
listing_cluster = FastMarkerCluster(data=locations)
listing_cluster.add_to(m)
m

# Encoding the String Dataset

In [None]:
# Summary (count / unique / top / freq) of the object-dtype columns that need encoding.
df.describe(include=object)

In [None]:
from sklearn.preprocessing import LabelEncoder
# NOTE(review): `le` is not referenced by the encoding cell that follows, which
# constructs its own LabelEncoder() — TODO confirm this instance can be dropped.
le = LabelEncoder()

In [None]:
# Columns to be replaced by integer codes.
cols = [
    'page_url',
    'property_type',
    'location',
    'city',
    'province_name',
    'purpose',
    'date_added',
    'agency',
    'agent',
]

# Fit a label encoding per column and overwrite the column with its integer codes.
for col in cols:
    df[col] = LabelEncoder().fit_transform(df[col])

In [None]:
# Re-check dtypes after the label-encoding cell.
df.info()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig_corr, ax_corr = plt.subplots(figsize=(20, 10))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax_corr)

# train test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every column except the target, as a NumPy array.
x = df.drop(columns='price').to_numpy()

In [None]:
# Target vector: the listing price, as a NumPy array.
y = df['price'].to_numpy()

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

# modeling

# RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so the fitted forest, and every score computed from it, is reproducible.
random_forest = RandomForestRegressor(random_state=42)
random_forest.fit(x_train, y_train)

# A regressor's .score() is the coefficient of determination (R^2), not a
# classification accuracy, so the output is labelled accordingly.
print("Random Forest Training R^2:", random_forest.score(x_train, y_train))
print("Random Forest Testing R^2:", random_forest.score(x_test, y_test))

# Model Evaluation

In [None]:
# Predict prices for the held-out rows and display the resulting array.
y_pred = random_forest.predict(x_test)
y_pred

In [None]:

# Side-by-side table of actual and predicted prices for the test rows.
# `y_pred` comes from the prediction cell directly above; predicting a second
# time here only repeated the same work.
df4 = pd.DataFrame({"Y_test": y_test, "Y_pred": y_pred})
df4.head(20)

In [None]:
# Line plot of the first 500 test rows: actual price vs. model prediction.
fig_cmp, ax_cmp = plt.subplots(figsize=(20, 6))

ax_cmp.plot(df4[:500])
ax_cmp.legend(["Actual", "Predicted"])

In [None]:
from sklearn.metrics import r2_score

In [None]:
# R^2 of the test-set predictions, stored for later reference and displayed.
random_forest_r2 = r2_score(y_test , y_pred)
random_forest_r2

In [None]:
from sklearn.metrics import mean_absolute_error , mean_absolute_percentage_error , mean_squared_error

In [None]:
# Compute the test-set error metrics, then print each one with its name so the
# output is readable without cross-referencing the code.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"MSE : {mse:,.2f}")
# RMSE is in the same units as the price, which makes it easier to interpret than MSE.
print(f"RMSE: {np.sqrt(mse):,.2f}")
print(f"MAE : {mae:,.2f}")
# scikit-learn reports MAPE as a fraction (1.0 == 100%), not as a percentage.
print(f"MAPE: {mape:.4f}")