# Exploratory Data Analysis

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from clean_dataset import import_csv_data
import scipy.stats as stats
from clean_immo_datasetV2 import encode_categorical_features, preprocess_missing_values

#### Load clean csv

In [2]:
# Load the cleaned dataset through the project's CSV helper.
# NOTE(review): the path is relative, so it is resolved against the working
# directory of the notebook; the recorded FileNotFoundError below suggests the
# file was not there when this cell last ran — confirm the expected location.
file_path = 'cleaned_real_estate_data.csv'
raw_df = import_csv_data(file_path)
raw_df

FileNotFoundError: [Errno 2] No such file or directory: 'cleaned_real_estate_data.csv'

In [None]:
# Run the project helper from clean_immo_datasetV2 on the raw frame.
# NOTE(review): presumably imputes or drops missing values — confirm in the helper.
df_wo_missing = preprocess_missing_values(raw_df)

In [None]:
df_wo_missing

In [None]:

# The helper returns two objects: the frame used by the rest of this notebook
# and a second value kept as `encoders`.
# NOTE(review): presumably `encoders` holds the fitted encoders per column — verify.
df_encoded, encoders = encode_categorical_features(df_wo_missing)


In [None]:
encoders

In [None]:
df_encoded

#### Identifying the type of variables

In [None]:
df_encoded.dtypes

## Outliers

In [None]:
def remove_outliers_iqr(df, col):
    """Return the rows of ``df`` whose ``col`` value lies within the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. Values sitting
    exactly on a fence are kept. Rows with a missing value in ``col`` fail the
    range comparison and are therefore not part of the result.

    Args:
        df: DataFrame to filter.
        col: Name of the numeric column the fences are computed on.

    Returns:
        A filtered DataFrame; the original index labels are preserved.
    """
    values = df[col]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile
    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread
    # ``between`` is inclusive on both ends by default.
    return df[values.between(fence_low, fence_high)]

In [None]:
def identify_outliers_iqr(df, col):
    """Split out the rows of ``df`` whose ``col`` value lies outside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. A row is an
    outlier only when its value is strictly beyond a fence, which makes this
    function the exact complement of ``remove_outliers_iqr`` (that one keeps
    values sitting on a fence). Strict comparisons also avoid flagging every
    row when the IQR is zero and both fences collapse onto the data.

    Args:
        df: DataFrame to inspect.
        col: Name of the numeric column the fences are computed on.

    Returns:
        A ``(lower_outliers, upper_outliers)`` tuple of DataFrames holding the
        rows below the lower fence and above the upper fence respectively.
    """
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return df[df[col] < lower], df[df[col] > upper]

In [None]:
# Histogram of the price column over the full encoded dataset (50 bins).
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_encoded['price'], bins=50, color='blue', edgecolor='white')
ax.set_title('Price Distribution')
ax.set_xlabel('Price')
ax.set_ylabel('Count')
plt.show()

#### Outliers for price

In [None]:
df_encoded['price'].describe()

In [None]:
# Split the rows whose price falls beyond the lower / upper IQR fence into two frames.
lower_outliers, upper_outliers = identify_outliers_iqr(df_encoded, 'price')

In [None]:
lower_outliers

In [None]:
upper_outliers['subtype']

In [None]:
# Keep only the rows whose price lies within the IQR fences (fence values included).
df_wo_price_outliers = remove_outliers_iqr(df_encoded, 'price')

In [None]:
# Same 50-bin price histogram, drawn from the outlier-free frame.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(
    df_wo_price_outliers['price'],
    bins=50,
    color='green',
    edgecolor='white',
)
ax.set_title('Price Distribution (without Outliers)')
ax.set_xlabel('Price')
ax.set_ylabel('Count')
plt.show()

#### Analyzing Outliers

##### By Subtype

In [None]:
num_bins = len(upper_outliers['subtype'].unique())
num_bins

In [None]:
# Number of upper-fence price outliers per subtype.
plt.figure(figsize=(12, 6))
sns.countplot(data=upper_outliers, x='subtype', color='pink', edgecolor='white')
# Subtype labels are long, so they are drawn vertically.
plt.xticks(rotation=90)
plt.title('Outliers price distribution by subtype')
# Render explicitly, like the other plotting cells, instead of echoing the Text object.
plt.show()

## Correlation Analysis

### Correlation for Continuous variables

In [None]:
# Start from every float64 column, then discard the two columns excluded
# from the correlation study.
df_continuous = (
    df_encoded
    .select_dtypes(include='float64')
    .drop(['id', 'locality_encoded'], axis=1)
)
df_continuous.columns

In [None]:
# Explicit column order for the correlation matrix, with 'price' first.
continuous_columns = [
    'price',
    'bedroomCount',
    'bathroomCount',
    'habitableSurface',
    'roomCount',
    'buildingConstructionYear',
    'facedeCount',
    'floorCount',
    'streetFacadeWidth',
    'kitchenSurface',
    'landSurface',
    'livingRoomSurface',
    'gardenSurface',
    'parkingCountIndoor',
    'parkingCountOutdoor',
    'toiletCount',
    'terraceSurface',
    'epcScore_encoded',
]
df_continuous = df_continuous[continuous_columns]

print(df_continuous)

In [None]:
corr_matrix_cont = df_continuous.corr()
corr_matrix_cont

In [None]:
# Annotated square heatmap of the continuous-feature correlation matrix,
# with the colour scale pinned to the full [-1, 1] range.
plt.figure(figsize=(16, 16))
sns.heatmap(
    corr_matrix_cont,
    annot=True,
    square=True,
    cmap='RdBu',
    vmin=-1,
    vmax=1,
)

#### Correlation for boolean

In [None]:
# 'price' followed by the name of every bool-typed column of df_encoded.
bool_cols = ['price', *df_encoded.select_dtypes(include='bool').columns]
bool_cols

In [None]:
df_boolean = df_encoded[bool_cols]

##### Analysis with Point-biserial correlation

In [None]:
# Point-biserial correlation between each boolean column and the price.
# 'price' is itself a column of df_boolean but is skipped: correlating it with
# itself always gives 1.0, which would sit at the top of the sorted ranking
# built from this dict.
point_biserial_corr = {
    col: stats.pointbiserialr(df_encoded[col], df_encoded['price']).statistic
    for col in df_boolean.columns
    if col != 'price'
}

In [None]:
# One line per column with its point-biserial coefficient.
for feature, coefficient in point_biserial_corr.items():
    print(f"Correlation with {feature} : {coefficient}")

In [None]:
# Rank the columns from the highest coefficient to the lowest.
ranked_pairs = sorted(
    point_biserial_corr.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
sorted_bool_correlation = dict(ranked_pairs)
sorted_bool_correlation

#### Swimming Pool

In [None]:

# Price box plot per value of the encoded swimming-pool column, fliers hidden.
swimming_plot = sns.boxplot(
    data=df_encoded,
    x='hasSwimmingPool_encoded',
    y='price',
    showfliers=False,
)
plt.title('Price by hasSwimmingPool')
plt.show()

In [None]:
# Pearson coefficient between the encoded swimming-pool column and the price.
pool_flag = df_encoded['hasSwimmingPool_encoded']
correlation_swimm_pool = pool_flag.corr(df_encoded['price'])
print(
    f"Pearson correlation for 'price' vs 'hasSwimmingPool': {correlation_swimm_pool}"
)

In [None]:
# Point-biserial correlation (statistic and p-value) between the pool flag and the price.
# NOTE(review): this reads 'hasSwimmingPool' while the box plot and the Pearson
# cell above read 'hasSwimmingPool_encoded' — confirm which column is intended.
stats.pointbiserialr(df_encoded['hasSwimmingPool'], df_encoded['price'] )

#### Province

In [None]:
df_province =df_encoded[['price', 'province']]
df_province 

In [None]:
df_province[['province']].value_counts()

In [None]:
# Price box plot per province, fliers hidden; labels tilted for legibility.
plt.figure(figsize=(12, 6))
sns.boxplot(
    data=df_encoded,
    x='province',
    y='price',
    showfliers=False,
)
plt.xticks(rotation=45)
plt.title('Price by Province')
plt.show()

#### Subtype

In [None]:
df_subtype = df_encoded[['price', 'subtype', 'subtype_encoded']]
df_subtype

In [None]:
# Price box plot per subtype, fliers hidden; labels tilted for legibility.
plt.figure(figsize=(12, 6))
sns.boxplot(
    data=df_subtype,
    x='subtype',
    y='price',
    showfliers=False,
)
plt.xticks(rotation=60)
plt.title('Price by subtype')
plt.show()

#### Habitable Surface

In [None]:
df_encoded['habitableSurface'].describe()

In [None]:
df_surface = df_encoded[['habitableSurface', 'price', 'type', 'subtype']]

In [None]:
df_surface_wo_out = remove_outliers_iqr(df_surface, 'habitableSurface')

In [None]:
# Price against habitable surface (surface outliers removed), coloured by type.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df_surface_wo_out,
    x='habitableSurface',
    y='price',
    hue='type',
    palette='Set2',
)
plt.title('Scatter Plot: Price vs Habitable Surface ')
plt.xlabel('Habitable Surface')
plt.ylabel('Price')
plt.show()

In [None]:
# Pearson coefficient between habitable surface and price on the full frame.
surface_values = df_encoded['habitableSurface']
correlation_surface = surface_values.corr(df_encoded['price'])
print(
    f"Pearson correlation for 'price' vs 'habitableSurface': {correlation_surface}"
)

##### Surface histogram

In [None]:
# Same statement as in the scatter-plot section above; recomputed here so the
# histogram cell below has its input even if that earlier cell was not run.
df_surface_wo_out = remove_outliers_iqr(df_surface, 'habitableSurface')

In [None]:
# 50-bin histogram of habitable surface after removing surface outliers.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(
    df_surface_wo_out['habitableSurface'],
    bins=50,
    color='lightblue',
    edgecolor='black',
)
ax.set_title('Surface Distribution')
ax.set_xlabel('Surface')
ax.set_ylabel('Count')
plt.show()

#### Number of bedrooms

In [None]:
df_encoded['bedroomCount'].value_counts()

In [None]:
df_bedroom = df_encoded[['bedroomCount', 'price']]
df_bedroom

In [None]:
# Price box plot per bedroom count, fliers hidden.
bedroom_plot = sns.boxplot(
    data=df_bedroom,
    x='bedroomCount',
    y='price',
    showfliers=False,
)
plt.title('Price by bedroom')
plt.show()

In [None]:
# Pearson coefficient between bedroom count and price on the full frame.
bedroom_counts = df_encoded['bedroomCount']
correlation_bedroom = bedroom_counts.corr(df_encoded['price'])
print(
    f"Pearson correlation for 'price' vs 'bedroomCount': {correlation_bedroom}"
)

In [None]:
#### Number of bathrooms


In [None]:
# Frequency of each bathroom count. Printed explicitly: only the last
# expression of a cell is echoed, so the bare call would display nothing.
print(df_encoded['bathroomCount'].value_counts())
# NOTE: the name df_bedroom is reused for the bathroom subset because the
# plotting cell that follows reads it.
df_bedroom = df_encoded[['bathroomCount', 'price']]
df_bedroom


In [None]:
# Price box plot per bathroom count, fliers hidden.
bedroom_plot = sns.boxplot(x='bathroomCount', y='price', data=df_bedroom, showfliers=False)
plt.title('Price by bathroom')
plt.show()
# Variable names are carried over from the bedroom section; the printed label
# now names the column that is actually correlated (bathroomCount).
correlation_bedroom = df_encoded['bathroomCount'].corr(df_encoded['price'])
print(f"Pearson correlation for 'price' vs 'bathroomCount': {correlation_bedroom}")