In [None]:

# Load essential libraries for data manipulation, visualization, and processing
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from scipy.stats import zscore
import numpy as np


In [None]:

# Read the apartment listings CSV into a DataFrame and preview its first rows
csv_path = 'appartements_data.csv'
data = pd.read_csv(csv_path)
data.head()


In [None]:

# Per-column overview: dtype, number of non-null entries, and percentage of missing entries
column_dtypes = data.dtypes
present_counts = data.notnull().sum()
missing_percent = data.isnull().mean() * 100

data_summary = pd.DataFrame({
    "Column": data.columns,
    "Data Type": column_dtypes,
    "Non-Null Count": present_counts,
    "Percentage Missing": missing_percent,
})
data_summary


In [None]:

# Bar chart of the percentage of missing values, limited to columns that have any gaps
all_missing_pct = data.isnull().mean() * 100
has_missing = all_missing_pct > 0
missing_pct = all_missing_pct[has_missing]

plt.figure(figsize=(12, 6))
missing_ax = sns.barplot(x=missing_pct.index, y=missing_pct, palette="coolwarm")
missing_ax.set(
    title="Percentage of Missing Values by Column",
    xlabel="Columns",
    ylabel="Percentage Missing",
)
plt.xticks(rotation=45)  # tilt the column names so they do not overlap
plt.show()


In [None]:

# Strip everything except digits and the decimal point from 'price' (e.g. a "DH" suffix)
# and convert the remainder to float.
# pd.to_numeric(errors='coerce') maps entries that hold no parsable number (for instance an
# empty string left after stripping a text-only price) to NaN, whereas .astype(float)
# raises ValueError on them. The trailing astype(float) keeps the column float64 even when
# every value happens to be a whole number.
stripped_price = data['price'].str.replace(r'[^\d.]', '', regex=True)
data['price_numeric'] = pd.to_numeric(stripped_price, errors='coerce').astype(float)
data[['price', 'price_numeric']].head()


In [None]:

# Replace missing values in the room-count and area columns with each column's median,
# which is less affected by outliers than the mean.
# The filled Series is assigned back to the frame: calling fillna(inplace=True) on
# data[column] is a chained in-place update, which is deprecated and leaves `data`
# unchanged under pandas Copy-on-Write.
for column in ['salon', 'chamber', 'toilete', 'area']:
    data[column] = data[column].fillna(data[column].median())

# Use KNN for more precise imputation in 'price_numeric'.
# KNNImputer measures distances between rows on the other features; fitted on
# 'price_numeric' alone, a row with a missing price has no defined distance to any
# neighbour and just receives the column mean. The median-filled room/area columns are
# therefore supplied as neighbour features, and only the price column is written back.
knn_columns = ['salon', 'chamber', 'toilete', 'area', 'price_numeric']
imputer = KNNImputer(n_neighbors=5)
# 'price_numeric' is listed last, so it is the last column of the imputed array
data['price_numeric'] = imputer.fit_transform(data[knn_columns])[:, -1]


In [None]:

# Keyword-based classifier that derives a sale type from a listing title
def deduce_sale_type(text):
    """Return "rent" or "sale" depending on keywords found in ``text``.

    ``text`` is converted with ``str()`` and lower-cased before matching, so
    non-string inputs (e.g. NaN or None) are accepted. Rent keywords are
    checked before sale keywords; if no keyword is found, ``None`` is returned.
    """
    lowered = str(text).lower()
    # Order matters: the first group with a matching keyword wins
    keyword_groups = (
        ("rent", ("louer", "location", "loué")),
        ("sale", ("vendre", "vente", "à vendre")),
    )
    for label, keywords in keyword_groups:
        for keyword in keywords:
            if keyword in lowered:
                return label
    return None

# Populate 'type_de_vente' from the title wherever it is missing: existing values are kept,
# gaps are filled with the type deduced from 'title', and rows that remain undetermined are
# labelled "unknown".
# The result is assigned back to the column: fillna(inplace=True) on data['type_de_vente']
# is a chained in-place update, which is deprecated and leaves `data` unchanged under
# pandas Copy-on-Write. Applying the function to the 'title' column also avoids a slower
# row-by-row DataFrame.apply(axis=1).
deduced_type = data['title'].apply(deduce_sale_type)
data['type_de_vente'] = data['type_de_vente'].fillna(deduced_type).fillna("unknown")
data['type_de_vente'].value_counts()


In [None]:

# Flag outliers in 'price_numeric' by Z-score: rows lying more than 3 standard deviations
# below or above the mean
data['price_z'] = zscore(data['price_numeric'])
too_low = data['price_z'] < -3
too_high = data['price_z'] > 3
price_outliers = data[too_low | too_high]
print("Outliers in 'price_numeric':", len(price_outliers))


In [None]:

# Use a boxplot to visualize outliers in 'price_numeric'.
# The series is passed as x= so the box is drawn horizontally and the "Price" x-axis label
# describes the value axis. Passed positionally, current seaborn versions interpret the
# series as `data` and draw the box vertically, leaving the label on the wrong axis.
plt.figure(figsize=(10, 6))
sns.boxplot(x=data['price_numeric'], color="skyblue")
plt.title("Boxplot of Property Prices Showing Outliers")
plt.xlabel("Price")
plt.show()


In [None]:

# Calculate the interquartile range (IQR) of 'area' and flag as outliers the rows lying
# more than 1.5 * IQR below the first quartile or above the third quartile
Q1 = data['area'].quantile(0.25)
Q3 = data['area'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
area_outliers = data[(data['area'] < lower_bound) | (data['area'] > upper_bound)]
print("Outliers in 'area':", len(area_outliers))

# Visualize area with a boxplot.
# The series is passed as x= so the box is drawn horizontally and the x-axis label
# describes the value axis. Passed positionally, current seaborn versions interpret the
# series as `data` and draw the box vertically, leaving the label on the wrong axis.
plt.figure(figsize=(10, 6))
sns.boxplot(x=data['area'], color="lightgreen")
plt.title("Boxplot of Property Area with Outliers")
plt.xlabel("Area (sq meters)")
plt.show()


In [None]:

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Both rescalings below are fitted on the same two input columns
scale_inputs = data[['price_numeric', 'area']]

# Standardization: rescale each column to mean 0 and standard deviation 1
scaler = StandardScaler()
standardized_values = scaler.fit_transform(scale_inputs)
data[['std_price', 'std_area']] = standardized_values

# Min-max normalization: rescale each column into the range 0 to 1
normalizer = MinMaxScaler()
normalized_values = normalizer.fit_transform(scale_inputs)
data[['norm_price', 'norm_area']] = normalized_values

# Preview the original columns next to their rescaled counterparts
comparison_columns = ['price_numeric', 'std_price', 'norm_price', 'area', 'std_area', 'norm_area']
data[comparison_columns].head()


In [None]:

# Histograms (with KDE) on a 2x2 grid: price in its original, standardized and normalized
# form, plus the original area
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (grid position, column, colour, panel title)
panels = [
    ((0, 0), 'price_numeric', "grey", 'Original Price'),
    ((0, 1), 'std_price', "blue", 'Standardized Price'),
    ((1, 0), 'norm_price', "green", 'Normalized Price'),
    ((1, 1), 'area', "purple", 'Original Area'),
]
for (row, col), column_name, colour, panel_title in panels:
    panel_ax = sns.histplot(data[column_name], kde=True, ax=axes[row, col], color=colour)
    panel_ax.set(title=panel_title)

plt.tight_layout()
plt.show()


In [None]:

from imblearn.over_sampling import SMOTE

# SMOTE works on a numeric target, so encode 'type_de_vente' as integer category codes
sale_type_categories = data['type_de_vente'].astype('category')
data['sale_type_code'] = sale_type_categories.cat.codes

# Feature matrix and target used for resampling
feature_columns = ['std_price', 'std_area', 'salon', 'chamber', 'toilete']
features = data[feature_columns]
target = data['sale_type_code']

# Balance the classes of 'sale_type_code' with SMOTE (fixed seed for reproducibility)
smote = SMOTE(random_state=42)
balanced_features, balanced_target = smote.fit_resample(features, target)

# Class distribution after resampling
pd.Series(balanced_target).value_counts()


In [None]:

from sklearn.decomposition import PCA

# Project the resampled feature matrix onto its first two principal components
pca = PCA(n_components=2)
pca_components = pca.fit_transform(balanced_features)

# Put the two components in a DataFrame together with the class code of each resampled row
pca_df = pd.DataFrame(pca_components, columns=['PC1', 'PC2'])
pca_df['sale_type_code'] = balanced_target

# Scatter plot of the two components, coloured by sale type code
plt.figure(figsize=(10, 6))
pca_ax = sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='sale_type_code', palette='coolwarm')
pca_ax.set(
    title="2D PCA of Property Data",
    xlabel="Principal Component 1",
    ylabel="Principal Component 2",
)
plt.legend(title='Sale Type Code')
plt.show()
