In [None]:
#Importation des bibliothèques nécessaires

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os
from urllib.parse import quote

from sqlalchemy import create_engine

# Database connection details. Every value can be overridden through an
# environment variable so that real credentials never have to be written into
# the notebook; the fallbacks are the original local-development values.
database_username = os.environ.get('DATABASE_USERNAME', 'postgres')
database_password = os.environ.get('DATABASE_PASSWORD', '123456')
database_ip       = os.environ.get('DATABASE_IP', '172.26.0.2')
database_name     = os.environ.get('DATABASE_NAME', 'airbnb_paris')
database_port     = os.environ.get('DATABASE_PORT', '5432')

# SQLAlchemy engine for PostgreSQL.
# The user name and password are percent-encoded (safe='' also encodes '/')
# because characters such as '@', ':' or '/' would otherwise be read as URL
# delimiters and corrupt the connection string.
engine = create_engine(
    f"postgresql://{quote(database_username, safe='')}:{quote(database_password, safe='')}"
    f"@{database_ip}:{database_port}/{database_name}"
)


Chargement du dataset

In [None]:
# Load the whole `listing` table through the SQLAlchemy engine into a DataFrame.
query = "SELECT * FROM listing"
df = pd.read_sql(sql=query, con=engine)

# Report the (rows, columns) shape before any cleaning is applied.
print("Forme du jeu de données avant le nettoyage: ", df.shape)

Partie 1: Analyse du Jeu de Données

In [None]:

# Suppression des colonnes non pertinentes

In [None]:
# Columns removed before the analysis, listed in their original order and
# grouped by theme for readability.
columns_to_drop = [
    # Listing identifiers and free-text descriptions
    "ID", "Listing Url", "Scrape ID", "Last Scraped", "Name", "Summary",
    "Space", "Description", "Experiences Offered", "Neighborhood Overview",
    "Notes", "Transit", "Access", "Interaction", "House Rules",
    # Listing pictures
    "Thumbnail Url", "Medium Url", "Picture Url", "XL Picture Url",
    # Host information
    "Host ID", "Host URL", "Host Name", "Host Since", "Host Location",
    "Host About", "Host Response Time", "Host Response Rate",
    "Host Acceptance Rate", "Host Thumbnail Url", "Host Picture Url",
    "Host Neighbourhood", "Host Listings Count",
    "Host Total Listings Count", "Host Verifications",
    # Location
    "Street", "Neighbourhood", "Neighbourhood Cleansed",
    "Neighbourhood Group Cleansed", "City", "State", "Zipcode", "Market",
    "Smart Location", "Country Code", "Country", "Latitude", "Longitude",
    # Room details
    "Room Type", "Bed Type", "Amenities", "Square Feet",
    # Pricing and stay rules
    "Price", "Weekly Price", "Monthly Price", "Security Deposit",
    "Cleaning Fee", "Guests Included", "Extra People",
    "Minimum Nights", "Maximum Nights",
    # Calendar and availability
    "Calendar Updated", "Has Availability", "Availability 30",
    "Availability 60", "Availability 90", "Availability 365",
    "Calendar last Scraped",
    # Review counts, dates and sub-scores
    "Number of Reviews", "First Review", "Last Review",
    "Review Scores Accuracy", "Review Scores Cleanliness",
    "Review Scores Checkin", "Review Scores Communication",
    "Review Scores Location", "Review Scores Value",
    # Miscellaneous
    "License", "Jurisdiction Names", "Cancellation Policy",
    "Calculated host listings count", "Reviews per Month",
    "Geolocation", "Features",
]

# Both calls mutate `df` in place: first remove the listed columns, then keep
# only the first occurrence of each fully identical row.
df.drop(columns_to_drop, axis="columns", inplace=True)
df.drop_duplicates(keep="first", inplace=True)

In [None]:
# Show the first row only, to check which columns remain after the drop.
df.head(n=1)

In [None]:
# Missing values: discard, in place, every row that has at least one missing
# value, then report the resulting (rows, columns) shape.
df.dropna(axis="index", how="any", inplace=True)
print("Forme de l'ensemble de données après nettoyage: ", df.shape)

In [None]:
# Categorical encoding: map each distinct 'Property Type' label to an integer
# code and store the codes in a new column next to the original labels.
encoder = LabelEncoder()
property_type_codes = encoder.fit_transform(df['Property Type'])
df['Property Type Encoded'] = property_type_codes


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max).
# The DataFrame is left as the cell's last expression so that Jupyter renders
# it as a formatted table rather than the plain-text dump produced by print().
df.describe()


In [None]:
# Standardise the numeric columns: collect their names first, then fit the
# scaler on exactly those columns and keep the transformed values.
cols_numeriques = list(df.select_dtypes(include="number").columns)
scaler = StandardScaler()
data_normalized = scaler.fit_transform(df[cols_numeriques])

In [None]:
# Grid of histograms: one panel per numeric column, three panels per row.
n_cols = 3
n_rows = -(-len(cols_numeriques) // n_cols)  # ceiling division

fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))  # each panel is 5x4 inches

for position, col in enumerate(cols_numeriques, start=1):  # subplot slots are 1-based
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df[col], kde=True, bins=30, ax=ax)
    ax.set_title(f'Distribution de {col}')

fig.tight_layout()  # adjust subplot spacing so titles and tick labels do not overlap
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap with two-decimal cell labels.
correlation_matrix = df[cols_numeriques].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Matrice de corrélation')
plt.show()

In [None]:
# Dimensionality reduction: project `data_normalized` onto its first two
# principal components.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(data_normalized)
first_component, second_component = data_pca[:, 0], data_pca[:, 1]

# Scatter plot of the projection, one colour per property type.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    x=first_component,
    y=second_component,
    hue=df['Property Type'],
    palette='viridis',
    ax=ax,
)
ax.set_title('Visualisation PCA des données Airbnb')
ax.set_xlabel('Composante Principale 1')
ax.set_ylabel('Composante Principale 2')
# Anchor the legend outside the axes, to the right of the plot area.
ax.legend(title='Type de Propriété', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# Linear regression: predict 'Review Scores Rating' from the other numeric columns.
target_column = 'Review Scores Rating'
X = df[cols_numeriques].drop(columns=target_column)  # features exclude the target itself
y = df[target_column]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

regressor = LinearRegression().fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Evaluate on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse}")
print(f"R^2: {r2}")


In [None]:
# K-Means clustering of `data_normalized` into 5 clusters (10 initialisations,
# fixed seed), followed by the silhouette score of the resulting assignment.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42).fit(data_normalized)
labels = kmeans.labels_
silhouette = silhouette_score(data_normalized, labels)
print(f"Silhouette Score: {silhouette}")

In [None]:
# Plot the K-Means cluster labels on the two-component PCA projection.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    x=data_pca[:, 0],
    y=data_pca[:, 1],
    hue=labels,
    palette='viridis',
    ax=ax,
)
ax.set_title('Clusters K-Means avec PCA')
ax.set_xlabel('Première Composante Principale')
ax.set_ylabel('Deuxième Composante Principale')
# Anchor the legend outside the axes, to the right of the plot area.
ax.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()