# projet UBER

## import des bibliothèques

In [None]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN

# Import plotly
import plotly.express as px
import plotly.graph_objects as go






## Data année 2014

In [None]:
# Load the six monthly raw pickup files (April to September 2014).
# The directory is declared once so that moving the data only requires
# editing a single line; the resolved file paths are unchanged.
DATA_DIR = r"D:\jedha\full_stack\projet\machine_learning\ML_unsupervised-UBER"
MONTHS_2014 = ("apr", "may", "jun", "jul", "aug", "sep")

(
    uber_2014_1,
    uber_2014_2,
    uber_2014_3,
    uber_2014_4,
    uber_2014_5,
    uber_2014_6,
) = (
    pd.read_csv(f"{DATA_DIR}\\uber-raw-data-{month}14.csv")
    for month in MONTHS_2014
)


In [None]:
# Stack the six monthly DataFrames into a single 2014 dataset.
# ignore_index=True rebuilds a unique 0..n-1 index; without it every monthly
# file contributes its own 0-based labels and the result carries duplicates.
df_uber2014 = pd.concat(
    [uber_2014_1, uber_2014_2, uber_2014_3, uber_2014_4, uber_2014_5, uber_2014_6],
    ignore_index=True,
)
df_uber2014

In [None]:
# Summary statistics for every column, numeric or not.
df_uber2014.describe(include="all")

In [None]:
# Quick structural overview of the concatenated 2014 dataset.
n_rows, n_cols = df_uber2014.shape

print(f"Number of rows : {n_rows}")
print()

print(f"Number of columns : {n_cols}")
print()

print("Display of df_uber2014: ")
display(df_uber2014.head())
print()

print("Basics statistics: ")
data_desc = df_uber2014.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100 * df_uber2014.isnull().sum() / n_rows)

print("Columns type")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only appended a stray "None" to the cell output.
df_uber2014.info()

## data engineering

In [None]:
# Rename the raw timestamp column and parse it as a datetime.
df_uber2014 = df_uber2014.rename(columns={'Date/Time': 'Pickup_date'})
df_uber2014['Pickup_date'] = pd.to_datetime(
    df_uber2014['Pickup_date'],
    format='%m/%d/%Y %H:%M:%S',
)

# Derive the calendar features from the parsed timestamp; each one is stored
# as a string column.
pickup = df_uber2014['Pickup_date'].dt
df_uber2014['annee'] = pickup.year.astype(str)
df_uber2014['mois'] = pickup.month.astype(str)
df_uber2014['jour'] = pickup.day.astype(str)
df_uber2014['jour_sem'] = pickup.day_name().astype(str)
df_uber2014['heure'] = pickup.hour.astype(str)
df_uber2014.head()

### creation de la colonne week end et jours feriés

In [None]:
# Boolean flag: True when the pickup happened on a Saturday or a Sunday.
weekend_days = ['Saturday', 'Sunday']
df_uber2014['weekend'] = df_uber2014['jour_sem'].isin(weekend_days)
df_uber2014

In [None]:
# Count the True values in the "weekend" column (summing booleans counts them).
weekend_flags = df_uber2014["weekend"]
nb_true = weekend_flags.sum()
print(f"Nombre de True : {nb_true}")


In [None]:
# Bar colour per row: red for Saturday/Sunday, steelblue otherwise.
# Computed with a vectorised np.where instead of a Python-level apply(),
# which is much cheaper on the full dataset and yields the same values.
est_weekend = df_uber2014["jour_sem"].isin(["Saturday", "Sunday"])
df_uber2014["couleur"] = np.where(est_weekend, "red", "steelblue")

# Calendar order for the weekday axis; without it the categories are laid out
# in order of first appearance in the data.
ordre_jours = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Histogram of pickups per weekday, using the literal colours stored above.
fig = px.histogram(
    df_uber2014,
    x="jour_sem",
    color="couleur",
    color_discrete_map="identity",  # interpret the column values as real colours
    category_orders={"jour_sem": ordre_jours},
    title='Répartition par jour de semaine',
    height=400,
)
fig.update_layout(title_x=0.5, showlegend=False)  # a legend of colour names adds nothing
fig.show()

# Same simple histogram layout for the weekend flag and for the pickup hour.
for colonne, titre in (
    ("weekend", 'repartition weekend semaine '),
    ("heure", 'repartition par heure'),
):
    fig = px.histogram(df_uber2014, x=colonne, title=titre, height=400)
    fig.update_layout(title_x=0.5)
    fig.show()




In [None]:
# Drop the columns that are not used after this point.
col_drop = ['Base', 'annee', 'mois', 'jour', 'couleur']
df_uber2014_clean = df_uber2014.drop(columns=col_drop)

# Keep only the pickups located inside the New York City bounding box
# (between() is inclusive on both ends, like the original >= / <= tests).
dans_nyc = (
    df_uber2014_clean['Lat'].between(40.4774, 40.9176)
    & df_uber2014_clean['Lon'].between(-74.2591, -73.7004)
)
df_uber2014_clean = df_uber2014_clean[dans_nyc]

# Work on a random sample of at most 10000 rows.
# - random_state makes the sample (and therefore every downstream cluster)
#   reproducible from one run to the next, like the seeded KMeans models.
# - min() avoids the ValueError raised by sample() when fewer rows remain.
taille_echantillon = min(10000, len(df_uber2014_clean))
df_uber2014_clean = df_uber2014_clean.sample(n=taille_echantillon, random_state=42)

df_uber2014_clean

## Preprocessing

In [None]:
# Feature matrix for clustering: latitude and longitude only.
feature_cols = ["Lat", "Lon"]
X = df_uber2014_clean[feature_cols]

X.head(5)

## Kmeans model

### Elbow

In [None]:
# Elbow method: record the inertia (within-cluster sum of squares) of a
# KMeans model for each candidate number of clusters from 2 to 14.
wcss = []
for n_clusters in range(2, 15):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
    wcss.append(kmeans.inertia_)

In [None]:
# Show the list of inertia values collected by the elbow loop.
wcss

In [None]:
# Elbow curve: inertia as a function of the number of clusters.
# Axis labels and a title are set so the chart is self-explanatory, in line
# with the silhouette-score chart of this notebook.
px.line(
    x=list(range(2, 15)),
    y=wcss,
    labels={'x': 'Nombre de clusters (k)', 'y': 'Inertie (WCSS)'},
    title='Inertie (WCSS) en fonction de k',
)

### silhouette score

In [None]:


# Silhouette score of a KMeans clustering for each k from 2 to 14.
ss = []
for n_clusters in range(2, 15):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    # fit_predict trains the model and returns the label of every sample.
    labels = kmeans.fit_predict(X)
    ss.append(silhouette_score(X, labels))

# Print the raw scores for reference.
print(ss)



In [None]:

# One row per tested k with its silhouette score.
k_values = list(range(2, 15))
df_scores = pd.DataFrame({'k': k_values, 'silhouette_score': ss})

# Bar chart of the scores, each bar annotated with its value.
axis_labels = {'k': 'Nombre de clusters (k)', 'silhouette_score': 'Silhouette Score'}
fig = px.bar(
    df_scores,
    x='k',
    y='silhouette_score',
    text='silhouette_score',
    labels=axis_labels,
    title='Silhouette Score en fonction de k',
)
fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')

# Leave some headroom above the tallest bar for the outside text labels.
y_top = max(ss) + 0.1
fig.update_layout(
    yaxis=dict(range=[0, y_top]),
    uniformtext_minsize=8,
    uniformtext_mode='hide',
)
fig.show()



### Fit 

In [None]:
# Final KMeans model; the number of clusters is set here.
n_clusters = 6
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X)

### Visualisation

In [None]:
# Store the KMeans cluster predicted for every sampled pickup.
cluster_ids = kmeans.predict(X)
df_uber2014_clean['Cluster_KMeans'] = cluster_ids
df_uber2014_clean.head()


In [None]:
# Approximate geographic bounds of New York City, used to centre the map.
lat_min, lat_max = 40.4774, 40.9176
lon_min, lon_max = -74.2591, -73.7004

# Plot one marker per pickup, coloured by KMeans cluster.
# The cluster ids are cast to strings for the plot only: with an integer
# column Plotly Express uses a continuous colour scale and ignores
# color_discrete_sequence. The DataFrame itself keeps its integer column.
fig = px.scatter_mapbox(
    df_uber2014_clean.assign(
        Cluster_KMeans=df_uber2014_clean['Cluster_KMeans'].astype(str)
    ),
    lat='Lat',
    lon='Lon',
    color='Cluster_KMeans',
    mapbox_style='carto-positron',
    color_discrete_sequence=px.colors.qualitative.Set2,
)

# Centre the initial view on the middle of the bounding box.
fig.update_layout(
    mapbox=dict(
        center=dict(lat=(lat_min + lat_max) / 2, lon=(lon_min + lon_max) / 2),
        zoom=10,
    ),
    height=800,
    title='Clusters KMeans',
)
fig.show()


## DBSCAN model

In [None]:


# Coordinates as a plain array (this rebinds X, previously a DataFrame).
X = df_uber2014_clean[['Lat', 'Lon']].to_numpy()

# Density-based clustering on the coordinates with the Manhattan metric.
db = DBSCAN(eps=0.01, min_samples=50, metric="manhattan")
labels_dbscan = db.fit_predict(X)
df_uber2014_clean['Cluster_DBSCAN'] = labels_dbscan

# Drop the points labelled -1 (DBSCAN noise) and store the remaining labels
# as strings so they are plotted as discrete categories.
est_bruit = df_uber2014_clean['Cluster_DBSCAN'] == -1
df_dbscan_filtered = df_uber2014_clean[~est_bruit].copy()
df_dbscan_filtered['Cluster_DBSCAN'] = df_dbscan_filtered['Cluster_DBSCAN'].astype(str)

# (Re)compute the weekend flag on the filtered frame.
df_dbscan_filtered['weekend'] = df_dbscan_filtered['jour_sem'].isin(['Saturday', 'Sunday'])

# Map of the clustered pickups, one colour per DBSCAN cluster.
fig = px.scatter_mapbox(
    df_dbscan_filtered,
    lat='Lat',
    lon='Lon',
    color='Cluster_DBSCAN',
    mapbox_style='carto-positron',
    zoom=10,
    height=800,
    title='Clusters DBSCAN',
)
fig.show()


## Evolution par jour de la semaine 

In [None]:
# Calendar order of the weekdays, Monday first.
ordre_jours = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Turn jour_sem into an ordered categorical so that sorting follows the
# calendar rather than the alphabet.
type_jour = pd.CategoricalDtype(categories=ordre_jours, ordered=True)
df_dbscan_filtered['jour_sem'] = df_dbscan_filtered['jour_sem'].astype(type_jour)

# Cluster labels must be strings to be drawn with categorical colours.
df_dbscan_filtered['Cluster_DBSCAN'] = df_dbscan_filtered['Cluster_DBSCAN'].astype(str)

# Sort the rows by weekday so the animation frames come out in calendar order.
df_dbscan_filtered = df_dbscan_filtered.sort_values('jour_sem')


In [None]:

# Animated map: one frame per weekday, markers coloured by DBSCAN cluster.
fig = px.scatter_mapbox(
    df_dbscan_filtered,
    lat='Lat',
    lon='Lon',
    color='Cluster_DBSCAN',
    animation_frame='jour_sem',
    mapbox_style='carto-positron',
    zoom=10,
    height=800,
    title='Clusters DBSCAN animés par jour de la semaine',
    color_discrete_sequence=px.colors.qualitative.Set1,
)

# Centre the view on the mean position of the plotted points.
map_center = {
    "lat": df_dbscan_filtered['Lat'].mean(),
    "lon": df_dbscan_filtered['Lon'].mean(),
}

# Play: run through the frames, 1200 ms per frame with a 500 ms transition.
play_button = {
    "label": "Play",
    "method": "animate",
    "args": [
        None,
        {
            "frame": {"duration": 1200, "redraw": True},
            "fromcurrent": True,
            "transition": {"duration": 500},
        },
    ],
}

# Pause: animate to "no frame" immediately, which stops the playback.
pause_button = {
    "label": "Pause",
    "method": "animate",
    "args": [
        [None],
        {
            "frame": {"duration": 0, "redraw": False},
            "mode": "immediate",
            "transition": {"duration": 0},
        },
    ],
}

fig.update_layout(
    mapbox_center=map_center,
    updatemenus=[{"type": "buttons", "buttons": [play_button, pause_button]}],
)

fig.show()


## Evolution au fil de la journée

In [None]:
# Convert the hour to an integer so that it sorts numerically.
df_dbscan_filtered['heure'] = df_dbscan_filtered['heure'].astype(int)

# Sort by hour so the animation frames appear in chronological order.
df_dbscan_filtered = df_dbscan_filtered.sort_values('heure')

# Cluster labels must be strings to be drawn with categorical colours.
df_dbscan_filtered['Cluster_DBSCAN'] = df_dbscan_filtered['Cluster_DBSCAN'].astype(str)


In [None]:

# Animated map: one frame per hour of the day, markers coloured by DBSCAN cluster.
fig = px.scatter_mapbox(
    df_dbscan_filtered,
    lat='Lat',
    lon='Lon',
    color='Cluster_DBSCAN',
    animation_frame='heure',
    mapbox_style='carto-positron',
    zoom=10,
    height=800,
    title='Clusters DBSCAN animés par heure de la journée',
    color_discrete_sequence=px.colors.qualitative.Set1,
)

# Centre the view on the mean position of the plotted points.
map_center = {
    "lat": df_dbscan_filtered['Lat'].mean(),
    "lon": df_dbscan_filtered['Lon'].mean(),
}

# Play: run through the frames, 800 ms per frame with a 300 ms transition.
play_button = {
    "label": "Play",
    "method": "animate",
    "args": [
        None,
        {
            "frame": {"duration": 800, "redraw": True},
            "fromcurrent": True,
            "transition": {"duration": 300},
        },
    ],
}

# Pause: animate to "no frame" immediately, which stops the playback.
pause_button = {
    "label": "Pause",
    "method": "animate",
    "args": [
        [None],
        {
            "frame": {"duration": 0, "redraw": False},
            "mode": "immediate",
            "transition": {"duration": 0},
        },
    ],
}

fig.update_layout(
    mapbox_center=map_center,
    updatemenus=[{"type": "buttons", "buttons": [play_button, pause_button]}],
)

fig.show()


### Comparaison week end et jour de semaine 

In [None]:
# Split the clustered pickups into weekday rows and weekend rows, using the
# boolean "weekend" column as a mask (and its negation).
est_weekend = df_dbscan_filtered['weekend']
df_semaine = df_dbscan_filtered[~est_weekend].copy()
df_weekend = df_dbscan_filtered[est_weekend].copy()

In [None]:
# Cast the cluster labels of both subsets to integers so they can be used as
# numeric marker colours.
for sous_ensemble in (df_semaine, df_weekend):
    sous_ensemble['Cluster_DBSCAN'] = sous_ensemble['Cluster_DBSCAN'].astype(int)



In [None]:
# Map with two traces (weekdays / weekend) and buttons to toggle between them.

# Shared colour range for both traces. Without explicit cmin/cmax each trace
# scales its colours on its own values, so the same cluster id could be drawn
# in different colours in the two views whenever their cluster sets differ.
tous_les_clusters = pd.concat(
    [df_semaine['Cluster_DBSCAN'], df_weekend['Cluster_DBSCAN']]
).astype(int)
if tous_les_clusters.empty:
    plage_couleurs = {}  # nothing to plot: let Plotly use its defaults
else:
    plage_couleurs = dict(
        cmin=int(tous_les_clusters.min()),
        cmax=int(tous_les_clusters.max()),
    )

fig = go.Figure()

# (data, legend name, initially visible) for each trace; the order matters
# because the buttons below address the traces by position.
for sous_ensemble, nom, visible in (
    (df_semaine, 'Semaine', True),
    (df_weekend, 'Week-end', False),
):
    fig.add_trace(go.Scattermapbox(
        lat=sous_ensemble['Lat'],
        lon=sous_ensemble['Lon'],
        mode='markers',
        marker=dict(
            size=6,
            # numeric cluster id mapped onto the colour scale
            color=sous_ensemble['Cluster_DBSCAN'].astype(int),
            colorscale='Viridis',
            showscale=False,
            **plage_couleurs,
        ),
        name=nom,
        visible=visible,
    ))

fig.update_layout(
    title='Clusters DBSCAN - Semaine vs Week-end',
    mapbox=dict(
        style='carto-positron',
        zoom=10,
        # centre on the mean position of all clustered points
        center=dict(
            lat=df_dbscan_filtered['Lat'].mean(),
            lon=df_dbscan_filtered['Lon'].mean(),
        ),
    ),
    height=800,
    updatemenus=[
        dict(
            type='buttons',
            direction='right',
            x=0.5,
            xanchor='center',
            y=1.1,
            yanchor='top',
            # each button shows exactly one of the two traces
            buttons=[
                dict(label='Semaine', method='update', args=[{'visible': [True, False]}]),
                dict(label='Week-end', method='update', args=[{'visible': [False, True]}]),
            ],
        )
    ],
)

fig.show()
