# ***importing libs***

In [35]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder

from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import  silhouette_score
 
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings


import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# ***Importing dataset and some descriptive stats***

In [36]:
df=pd.read_csv('/content/uber-raw-data-aug14.csv')

In [37]:
df.shape

(612787, 4)

In [38]:
df.head(6)

Unnamed: 0,Date/Time,Lat,Lon,Base
0,8/1/2014 0:03:00,40.7366,-73.9906,B02512
1,8/1/2014 0:09:00,40.726,-73.9918,B02512
2,8/1/2014 0:12:00,40.7209,-74.0507,B02512
3,8/1/2014 0:12:00,40.7387,-73.9856,B02512
4,8/1/2014 0:12:00,40.7323,-74.0077,B02512
5,8/1/2014 0:13:00,40.7349,-74.0033,B02512


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 612787 entries, 0 to 612786
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Date/Time  612787 non-null  object 
 1   Lat        612787 non-null  float64
 2   Lon        612786 non-null  float64
 3   Base       612786 non-null  object 
dtypes: float64(2), object(2)
memory usage: 18.7+ MB


In [40]:
df['Base'].value_counts()

B02617    355803
B02598    220129
B02512     31472
B02682      5382
Name: Base, dtype: int64

In [41]:
df.isnull().any()

Date/Time    False
Lat          False
Lon           True
Base          True
dtype: bool

# ***we take a sample bcs colab's notebook crash when we do plots with all data ***

In [42]:
# data_sample =df.sample(28000)
# data_sample.head()
data_sample=df.copy()

In [43]:
start_time = time.time()

data_sample.loc[:, "Date/Time"] = pd.to_datetime(data_sample["Date/Time"])
data_sample.loc[:, "year"] = data_sample["Date/Time"].dt.year
data_sample.loc[:, "month"] = data_sample["Date/Time"].dt.month
data_sample.loc[:, "day"] = data_sample["Date/Time"].dt.day
data_sample.loc[:, "hour"] = data_sample["Date/Time"].dt.hour

weekdays_dict = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}
dataset=data_sample
dataset.loc[:, "weekday"] = dataset["Date/Time"].dt.weekday.map(weekdays_dict)
dataset = dataset.drop("Date/Time", axis = 1)

print("--- %s seconds ---" % (time.time() - start_time))

dataset.head()

--- 58.92890191078186 seconds ---


Unnamed: 0,Lat,Lon,Base,year,month,day,hour,weekday
0,40.7366,-73.9906,B02512,2014,8,1,0,Friday
1,40.726,-73.9918,B02512,2014,8,1,0,Friday
2,40.7209,-74.0507,B02512,2014,8,1,0,Friday
3,40.7387,-73.9856,B02512,2014,8,1,0,Friday
4,40.7323,-74.0077,B02512,2014,8,1,0,Friday


# ***let's take a specifc day and time to  work on***

In [44]:
friday_data = dataset[(dataset["weekday"] == 'Friday') & (dataset["hour"] == 12)]
friday_data.shape

(4150, 8)

# ***fig before clustering***

In [45]:
#la densité des B dans la map 
start_time = time.time()
fig = px.scatter_mapbox(
        friday_data, 
        lat="Lat", 
        lon="Lon",
        color="Base",
        mapbox_style="carto-positron",
        zoom=10
)

fig.show()

print("--- %s seconds ---" % (time.time() - start_time))


--- 0.05290675163269043 seconds ---


# ***Preprocessing part before silhouette nd elbow***

In [46]:
# Create pipeline for numeric features
numeric_features = ['Lon','Lat'] # Names of numeric columns in X_train/X_test
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median',missing_values=np.nan)), # missing values will be replaced by columns' median
    ('scaler', StandardScaler())
])
# Create pipeline for categorical features
categorical_features = ['Base','year','month','day','hour'] # Names of categorical columns in X_train/X_test
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [47]:
print("Performing preprocessings on train set...")
print(friday_data.head())
X_train = preprocessor.fit_transform(friday_data)
print('...Done.')
print(X_train[0:5]) 
print()

Performing preprocessings on train set...
         Lat      Lon    Base  year  month  day  hour weekday
391  40.7137 -74.0107  B02512  2014      8    1    12  Friday
392  40.7686 -73.8625  B02512  2014      8    1    12  Friday
393  40.7845 -73.9542  B02512  2014      8    1    12  Friday
394  40.7152 -74.0021  B02512  2014      8    1    12  Friday
395  40.7102 -74.0044  B02512  2014      8    1    12  Friday
...Done.
[[-0.61650828 -0.61997646  0.          0.          0.          0.
   0.          0.          0.        ]
 [ 1.58582496  0.62838048  0.          0.          0.          0.
   0.          0.          0.        ]
 [ 0.22311269  0.98992647  0.          0.          0.          0.
   0.          0.          0.        ]
 [-0.48870757 -0.58586834  0.          0.          0.          0.
   0.          0.          0.        ]
 [-0.52288683 -0.69956205  0.          0.          0.          0.
   0.          0.          0.        ]]



# ***what is the best  number of clusters using ELBOW methode***

In [48]:
# Utilisation de la méthode Elbow pour trouver le nombre optimal de clusters
start_time = time.time()

wcss =  []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X_train)
    wcss.append(kmeans.inertia_)
    
print(wcss)

print("--- %s seconds ---" % (time.time() - start_time))





















[10785.183656309786, 9123.797397269913, 7866.757375709684, 7152.627435438097, 6554.887901363427, 6142.290597327015, 5888.770929365677, 5376.892129674659, 4966.885176030063]
--- 7.347017765045166 seconds ---


# From figure we notice that the optimal number of clusters is betwen 4 and 6 

In [49]:
fig = px.line(x = range(2,11), y = wcss)
fig.show()

# ***now silhouette methode ***

In [50]:
# Utilisation du silhouette_score pour déterminer le nombre optimal de clusters
start_time = time.time()

s_score = []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X_train)
    s_score.append(silhouette_score(X_train, kmeans.predict(X_train)))

print(s_score)

print("--- %s seconds ---" % (time.time() - start_time))





















[0.5277423078549452, 0.16021045248344035, 0.23147242900390555, 0.2222903727534389, 0.22414392135558758, 0.21491938771217298, 0.2294221179347226, 0.21329481677512588, 0.26139287110239595]
--- 11.724109172821045 seconds ---


In [51]:
# Affichage de scores en fonction du nombre de clusters
fig = px.bar(x = range(2,11), y = s_score)
fig.show()

# ***the number optiaml of cluerts is 5***

In [54]:
# On ré-entraîne un KMeans avec le nombre optimal de clusters
kmeans = KMeans(n_clusters= 5)
kmeans.fit(X_train)





In [55]:
friday_data.loc[:,'Cluster_KMeans'] = kmeans.predict(X_train)
friday_data.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Lat,Lon,Base,year,month,day,hour,weekday,Cluster_KMeans
391,40.7137,-74.0107,B02512,2014,8,1,12,Friday,3
392,40.7686,-73.8625,B02512,2014,8,1,12,Friday,0
393,40.7845,-73.9542,B02512,2014,8,1,12,Friday,0
394,40.7152,-74.0021,B02512,2014,8,1,12,Friday,3
395,40.7102,-74.0044,B02512,2014,8,1,12,Friday,3


# ***culsters in geo map***

In [132]:
fig1=px.scatter_mapbox(
    friday_data,
    lat="Lat",
    lon="Lon",
    color="Cluster_KMeans",
    mapbox_style="carto-positron",
)
fig1.show()


# ***Now DBSCAN methode ***

In [115]:
# import DBSCAN from sklearn 
from sklearn.cluster import DBSCAN


# Instanciate DBSCAN 
db = DBSCAN(eps=0.05, min_samples=10, metric="manhattan")

# Fit on data 
## No need to normalize data, it already is! 
db.fit(X_train)


**numbers of db labels**

In [116]:
np.unique(db.labels_)

array([-1,  0,  1,  2,  3,  4])

# ***Now we add new col correspond to cluster number***

In [117]:
friday_data["cluster"] = db.labels_
friday_data.head(2)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Lat,Lon,Base,year,month,day,hour,weekday,Cluster_KMeans,cluster
391,40.7137,-74.0107,B02512,2014,8,1,12,Friday,3,-1
392,40.7686,-73.8625,B02512,2014,8,1,12,Friday,0,-1


In [87]:
type(X_train)

numpy.ndarray

In [118]:
# Visualize with plotly 

## Import go to build our own figure
import plotly.graph_objects as go

fig = go.Figure()

# # Loop through each label for our cluster
# for i in np.unique(db.labels_):
#     label = X_train[db.labels_ == i]
#     fig.add_trace(go.Scatter(x=label[:, 0], y=label[:, 1], mode="markers", name="Cluster {}".format(i)))

# fig.show()

# ***it depends on  how strong ur pc to look for best params ***

In [135]:
fig2=px.scatter_mapbox(
    friday_data.loc[friday_data['cluster'] != -1, :],
    lat="Lat",
    lon="Lon",
    color="cluster",
    mapbox_style="carto-positron",
    zoom=11.9
)
fig2.show()

# ***dbscan vs kmeans for a specific day and specific time***

In [136]:
fig1.show()
fig2.show()

# ***Dbscan is not sensible to outliers  comapring to kmeans we just need to choose good settings (min_samples and the epsilon) =(densité et distance)***