# Getaround project research

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots



## Load data

In [2]:
# Read the raw Getaround delay-analysis workbook and preview its first rows.
rawdata = pd.read_excel(io='../data/get_around_delay_analysis.xlsx')
rawdata.head()

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
0,505000,363965,mobile,canceled,,,
1,507750,269550,mobile,ended,-81.0,,
2,508131,359049,connect,ended,70.0,,
3,508865,299063,connect,canceled,,,
4,511440,313932,mobile,ended,,,


## Basic stats

In [3]:
# Basic stats: dataset size, schema, summary statistics, cardinality and
# share of missing values per column.
# Printing this string yields: blank line, dashed rule, blank line.
section_break = "\n---------------------------\n"

print("Taille du dataset:")
print(f"Number of rows : {rawdata.shape[0]}")
print(f"Number of columns : {rawdata.shape[1]}")
print(section_break)

print("Basics infos:")
print()
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in display(): doing so rendered a stray "None" under the report.
rawdata.info()
print(section_break)

print("Basics statistics: ")
print()
# include='all' also summarises the non-numeric columns (unique/top/freq).
data_desc = rawdata.describe(include='all')
display(data_desc)
print(section_break)

print("Unique elements by feature: ")
print()
display(rawdata.nunique().sort_values())
print(section_break)

print("Percentage of missing values: ")
print()
display(100*rawdata.isnull().sum()/rawdata.shape[0])

Taille du dataset:
Number of rows : 21310
Number of columns : 7

---------------------------

Basics infos:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21310 entries, 0 to 21309
Data columns (total 7 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   rental_id                                   21310 non-null  int64  
 1   car_id                                      21310 non-null  int64  
 2   checkin_type                                21310 non-null  object 
 3   state                                       21310 non-null  object 
 4   delay_at_checkout_in_minutes                16346 non-null  float64
 5   previous_ended_rental_id                    1841 non-null   float64
 6   time_delta_with_previous_rental_in_minutes  1841 non-null   float64
dtypes: float64(3), int64(2), object(2)
memory usage: 1.1+ MB


None


---------------------------

Basics statistics: 



Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
count,21310.0,21310.0,21310,21310,16346.0,1841.0,1841.0
unique,,,2,2,,,
top,,,mobile,ended,,,
freq,,,17003,18045,,,
mean,549712.880338,350030.603426,,,59.701517,550127.411733,279.28843
std,13863.446964,58206.249765,,,1002.561635,13184.023111,254.594486
min,504806.0,159250.0,,,-22433.0,505628.0,0.0
25%,540613.25,317639.0,,,-36.0,540896.0,60.0
50%,550350.0,368717.0,,,9.0,550567.0,180.0
75%,560468.5,394928.0,,,67.0,560823.0,540.0



---------------------------

Unique elements by feature: 



checkin_type                                      2
state                                             2
time_delta_with_previous_rental_in_minutes       25
delay_at_checkout_in_minutes                   1745
previous_ended_rental_id                       1788
car_id                                         8143
rental_id                                     21310
dtype: int64


---------------------------

Percentage of missing values: 



rental_id                                      0.000000
car_id                                         0.000000
checkin_type                                   0.000000
state                                          0.000000
delay_at_checkout_in_minutes                  23.294228
previous_ended_rental_id                      91.360863
time_delta_with_previous_rental_in_minutes    91.360863
dtype: float64

**Observations:**

Nous avons 2 variables de type catégoriel :
- checkin_type: mobile/connect
- state: canceled/ended

et nous avons des colonnes avec beaucoup de valeurs manquantes :
- previous_ended_rental_id --> 91.36%
- time_delta_with_previous_rental_in_minutes --> 91.36%
- delay_at_checkout_in_minutes --> 23.29%

Il serait intéressant de :
- Voir le nombre de locations se faisant soit via mobile, soit via connect.
- Investiguer les données manquantes et décider comment les traiter.
- Voir s'il existe des valeurs aberrantes pour les colonnes : delay_at_checkout_in_minutes, time_delta_with_previous_rental_in_minutes

## EDA

In [4]:
# Work on an independent (deep) copy; the following cells rebind and extend
# `dataset` while `rawdata` keeps the data exactly as loaded.
dataset = rawdata.copy(deep=True)

In [5]:
# Pie chart: share of rows per check-in type.
fig = px.pie(data_frame=dataset, names='checkin_type', title='Distribution de Checkin Type')
fig.show()


### Nettoyage des données

Nous pouvons supprimer les données pour les courses annulées, nous n'en avons pas besoin pour étudier les retards pour rendre les véhicules.

In [6]:
# Keep only the rentals whose state is "ended". The explicit copy makes the
# result an independent frame, so adding columns to it afterwards does not
# trigger pandas' SettingWithCopyWarning.
dataset = dataset[dataset["state"] == "ended"].copy()

Ajout d'une colonne représentant le nombre de fois qu'un véhicule a été loué.

In [7]:
# Number of rows (rentals) sharing the same car_id, repeated on every row of
# that car. assign() builds a new frame instead of writing into a filtered
# slice, which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
dataset = dataset.assign(
    rental_count=dataset.groupby('car_id')['car_id'].transform('count')
)
dataset.head()

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes,rental_count
1,507750,269550,mobile,ended,-81.0,,,3
2,508131,359049,connect,ended,70.0,,,8
4,511440,313932,mobile,ended,,,,1
5,511626,398802,mobile,ended,-203.0,,,9
6,511639,370585,connect,ended,-15.0,563782.0,570.0,5


In [18]:
# Inspect the rentals recorded for one car (car_id 359049).
test = dataset.loc[dataset["car_id"].eq(359049)]
test.head(10)

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes,rental_count
2,508131,359049,connect,ended,70.0,,,8
6391,546894,359049,connect,ended,2.0,,,8
8219,544433,359049,connect,ended,-110.0,,,8
8992,553735,359049,connect,ended,33.0,550645.0,390.0,8
9754,537126,359049,connect,ended,-156.0,539408.0,720.0,8
13600,539408,359049,connect,ended,-6.0,,,8
15866,547579,359049,connect,ended,32.0,,,8
17483,550645,359049,connect,ended,249.0,,,8


In [17]:
# Look up a single rental (rental_id 550645) to check its previous-rental fields.
test2 = dataset.loc[dataset["rental_id"].eq(550645)]
test2.head()

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes,rental_count
17483,550645,359049,connect,ended,249.0,,,8


On peut voir que la donnée previous_ended_rental_id n'est pas forcément renseignée.

In [9]:
# Side-by-side box plots to look at both distributions and spot outliers.
panel_titles = (
    "Délai d'attente pour rendre le véhicule en minutes",
    "Délai par rapport à la location précédente en minutes",
)
fig = make_subplots(rows=1, cols=2, subplot_titles=panel_titles)

delay_box = go.Box(y=dataset['delay_at_checkout_in_minutes'], name='Delay')
delta_box = go.Box(y=dataset['time_delta_with_previous_rental_in_minutes'], name='Time delta')
fig.add_trace(delay_box, row=1, col=1)
fig.add_trace(delta_box, row=1, col=2)

# Render the figure
fig.show()

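Il y a des délais énormes : d'après les statistiques du jeu de données brut ci-dessus, delay_at_checkout_in_minutes descend jusqu'à -22433 minutes, pour un écart-type d'environ 1003 minutes.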

In [10]:
# Show outliers: values lying outside [mean - 3*std, mean + 3*std]
def ouliers_viewver(dataset, columns=None):
    """
    Display the number of outliers found in each column of a Pandas dataset.

    A value counts as an outlier when it lies outside
    [mean - 3 * std, mean + 3 * std] of its column (three-sigma rule).
    Missing values are never counted: NaN fails both comparisons.

    Parameters:
    dataset (pd.DataFrame): Pandas dataset
    columns (list): list of the columns in dataset to check outliers.
        When None or empty, every numeric column is checked (mean and std
        are not defined for text columns).

    Returns:
    None. The counts are shown as a table and as a bar chart.
    """
    # None replaces the former mutable default ([]); an empty list keeps its
    # old meaning of "use the default columns".
    if columns is None or len(columns) < 1:
        columns = dataset.select_dtypes(include="number").columns

    outliers_count = {}
    for col in columns:
        mean = dataset[col].mean()
        std = dataset[col].std()

        # Three-sigma rule
        lower_bound = mean - 3 * std
        upper_bound = mean + 3 * std

        # Boolean mask of the values outside the bounds
        outliers = (dataset[col] < lower_bound) | (dataset[col] > upper_bound)
        outliers_count[col] = outliers.sum()

    outliers_df = pd.DataFrame(list(outliers_count.items()), columns=['Column', 'Outliers'])
    display(outliers_df)
    fig = px.bar(outliers_df, x='Column', y='Outliers', title='Outliers count by column')
    fig.show()


# Count the three-sigma outliers of the checkout delay column only.
ouliers_viewver(dataset=dataset, columns=["delay_at_checkout_in_minutes"])

Unnamed: 0,Column,Outliers
0,delay_at_checkout_in_minutes,70


In [11]:
# Drop rows holding a missing value or an outlier, i.e. a value outside
# [mean - 3*std, mean + 3*std], in any of the checked columns.
def delete_ouliers(dataset, columns=None):
    """
    Delete outliers from Pandas dataset.

    A row is kept only when, for every checked column, its value lies inside
    [mean - 3 * std, mean + 3 * std] of that column (three-sigma rule).
    Rows with a missing value in a checked column are removed as well,
    because NaN fails the bounds test.

    Parameters:
    dataset (pd.DataFrame): Pandas dataset
    columns (list): list of the columns in dataset to check outliers.
        When None or empty, every numeric column is checked (mean and std
        are not defined for text columns).

    Returns:
    pd.DataFrame: clean dataset, as a new frame independent from the input
    """
    # None replaces the former mutable default ([]); an empty list keeps its
    # old meaning of "use the default columns".
    if columns is None or len(columns) < 1:
        columns = dataset.select_dtypes(include="number").columns

    masks = []
    for col in columns:
        mean = dataset[col].mean()
        std = dataset[col].std()

        # Three-sigma rule
        lower_bound = mean - 3 * std
        upper_bound = mean + 3 * std

        # True where the value is inside the bounds (both ends inclusive)
        masks.append(dataset[col].between(lower_bound, upper_bound))

    # Nothing to check (e.g. no numeric column): there is nothing to drop,
    # and pd.concat would raise on an empty list.
    if not masks:
        return dataset.copy()

    # A row survives only if it passes the check in every column
    # example:
    # row1 = [0,1,1] -> [0]
    # row2 = [1,1,1] -> [1]
    final_mask = pd.concat(masks, axis=1).all(axis=1)
    return dataset.loc[final_mask, :].copy()

print("old dataset shape:", dataset.shape)
# Note: rows whose checkout delay is missing are removed as well, since a NaN
# never satisfies the bounds test applied by delete_ouliers.
dataset = delete_ouliers(dataset, columns=["delay_at_checkout_in_minutes"])
print("New dataset shape:", dataset.shape)

# Box plot of the checkout delays that remain after outlier removal.
# subplot_titles must be a sequence of titles: the former ("...") without a
# trailing comma was a plain string, not a one-element tuple.
fig = make_subplots(
    rows=1,
    cols=1,
    subplot_titles=["Délai d'attente pour rendre le véhicule en minutes"],
)

fig.add_trace(go.Box(y=dataset['delay_at_checkout_in_minutes'], name='Delay'), row=1, col=1)

# Render the figure
fig.show()

old dataset shape: (18045, 8)
New dataset shape: (16275, 8)


In [12]:
# Correlation matrix
corr_columns = [
    'rental_id',
    'car_id',
    'checkin_type',
    'delay_at_checkout_in_minutes',
    'previous_ended_rental_id',
    'time_delta_with_previous_rental_in_minutes',
    'rental_count',
]
# Encode checkin_type as 1 for 'connect' and 0 otherwise so it can enter the
# correlation. assign() works on a new frame, so nothing is written into a
# slice of `dataset` (no SettingWithCopyWarning).
corr_dataset = dataset[corr_columns].assign(
    checkin_type=lambda frame: frame['checkin_type'].eq('connect').astype(int)
)

corr_matrix = corr_dataset.corr().round(2)
fig = ff.create_annotated_heatmap(
    corr_matrix.values,
    x=corr_matrix.columns.tolist(),
    y=corr_matrix.index.tolist(),
    colorscale='Viridis',
    showscale=True,
    zmin=-1,
    zmax=1,
)
fig.update_layout(
    width=800,
    height=600,
    margin=dict(t=50, r=50, l=50, b=50),
)

fig.show()

Il n'y a pas de fortes corrélations entre les données. Juste un peu entre le checkin_type et le nombre de locations pour un véhicule. Cela est sûrement dû au fait qu'un utilisateur louant beaucoup son véhicule utilise un type de checkin (connect ou mobile) et doit rester sur ce mode.

In [13]:
# One histogram of checkout delays per check-in type, side by side.
# Each entry: (value in checkin_type, trace name, bar colour).
panels = [
    ('mobile', 'Mobile', 'blue'),
    ('connect', 'Connect', 'orange'),
]

fig = make_subplots(rows=1, cols=2, subplot_titles=('Mobile', 'Connect'))

for position, (checkin, label, colour) in enumerate(panels, start=1):
    delays = dataset.loc[dataset['checkin_type'] == checkin, 'delay_at_checkout_in_minutes']
    fig.add_trace(
        go.Histogram(x=delays, name=label, marker_color=colour),
        row=1, col=position
    )

fig.update_layout(title_text='Nombre de location pour différents créneaux de temps')

# Render the plot
fig.show()

Nettoyage des données.