In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the trip records; only the first 50,000 rows of the raw CSV are read.
# NOTE(review): the path is absolute and specific to one machine.
df_paseos = pd.read_csv(
    filepath_or_buffer="/Users/pablo/Desktop/temp/DA/miBici/mibici_2025/data/raw/mibici_2014-2024.csv",
    nrows=50_000,
)
df_paseos.head()

Unnamed: 0.1,Unnamed: 0,Trip_Id,User_Id,Sex,Birth_year,Trip_start,Trip_end,Origin_Id,Destination_Id,Age,Duration
0,0,32244893,1470734,M,1981,2024-01-31 23:59:33,2024-02-01 00:11:15,24,86,43,0 days 00:11:42
1,1,32244892,2731702,M,1994,2024-01-31 23:59:06,2024-02-01 00:10:49,48,279,30,0 days 00:11:43
2,2,32244891,1431452,M,2001,2024-01-31 23:58:48,2024-02-01 00:01:42,273,383,23,0 days 00:02:54
3,3,32244890,2312602,F,2003,2024-01-31 23:58:44,2024-02-01 00:01:58,273,383,21,0 days 00:03:14
4,4,32244889,2266427,M,1999,2024-01-31 23:58:44,2024-02-01 00:01:39,273,383,25,0 days 00:02:55


In [3]:
# Load the station catalogue; the file is decoded as Latin-1.
# NOTE(review): the path is absolute and specific to one machine.
estaciones = pd.read_csv(
    filepath_or_buffer='/Users/pablo/Desktop/temp/DA/miBici/mibici_2025/data/raw/nomenclatura_2025_10.csv',
    encoding="latin1",
)
estaciones.head()

Unnamed: 0,id,name,obcn,location,latitude,longitude,status
0,2,(GDL-001) C. Epigmenio Glez./ Av. 16 de Sept.,GDL-001,POLÍGONO CENTRAL,20.666378,-103.34882,IN_SERVICE
1,3,(GDL-002) C. Colonias / Av. Niños héroes,GDL-002,POLÍGONO CENTRAL,20.667228,-103.366,IN_SERVICE
2,4,(GDL-003) C. Vidrio / Av. Chapultepec,GDL-003,POLÍGONO CENTRAL,20.66769,-103.368252,IN_SERVICE
3,5,(GDL-004) C. Ghilardi /C. Miraflores,GDL-004,POLÍGONO CENTRAL,20.691847,-103.362549,IN_SERVICE
4,6,(GDL-005) C. San Diego /Calzada Independencia,GDL-005,POLÍGONO CENTRAL,20.681158,-103.339363,IN_SERVICE


In [4]:
# Show column dtypes and non-null counts of the trips frame as loaded.
df_paseos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      50000 non-null  int64 
 1   Trip_Id         50000 non-null  int64 
 2   User_Id         50000 non-null  int64 
 3   Sex             50000 non-null  object
 4   Birth_year      50000 non-null  int64 
 5   Trip_start      50000 non-null  object
 6   Trip_end        50000 non-null  object
 7   Origin_Id       50000 non-null  int64 
 8   Destination_Id  50000 non-null  int64 
 9   Age             50000 non-null  int64 
 10  Duration        50000 non-null  object
dtypes: int64(7), object(4)
memory usage: 4.2+ MB


In [5]:
# Summary statistics for the birth-year and age columns of the trips frame.
df_paseos.loc[:, ['Birth_year', 'Age']].describe()

Unnamed: 0,Birth_year,Age
count,50000.0,50000.0
mean,1989.56786,34.43214
std,10.809846,10.809846
min,1920.0,17.0
25%,1984.0,27.0
50%,1992.0,32.0
75%,1997.0,40.0
max,2007.0,104.0


In [6]:
# Drop the 'Unnamed: 0' column (it appears to be a leftover row index from
# an earlier CSV export). Rebinding instead of inplace=True, together with
# errors='ignore', keeps the cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
df_paseos = df_paseos.drop(columns='Unnamed: 0', errors='ignore')

In [7]:
# Normalise column names: trim surrounding whitespace first, then lower-case
# and replace inner spaces with underscores. Stripping last would be a no-op,
# because leading/trailing spaces would already have become underscores.
df_paseos.columns = df_paseos.columns.str.strip().str.lower().str.replace(' ', '_')

# Parse trip_end and trip_start as datetimes; unparseable values become NaT.
df_paseos['trip_end'] = pd.to_datetime(df_paseos['trip_end'], errors='coerce')
df_paseos['trip_start'] = pd.to_datetime(df_paseos['trip_start'], errors='coerce')

# errors='coerce' hides bad timestamps as NaT, and NaT cannot be cast to int
# further down. Fail here with an explicit message rather than with an opaque
# integer-cast error.
n_invalid = int(df_paseos[['trip_start', 'trip_end']].isna().any(axis=1).sum())
if n_invalid:
    raise ValueError(
        f"{n_invalid} trips have a missing or unparseable trip_start/trip_end"
    )

# Year-month period of the trip start.
df_paseos['date'] = df_paseos['trip_start'].dt.to_period('M')

# Trip duration rounded to whole minutes.
df_paseos['duration_m'] = (
    (df_paseos['trip_end'] - df_paseos['trip_start']).dt.total_seconds() / 60
).round().astype(int)

df_paseos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   trip_id         50000 non-null  int64         
 1   user_id         50000 non-null  int64         
 2   sex             50000 non-null  object        
 3   birth_year      50000 non-null  int64         
 4   trip_start      50000 non-null  datetime64[ns]
 5   trip_end        50000 non-null  datetime64[ns]
 6   origin_id       50000 non-null  int64         
 7   destination_id  50000 non-null  int64         
 8   age             50000 non-null  int64         
 9   duration        50000 non-null  object        
 10  date            50000 non-null  period[M]     
 11  duration_m      50000 non-null  int64         
dtypes: datetime64[ns](2), int64(7), object(2), period[M](1)
memory usage: 4.6+ MB


In [8]:
# Preview the first rows of the trips frame.
df_paseos.head()

Unnamed: 0,trip_id,user_id,sex,birth_year,trip_start,trip_end,origin_id,destination_id,age,duration,date,duration_m
0,32244893,1470734,M,1981,2024-01-31 23:59:33,2024-02-01 00:11:15,24,86,43,0 days 00:11:42,2024-01,12
1,32244892,2731702,M,1994,2024-01-31 23:59:06,2024-02-01 00:10:49,48,279,30,0 days 00:11:43,2024-01,12
2,32244891,1431452,M,2001,2024-01-31 23:58:48,2024-02-01 00:01:42,273,383,23,0 days 00:02:54,2024-01,3
3,32244890,2312602,F,2003,2024-01-31 23:58:44,2024-02-01 00:01:58,273,383,21,0 days 00:03:14,2024-01,3
4,32244889,2266427,M,1999,2024-01-31 23:58:44,2024-02-01 00:01:39,273,383,25,0 days 00:02:55,2024-01,3


In [9]:
# Preview the first rows of the station catalogue.
estaciones.head()

Unnamed: 0,id,name,obcn,location,latitude,longitude,status
0,2,(GDL-001) C. Epigmenio Glez./ Av. 16 de Sept.,GDL-001,POLÍGONO CENTRAL,20.666378,-103.34882,IN_SERVICE
1,3,(GDL-002) C. Colonias / Av. Niños héroes,GDL-002,POLÍGONO CENTRAL,20.667228,-103.366,IN_SERVICE
2,4,(GDL-003) C. Vidrio / Av. Chapultepec,GDL-003,POLÍGONO CENTRAL,20.66769,-103.368252,IN_SERVICE
3,5,(GDL-004) C. Ghilardi /C. Miraflores,GDL-004,POLÍGONO CENTRAL,20.691847,-103.362549,IN_SERVICE
4,6,(GDL-005) C. San Diego /Calzada Independencia,GDL-005,POLÍGONO CENTRAL,20.681158,-103.339363,IN_SERVICE


In [10]:
# Number of stations per location value.
estaciones['location'].value_counts()

location
POLÍGONO CENTRAL     287
TLQ-CORREDORATLAS     49
ZAPOPAN CENTRO        47
Name: count, dtype: int64

In [11]:
# Number of stations per status value.
estaciones['status'].value_counts()

status
IN_SERVICE        366
NOT_IN_SERVICE     17
Name: count, dtype: int64

In [12]:
# Clean the station catalogue: rename 'obcn' to 'station_code' and remove
# every parenthesised group (plus any whitespace after it) from 'name'.
estaciones = (
    estaciones
    .rename(columns={'obcn': 'station_code'})
    .assign(name=lambda frame: frame["name"].str.replace(r"\([^)]*\)\s*", "", regex=True))
)
estaciones.head()

Unnamed: 0,id,name,station_code,location,latitude,longitude,status
0,2,C. Epigmenio Glez./ Av. 16 de Sept.,GDL-001,POLÍGONO CENTRAL,20.666378,-103.34882,IN_SERVICE
1,3,C. Colonias / Av. Niños héroes,GDL-002,POLÍGONO CENTRAL,20.667228,-103.366,IN_SERVICE
2,4,C. Vidrio / Av. Chapultepec,GDL-003,POLÍGONO CENTRAL,20.66769,-103.368252,IN_SERVICE
3,5,C. Ghilardi /C. Miraflores,GDL-004,POLÍGONO CENTRAL,20.691847,-103.362549,IN_SERVICE
4,6,C. San Diego /Calzada Independencia,GDL-005,POLÍGONO CENTRAL,20.681158,-103.339363,IN_SERVICE


In [13]:
# Origin lookup table: station id and station code, renamed so they can be
# joined to trips on 'origin_id'.

df_origin = (
    estaciones
    .loc[:, ['id', 'station_code']]
    .rename(columns={'id': 'origin_id', 'station_code': 'station_code_origin'})
)
df_origin.head()

Unnamed: 0,origin_id,station_code_origin
0,2,GDL-001
1,3,GDL-002
2,4,GDL-003
3,5,GDL-004
4,6,GDL-005


In [14]:
# Destination lookup table: station id and station code, renamed so they can
# be joined to trips on 'destination_id'.

df_destination = (
    estaciones
    .loc[:, ['id', 'station_code']]
    .rename(columns={'id': 'destination_id', 'station_code': 'station_code_destination'})
)
df_destination.head()

Unnamed: 0,destination_id,station_code_destination
0,2,GDL-001
1,3,GDL-002
2,4,GDL-003
3,5,GDL-004
4,6,GDL-005


In [18]:
# Attach the origin and destination station codes to every trip.
#
# how='left' keeps trips whose station id is missing from the station
# catalogue (their code is NaN) instead of silently dropping them, as the
# default inner join would.
# validate='many_to_one' raises pandas.errors.MergeError if a station id is
# repeated in the lookup table, which would otherwise duplicate trip rows.

df = df_paseos.merge(df_origin, on='origin_id', how='left', validate='many_to_one')
df = df.merge(df_destination, on='destination_id', how='left', validate='many_to_one')

df.head()

Unnamed: 0,trip_id,user_id,sex,birth_year,trip_start,trip_end,origin_id,destination_id,age,duration,date,duration_m,station_code_origin,station_code_destination
0,32244893,1470734,M,1981,2024-01-31 23:59:33,2024-02-01 00:11:15,24,86,43,0 days 00:11:42,2024-01,12,GDL-022,GDL-084
1,32244892,2731702,M,1994,2024-01-31 23:59:06,2024-02-01 00:10:49,48,279,30,0 days 00:11:43,2024-01,12,GDL-046,GDL-206
2,32244891,1431452,M,2001,2024-01-31 23:58:48,2024-02-01 00:01:42,273,383,23,0 days 00:02:54,2024-01,3,GDL-200,GDL-240
3,32244890,2312602,F,2003,2024-01-31 23:58:44,2024-02-01 00:01:58,273,383,21,0 days 00:03:14,2024-01,3,GDL-200,GDL-240
4,32244889,2266427,M,1999,2024-01-31 23:58:44,2024-02-01 00:01:39,273,383,25,0 days 00:02:55,2024-01,3,GDL-200,GDL-240
