This script purpose is just to merge and clean trips2022.csv, trips2023.csv and trips2024.csv 
It is separated from the main file for optimization of resources and kernel.

In [1]:
import pandas as pd

### Import

In [2]:
import pandas as pd
import os

base_path = "/home/chona/code/ignaciogomenuka/ChallengeDecentraland/data/Rides/"


years = [2022, 2023, 2024]

chunk_size = 1_000_000

dataframes = []

print("📥 Merging raw data from 2022, 2023, and 2024...")

# Load each year in chunks to optimize Kernel
for year in years:
    file_path = f"{base_path}trips_{year}.csv"

    if os.path.exists(file_path):
        print(f"🔄 Processing {year}...")

        for chunk in pd.read_csv(file_path, low_memory=True, dtype=str, chunksize=chunk_size):
            chunk["year"] = year  
            dataframes.append(chunk)

        print(f"✅ {year} loaded successfully!")
    else:
        print(f"⚠️ Skipping {year}: File not found!")

rides_df_raw = pd.concat(dataframes, ignore_index=True)

merged_file_path = f"{base_path}trips_2022_2024_raw.csv"
rides_df_raw.to_csv(merged_file_path, index=False)

print(f"✅ Merged dataset saved at: {merged_file_path}")
print(f"✅ Final dataset size: {len(rides_df_raw)} rows, {len(rides_df_raw.columns)} columns")


📥 Merging raw data from 2022, 2023, and 2024...
🔄 Processing 2022...
✅ 2022 loaded successfully!
🔄 Processing 2023...
✅ 2023 loaded successfully!
🔄 Processing 2024...
✅ 2024 loaded successfully!
✅ Merged dataset saved at: /home/chona/code/ignaciogomenuka/ChallengeDecentraland/data/Rides/trips_2022_2024_raw.csv
✅ Final dataset size: 8480620 rows, 21 columns


In [3]:
import pandas as pd

null_counts = rides_df_raw.isnull().sum().sort_values(ascending=False)

null_percentage = (null_counts / len(rides_df_raw)) * 100

missing_data = pd.DataFrame({
    "Missing Values": null_counts,
    "Percentage (%)": null_percentage
})

print("Missing Values Per Column:")
print(missing_data[missing_data["Missing Values"] > 0])


📊 Missing Values Per Column:
                            Missing Values  Percentage (%)
Género                             5574121       65.727753
X                                  5557815       65.535480
género                             2944951       34.725657
Unnamed: 0                         2935484       34.614026
nombre_estacion_destino                  2        0.000024
direccion_estacion_destino               2        0.000024
lat_estacion_destino                     2        0.000024
id_estacion_destino                      2        0.000024
long_estacion_destino                    2        0.000024


In [4]:
import pandas as pd


rides_df_raw.columns = rides_df_raw.columns.str.lower()

if "género" in rides_df_raw.columns and "género" in rides_df_raw.columns:
    rides_df_raw["género"] = rides_df_raw["género"].fillna(rides_df_raw["género"])   

print(rides_df_raw["género"].head(10))
print(f"'género' column unified successfully!")


   género género
0  FEMALE    NaN
1    MALE    NaN
2  FEMALE    NaN
3  FEMALE    NaN
4   OTHER    NaN
5   OTHER    NaN
6  FEMALE    NaN
7    MALE    NaN
8  FEMALE    NaN
9    MALE    NaN
'género' column unified successfully!


In [5]:
rides_df_raw.drop(columns=["x", "unnamed: 0"], inplace=True, errors="ignore")

print(f" Columns 'X' and 'Unnamed: 0' have been dropped.")
print(f" Remaining columns: {rides_df_raw.columns}")


 Columns 'X' and 'Unnamed: 0' have been dropped.
 Remaining columns: Index(['id_recorrido', 'duracion_recorrido', 'fecha_origen_recorrido',
       'id_estacion_origen', 'nombre_estacion_origen',
       'direccion_estacion_origen', 'long_estacion_origen',
       'lat_estacion_origen', 'fecha_destino_recorrido', 'id_estacion_destino',
       'nombre_estacion_destino', 'direccion_estacion_destino',
       'long_estacion_destino', 'lat_estacion_destino', 'id_usuario',
       'modelo_bicicleta', 'género', 'year', 'género'],
      dtype='object')


In [6]:
cleaned_file_path = "/home/chona/code/ignaciogomenuka/ChallengeDecentraland/data/Rides/trips_2022_2024_cleaned.csv"

rides_df_raw.to_csv(cleaned_file_path, index=False)

print(f"✅ Cleaned dataset saved successfully at: {cleaned_file_path}")


✅ Cleaned dataset saved successfully at: /home/chona/code/ignaciogomenuka/ChallengeDecentraland/data/Rides/trips_2022_2024_cleaned.csv
