# In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import csv
from tqdm import tqdm 

def load_reference_data(filename):
    """Read the reference CSV at *filename* and return it as a DataFrame.

    Args:
        filename: Path (or any other source accepted by ``pd.read_csv``)
            of the reference CSV file.

    Returns:
        The ``pandas.DataFrame`` parsed with ``pd.read_csv`` defaults.
    """
    return pd.read_csv(filename)

def random_date(start, end):
    """Return a random moment between *start* and *end*, both inclusive.

    The offset from *start* is drawn uniformly in whole seconds over the
    actual span. Drawing a day offset and a time-of-day offset separately
    would allow results up to one day past *end* (and, for spans shorter
    than a day, almost always beyond it).

    Args:
        start: Lower bound; a ``datetime`` or a compatible type such as
            ``pandas.Timestamp``.
        end: Upper bound, of the same kind as *start*.

    Returns:
        ``start`` plus a random whole number of seconds, never later than
        ``end``.

    Raises:
        ValueError: If *end* is earlier than *start*.
    """
    if end < start:
        raise ValueError("end must not be earlier than start")
    # Sub-second remainders of the span are truncated, so the result can
    # never exceed `end`.
    span_seconds = int((end - start).total_seconds())
    return start + timedelta(seconds=random.randint(0, span_seconds))

def generate_synthetic_trips(reference_csv, output_csv, num_rows):
    """Write *num_rows* synthetic rows modelled on a reference CSV.

    The output file has the same columns, in the same order, as the
    reference file. Each cell is sampled independently of the others:

    * ``region``, ``datasource``, ``city``, ``origin_coord`` and
      ``destination_coord`` are drawn uniformly from the distinct non-null
      values of the reference column.
    * ``departure_time`` is a random timestamp between the earliest and
      latest parseable reference value, formatted ``%Y-%m-%d %H:%M:%S``.
      If the column holds no parseable value, the range 2020-01-01 to
      2023-12-31 is used.
    * Any other column is drawn from all of its non-null reference values,
      so frequent values stay frequent.

    A column without any non-null value yields empty strings.

    Args:
        reference_csv: Path of the CSV used as the sampling source.
        output_csv: Path of the CSV to create (overwritten if present).
        num_rows: Number of data rows to write after the header.
    """
    df = load_reference_data(reference_csv)
    columns = df.columns.tolist()

    # Columns sampled over distinct values rather than raw occurrences.
    distinct_columns = {
        'region',
        'datasource',
        'city',
        'origin_coord',
        'destination_coord',
    }

    # Build every sampling pool once. Rebuilding them per generated row
    # rescans the whole reference column for each cell.
    value_pools = {}
    for col in columns:
        if col == 'departure_time':
            continue
        non_null = df[col].dropna()
        if col in distinct_columns:
            non_null = non_null.unique()
        value_pools[col] = non_null.tolist()

    # Parse before taking min/max so the bounds are chronological rather
    # than lexicographic; unparseable entries are ignored.
    min_date = datetime(2020, 1, 1)
    max_date = datetime(2023, 12, 31)
    if 'departure_time' in columns:
        parsed = pd.to_datetime(df['departure_time'], errors='coerce').dropna()
        if not parsed.empty:
            min_date = parsed.min()
            max_date = parsed.max()

    with open(output_csv, 'w', newline='', encoding='utf-8') as f_out:
        writer = csv.writer(f_out)
        writer.writerow(columns)

        for _ in tqdm(range(num_rows), desc="Gerando linhas sintéticas"):
            row = []
            for col in columns:
                if col == 'departure_time':
                    moment = random_date(min_date, max_date)
                    row.append(moment.strftime('%Y-%m-%d %H:%M:%S'))
                else:
                    pool = value_pools[col]
                    # An empty pool means the reference column had no data.
                    row.append(random.choice(pool) if pool else '')
            writer.writerow(row)
    print(f'Sample data file "{output_csv}" generated: {num_rows} records.')

# --------- EXECUTE ON COLAB ---------
# Upload the reference file. It must be named trips.csv, because that is the
# path read by the generation call below.
# NOTE(review): `uploaded` is not used again in the visible code.
from google.colab import files
uploaded = files.upload()

# Generate the requested number of synthetic rows (num_rows) into
# trips_fake.csv, using trips.csv as the sampling source.
generate_synthetic_trips('trips.csv', 'trips_fake.csv', num_rows=1000000)

# Download the generated file.
files.download('trips_fake.csv')
