In [None]:
import csv
from pykafka import KafkaClient
import json
from time import sleep
import threading
from datetime import datetime, timezone
from tqdm import tqdm

# Kafka configuration
KAFKA_BROKER = "localhost:9091"  # Replace with the address of your Kafka broker
# Connect to the Kafka broker; this module-level client is the one the
# sender function uses to look up topics.
client = KafkaClient(hosts=KAFKA_BROKER)

# Range of rows to read from each CSV file
# NOTE(review): the sender iterates a csv.DictReader, which consumes the header
# line itself, so index 0 is the first data row. START_ROW = 1 therefore skips
# that first data row — confirm this is intended.
START_ROW = 1  # First row to send (0-indexed)
END_ROW = 620402   # Row at which to stop, exclusive (0-indexed)
def invia_messaggi_a_kafka(file_path, start_row, end_row, topic_name, delay=0.05):
    """
    Read a CSV file and publish a range of its rows to a Kafka topic as JSON.

    Each selected row has its 'time' field overwritten with the current UTC
    time (ISO-8601, millisecond precision, 'Z' suffix) before it is serialized
    and produced. The producer is obtained from the module-level ``client``
    and is stopped when the function returns or raises.

    Args:
        file_path: Path of the CSV file to read; its first line is the header.
        start_row: Index of the first data row to send (0-indexed, header
            not counted).
        end_row: Index of the data row at which to stop (exclusive).
        topic_name: Name of the Kafka topic to publish to.
        delay: Seconds to sleep after each produced message.
    """
    topic = client.topics[topic_name]
    # Using the producer as a context manager guarantees it is stopped even if
    # reading or producing raises part-way through the file.
    with topic.get_sync_producer() as producer:
        print(f"Avvio l'invio dei messaggi dal file: {file_path} al topic: {topic_name} \n")

        sent_count = 0
        # newline='' lets the csv module handle line endings itself, so quoted
        # fields containing newlines are parsed correctly.
        with open(file_path, 'r', newline='') as file:
            reader = csv.DictReader(file)
            for i, row in enumerate(reader):
                if i >= end_row:
                    # Nothing past end_row is ever sent; stop reading the file.
                    break
                if i < start_row:
                    continue
                current_time = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
                row['time'] = current_time
                messaggio = json.dumps(row).encode('utf-8')
                producer.produce(messaggio)
                sleep(delay)  # throttle the send rate
                sent_count += 1
                if sent_count % 100 == 0:
                    print(f"[{file_path}] Inviate {sent_count} righe...")



# CSV files to stream
file_paths = [
    "orderedData/ITA.csv", "orderedData/USA.csv", "orderedData/GBR.csv",
    "orderedData/NZL.csv", "orderedData/FRA.csv", "orderedData/SUI.csv",
]
# NOTE(review): these per-file topic names were zipped into the loop but never
# used; every file has always been published to the single topic below.
# Confirm whether per-file topics were intended.
topics = ["ITA","USA","GBR","NZL","FRA","SUI"]
TOPIC_NAME = "boat_data"

# Create and start one sender thread per file
threads = []
for file_path in file_paths:
    thread = threading.Thread(
        target=invia_messaggi_a_kafka,
        args=(file_path, START_ROW, END_ROW, TOPIC_NAME),
    )
    threads.append(thread)
    thread.start()

# Block until every sender thread has finished
for thread in threads:
    thread.join()

print("Tutti i file sono stati processati.")

In [12]:
file_path = "orderedData/USA.csv"
# Stream the file once to count its physical lines, then discount the header.
with open(file_path, 'r') as f:
    line_total = sum(1 for _line in f)
row_count = line_total - 1
print(f"Number of rows in {file_path}: {row_count}")

Number of rows in orderedData/USA.csv: 620402


In [None]:
# Collect the distinct values found in one column of the CSV at `file_path`.
# The printed label is derived from the column actually read, so the two can
# no longer disagree (the cell used to read 'leg' while reporting 'foiling').
column_name = 'leg'
with open(file_path, 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    # Variable name kept from the original cell in case other cells use it.
    # A missing column yields None for every row, as with the original .get().
    foiling_values = {row.get(column_name) for row in reader}
print(f"Valori diversi nella colonna '{column_name}': {foiling_values}")
print(f"Numero di valori diversi: {len(foiling_values)}")

In [None]:
def invia_messaggi_a_kafka(file_path, start_row, end_row, topic_name, delay=0.05):
    """
    Read a CSV file and publish a range of its rows to a Kafka topic as JSON.

    Rows are sent unmodified, one JSON object per message. The producer is
    obtained from the module-level ``client`` and is stopped when the function
    returns or raises.

    NOTE(review): a function with this same name is also defined in an earlier
    cell; whichever cell is executed last is the definition in effect.

    Args:
        file_path: Path of the CSV file to read; its first line is the header.
        start_row: Index of the first data row to send (0-indexed, header
            not counted).
        end_row: Index of the data row at which to stop (exclusive).
        topic_name: Name of the Kafka topic to publish to.
        delay: Seconds to sleep after each produced message.
    """
    topic = client.topics[topic_name]
    # Using the producer as a context manager guarantees it is stopped even if
    # reading or producing raises part-way through the file.
    with topic.get_sync_producer() as producer:
        # newline='' lets the csv module handle line endings itself, so quoted
        # fields containing newlines are parsed correctly.
        with open(file_path, 'r', newline='') as file:
            reader = csv.DictReader(file)  # each row becomes a dict keyed by header
            for i, row in enumerate(reader):
                if i >= end_row:
                    # Nothing past end_row is ever sent; stop reading the file.
                    break
                if i < start_row:
                    continue
                messaggio = json.dumps(row).encode('utf-8')  # serialize the row as JSON bytes
                producer.produce(messaggio)
                sleep(delay)  # throttle the send rate

In [None]:
# Preview of the timestamp format: current UTC time, ISO-8601, millisecond
# precision (truncated), with the '+00:00' offset written as 'Z'.
datetime.now(timezone.utc).isoformat(timespec='milliseconds').replace('+00:00', 'Z')

In [8]:
# Debug cell: list the Thread objects currently alive in this kernel
# (presumably to check whether any sender threads are still running).
import threading
threading.enumerate()



[<_MainThread(MainThread, started 8539737216)>,
 <Thread(IOPub, started daemon 6108639232)>,
 <Heartbeat(Heartbeat, started daemon 6125465600)>,
 <Thread(Thread-1 (_watch_pipe_fd), started daemon 6143438848)>,
 <Thread(Thread-2 (_watch_pipe_fd), started daemon 6160265216)>,
 <ControlThread(Control, started daemon 6177091584)>,
 <HistorySavingThread(IPythonHistorySavingThread, started 6193917952)>]

In [2]:
# Load one of the CSV files into a polars DataFrame and print its first rows
# (head() with no argument returns 5 rows, as the output shape shows).
import polars as pl
data = pl.read_csv("orderedData/ITA.csv")
print(data.head())

shape: (5, 48)
┌───────┬───────────┬───────────┬───────────┬───┬───────────────┬───────────────┬──────────┬───────┐
│       ┆ CWA       ┆ CWAvalue  ┆ TWA       ┆ … ┆ true_wind_dir ┆ true_wind_spe ┆ tstamp   ┆ turn  │
│ ---   ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ection.value  ┆ ed.value      ┆ ---      ┆ ---   │
│ i64   ┆ f64       ┆ f64       ┆ f64       ┆   ┆ ---           ┆ ---           ┆ f64      ┆ bool  │
│       ┆           ┆           ┆           ┆   ┆ f64           ┆ f64           ┆          ┆       │
╞═══════╪═══════════╪═══════════╪═══════════╪═══╪═══════════════╪═══════════════╪══════════╪═══════╡
│ 30020 ┆ 64.85083  ┆ 64.85083  ┆ 61.140518 ┆ … ┆ 101.289688    ┆ 9.307171      ┆ 1.7249e9 ┆ true  │
│ 30021 ┆ 64.820271 ┆ 64.820271 ┆ 61.109959 ┆ … ┆ 101.289688    ┆ 9.307171      ┆ 1.7249e9 ┆ false │
│ 30022 ┆ 64.820271 ┆ 64.820271 ┆ 61.109959 ┆ … ┆ 101.289688    ┆ 9.307171      ┆ 1.7249e9 ┆ false │
│ 30023 ┆ 64.820271 ┆ 64.820271 ┆ 61.109959 ┆ … ┆ 101.289688    ┆ 9.307171  

In [3]:
# Show the column names of the loaded DataFrame.
data.columns

['',
 'CWA',
 'CWAvalue',
 'TWA',
 'TWAvalue',
 'VMC',
 'VMG',
 'X',
 'Y',
 'Ydist',
 'Yrolling',
 'asset_id',
 'boat',
 'bord',
 'class',
 'course_over_ground.value',
 'cumulative_VMG',
 'direction',
 'distance_derivative',
 'distance_to_prev',
 'distance_to_start',
 'duration',
 'foiling',
 'gnss_position.altitude',
 'gnss_position.latitude',
 'gnss_position.longitude',
 'gybe',
 'heading.value',
 'leg',
 'maneuver',
 'opponent',
 'race_course.course_axis',
 'race_number',
 'source',
 'speed_over_ground.calculated',
 'speed_over_ground.value',
 'status',
 'tack',
 'time',
 'time_to_gybe',
 'time_to_prev',
 'time_to_tack',
 'time_to_turn',
 'timestamp',
 'true_wind_direction.value',
 'true_wind_speed.value',
 'tstamp',
 'turn']