In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

train_df = pd.read_csv('../data/processed_data/train.csv')
schedules_df = pd.read_csv('../data/processed_data/schedules.csv')
ports_df = pd.read_csv('../data/original_data/ports.csv', sep='|')



print(f"Rows in train_df: {len(train_df)}")
print(f"Rows in schedules_df: {len(schedules_df)}")
print(f"Rows in ports_df: {len(ports_df)}")


port_ids_train = set(train_df['portId'].unique())
port_ids_schedules = set(schedules_df['portId'].unique())
port_ids_ports = set(ports_df['portId'].unique())



missing_port_ids = port_ids_train - port_ids_ports

if missing_port_ids:
    print(f"Encountered portIds in train that were not in ports: {missing_port_ids}")
else:
    print("All vesselIds in test are present in train.")

missing_port_ids = port_ids_schedules - port_ids_ports

if missing_port_ids:
    print(f"Encountered portIds in schedules that were not in ports: {missing_port_ids}")
else:
    print("All vesselIds in test are present in schedules.")


Rows in train_df: 1522065
Rows in schedules_df: 136250
Rows in ports_df: 1329
Encountered portIds in train that were not in ports: {nan}
Encountered portIds in schedules that were not in ports: {nan}


In [11]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

label_encoder = LabelEncoder()


# Remove rows where portId is na

nan_portId_train_df = train_df[train_df['portId'].isna()]
train_df = train_df.dropna(subset=['portId'])

nan_portId_schedules_df = schedules_df[schedules_df['portId'].isna()]
schedules_df = schedules_df.dropna(subset=['portId'])

nan_portId_ports_df = ports_df[ports_df['portId'].isna()]
ports_df = ports_df.dropna(subset=['portId'])


combined_port_ids = pd.concat([train_df['portId'], schedules_df['portId'], ports_df['portId']])
label_encoder.fit(combined_port_ids)

train_df['portId'] = label_encoder.transform(train_df['portId'])
schedules_df['portId'] = label_encoder.transform(schedules_df['portId'])
ports_df['portId'] = label_encoder.transform(ports_df['portId'])

combined_df = pd.concat([train_df[['portId']], schedules_df[['portId']], ports_df[['portId']]])

scaler = MinMaxScaler(feature_range=(0, 1))
combined_scaled = scaler.fit_transform(combined_df)

train_df['portId'] = combined_scaled[:len(train_df)]
schedules_df['portId'] = combined_scaled[len(train_df):len(train_df) + len(schedules_df)]
ports_df['portId'] = combined_scaled[len(train_df) + len(schedules_df):]

train_df = pd.concat([train_df, nan_portId_train_df], ignore_index=True)
schedules_df = pd.concat([schedules_df, nan_portId_schedules_df], ignore_index=True)
ports_df = pd.concat([ports_df, nan_portId_ports_df], ignore_index=True)



port_ids_train = set(train_df['portId'].unique())
port_ids_schedules = set(schedules_df['portId'].unique())
port_ids_ports = set(ports_df['portId'].unique())



missing_port_ids = port_ids_train - port_ids_ports

if missing_port_ids:
    print(f"Encountered portIds in train that were not in ports: {missing_port_ids}")
else:
    print("All vesselIds in test are present in train.")

missing_port_ids = port_ids_schedules - port_ids_ports

if missing_port_ids:
    print(f"Encountered portIds in schedules that were not in ports: {missing_port_ids}")
else:
    print("All vesselIds in test are present in schedules.")



Encountered portIds in train that were not in ports: {nan}
Encountered portIds in schedules that were not in ports: {nan}


  train_df = pd.concat([train_df, nan_portId_train_df], ignore_index=True)
  schedules_df = pd.concat([schedules_df, nan_portId_schedules_df], ignore_index=True)
  ports_df = pd.concat([ports_df, nan_portId_ports_df], ignore_index=True)


In [12]:
train_df.to_csv('../data/processed_data/train.csv', index=False)
ports_df.to_csv('../data/processed_data/ports.csv', index=False)
schedules_df.to_csv('../data/processed_data/schedules.csv', index=False)