In [1]:
import pandas as pd
import numpy as np
import configparser

In [None]:
# Parse the notebook settings; `read` returns the list of files it could load,
# which is what this cell displays.
config = configparser.ConfigParser()
config.read(filenames='ais_config.ini')

In [None]:
# Input datasets (section INIT of the configuration file).
ais_dataset_name, rotte_dataset_name, world_porti_filename = (
    config.get('INIT', option)
    for option in ('ais_dataset_name', 'rotte_dataset_name', 'world_porti_filename')
)
# NOTE: 'rottesort_filename' (section PREPROCESS) is intentionally not read here.

# Output files (section ELABORATION).
arrivi_nei_porti_fname, rotte_arrivi_porti_world_fname, rotte_arrivi_porti_all_fname = (
    config.get('ELABORATION', option)
    for option in (
        'arrivi_nei_porti_fname',
        'rotte_arrivi_porti_world_fname',
        'rotte_arrivi_porti_all_fname',
    )
)

In [None]:
# Load the AIS vessel table (comma is read_csv's default separator) and preview it.
ais_dataset = pd.read_csv(ais_dataset_name)
ais_dataset.head(5)

In [None]:
# Distinct vessel identifiers in the AIS table.
# Missing values are dropped first: casting NaN to int does not fail for float
# data, it silently produces a meaningless integer that would be counted as an
# extra identifier (and could leak into the MMSI intersection further down).
vessels_mmsi = ais_dataset['mmsi'].dropna().unique().astype(int)
vessels_imo = ais_dataset['imo'].dropna().unique().astype(int)

print(len(vessels_mmsi))
print(len(vessels_imo))

In [None]:
# Load the route/position records; low_memory=False parses each column in one pass.
rotte_dataset = pd.read_csv(rotte_dataset_name, low_memory=False)

In [None]:
# Discard rows with any missing value.
rotte_dataset = rotte_dataset.dropna()
# Labels of the rows whose 'stamp' field holds the literal text 'stamp'
# (presumably header lines repeated inside the file -- TODO confirm).
i = rotte_dataset.index[rotte_dataset['stamp'] == 'stamp']
# Remove those rows.
rotte_dataset = rotte_dataset.drop(index=i)

In [None]:
from datetime import datetime

# Build the timestamp column for the routes dataframe: 'stamp' is cast to float
# and converted with datetime.fromtimestamp, i.e. to naive local-time datetimes.
stamp = rotte_dataset['stamp'].astype(float)
timestamp_column = stamp.map(datetime.fromtimestamp)

In [None]:
# Attach the timestamps positionally (raw array, so no index alignment takes place).
rotte_dataset = rotte_dataset.assign(timestamp=timestamp_column.to_numpy())

In [None]:
# Order the records by vessel, then time, then position (ascending is the default).
rotte_dataset = rotte_dataset.sort_values(by=['mmsi', 'stamp', 'lng', 'lat'])

In [None]:
class Utils(object):
    """Small collection of list helpers used by the notebook."""

    def intersect(self, nums1, nums2):
        """
        Multiset intersection of two lists.

        Every value is returned as many times as it occurs in both inputs.
        The lengths of the two inputs are printed first. The result follows
        the order of the shorter list (of ``nums2`` when the lengths are equal).

        :type nums1: List[int]
        :type nums2: List[int]
        :rtype: List[int]
        """
        print(len(nums1))
        print(len(nums2))

        # Count the occurrences in the longer list, then scan the shorter one.
        if len(nums1) < len(nums2):
            longer, shorter = nums2, nums1
        else:
            longer, shorter = nums1, nums2

        remaining = {}
        for value in longer:
            remaining[value] = remaining.get(value, 0) + 1

        common = []
        for value in shorter:
            # A value is shared only while unmatched occurrences are left.
            if remaining.get(value, 0):
                remaining[value] -= 1
                common.append(value)
        return common

In [None]:
# Distinct MMSI values present in the route records, as integers.
rotte_dataset_mmsi = rotte_dataset['mmsi'].unique().astype(int)
print(len(rotte_dataset_mmsi))

In [None]:
# MMSI values that occur both in the AIS vessel table and in the route records.
utils = Utils()
mmsi_intersection = utils.intersect(
    vessels_mmsi.tolist(),
    rotte_dataset_mmsi.tolist(),
)
print(f'result intersection: {mmsi_intersection}')

In [None]:
# Keep only the route rows whose MMSI is in mmsi_intersection, i.e. vessels that
# are also listed in the AIS table; every other row is left out.
rotte_dataset_simplified = rotte_dataset.loc[
    rotte_dataset['mmsi'].astype(int).isin(mmsi_intersection)
]

In [None]:
# Renumber the rows 0..n-1; the previous labels are kept in a new 'index' column.
rotte_dataset_simplified = rotte_dataset_simplified.reset_index(drop=False)

ROUTE PROCESSING (ELABORAZIONE DELLE ROTTE)

In [None]:
from datetime import datetime
import haversine as hs
from haversine import Unit

In [None]:
# Arrival processing: derive one record per detected stop from the position track.
def arrival_elaboration(df_rotte):
    """Detect the stops (arrival/departure pairs) of each vessel in a track.

    The rows are scanned in label order with a small state machine
    (``status``: 0 = new vessel, 1 = arrived, 2 = departed):

    * a row with ``speed == 0`` opens a stop (or extends the current one) and
      sets ``status`` to 1;
    * the current stop is written out when the vessel is seen more than
      3000 m (haversine distance) away from it, when the track switches to a
      different ``mmsi``, or when the data ends while ``status`` is 1.

    Fixes with respect to the previous implementation:

    * ``oldmmsi`` is now updated when the vessel changes. It used to stay at
      its initial value 0, so every row was handled as a new vessel and the
      emitted records carried ``mmsi`` 0.
    * records are collected in a list and turned into a DataFrame once,
      instead of calling ``DataFrame.append`` per record (quadratic, and
      removed in pandas 2.0).
    * ``end`` is initialised, so it can never be referenced unbound.

    Parameters
    ----------
    df_rotte : pandas.DataFrame
        Track with the columns ``mmsi``, ``timestamp``, ``lng``, ``lat`` and
        ``speed``, addressed through the labels ``0 .. len(df_rotte) - 1``.
        NOTE(review): assumed to be ordered by vessel and time and to hold
        numeric ``lng``/``lat``/``speed``; neither is checked here.

    Returns
    -------
    pandas.DataFrame
        One row per stop with the columns ``row``, ``mmsi``, ``arrival``,
        ``departure``, ``lng``, ``lat``, ``lng_orig``, ``lat_orig``,
        ``speed`` and ``status``.

    Notes
    -----
    A row that raises is reported (its label and the error are printed) and
    skipped; processing continues with the next row.
    """
    columns = ['row','mmsi','arrival','departure','lng','lat','lng_orig','lat_orig','speed','status']

    dim = len(df_rotte)
    print("df_rotte len before: ",dim)

    records = []

    oldmmsi = 0          # vessel the current state refers to
    lat_orig = 0         # position of the previous stop (origin of the current leg)
    lng_orig = 0
    oldlng = 0           # position of the current stop
    oldlat = 0
    start = 'unknown'    # timestamp at which the current stop began
    end = 'unknown'      # timestamp of the latest zero-speed row of the stop
    status = 0           # 0 = new vessel, 1 = arrived, 2 = departed

    def add_record(row, record_status):
        # Snapshot of the current state; the enclosing variables are read at call time.
        # NOTE(review): 'speed' is the speed of the row being processed when the
        # record is written, not the speed observed during the stop itself.
        records.append({'row': row, 'mmsi': oldmmsi, 'arrival': start,
                        'departure': end, 'lng': oldlng, 'lat': oldlat,
                        'lng_orig': lng_orig, 'lat_orig': lat_orig,
                        'speed': speed, 'status': record_status})

    start_time = datetime.now()

    for i in range(dim):
        try:
            mmsi = df_rotte.loc[i, 'mmsi']
            time_voyage = df_rotte.loc[i, 'timestamp']
            lng = df_rotte.loc[i, 'lng']
            lat = df_rotte.loc[i, 'lat']
            speed = df_rotte.loc[i, 'speed']

            if mmsi != oldmmsi:
                # The track moved on to another vessel: close the stop of the
                # previous vessel, if one is open, and start from a clean state.
                if status == 1:
                    add_record(i, status)
                start = 'unknown'
                status = 0
                lng_orig = 0
                lat_orig = 0
                oldlng = 0
                oldlat = 0
                oldmmsi = mmsi  # the fix: remember which vessel the state belongs to

            if speed == 0:
                if status == 0:
                    # First zero-speed row of this vessel: the stop begins here.
                    start = time_voyage
                    oldlng = lng
                    oldlat = lat

                if status < 2:
                    # Distance in metres between this row and the current stop.
                    distance = hs.haversine((lat, lng), (oldlat, oldlng), unit=Unit.METERS)

                    if distance > 3000.0:
                        # Stationary again, but somewhere else: write the old
                        # stop and open a new one at the current position.
                        add_record(i, status)
                        start = time_voyage
                        lng_orig = oldlng
                        lat_orig = oldlat
                        oldlng = lng
                        oldlat = lat

                if status == 2:
                    # The vessel had departed and has now stopped: new stop,
                    # whose origin is the previous stop.
                    start = time_voyage
                    lng_orig = oldlng
                    lat_orig = oldlat
                    oldlng = lng
                    oldlat = lat

                end = time_voyage
                status = 1

            if speed > 0:
                if status == 1:
                    distance = hs.haversine((lat, lng), (oldlat, oldlng), unit=Unit.METERS)
                    if distance > 3000.0:
                        # Moving and far enough from the stop: the vessel has departed.
                        add_record(i, 2)
                        status = 2
                        lng_orig = oldlng
                        lat_orig = oldlat
                if status == 0:
                    status = 2

            # progress output
            if i % 500000 == 0:
                print(i)
        except Exception as e:
            # Best effort: report the offending row and carry on with the next one.
            print(i)
            print(e)

    # Write the last record: a stop that is still open when the data ends.
    if status == 1:
        add_record(i, status)

    df_arrival = pd.DataFrame(records, columns=columns)

    end_time = datetime.now()
    print('Duration: {}'.format(end_time - start_time))
    print("df_arrival len after: ",len(df_arrival))

    return df_arrival

In [None]:
# `rottesort` is not defined anywhere in this notebook (it only exists as
# leftover kernel state), so a fresh Restart & Run All stops here with a
# NameError. The filtered track built above is used instead.
df_rotte = rotte_dataset_simplified[['mmsi','stamp','timestamp','lng','lat','speed']]
# NOTE(review): the raw file seems to contain repeated header rows, in which
# case these columns are read as text; the stop detection compares speed with
# 0 and computes distances, so they are made numeric here -- confirm.
df_rotte = df_rotte.astype({'mmsi': int, 'stamp': float, 'lng': float, 'lat': float, 'speed': float})
df_rotte = df_rotte.sort_values(by=['mmsi','stamp'],ascending=[True,True])
# Renumber the rows 0..n-1: the stop detection addresses rows by these labels.
df_rotte = df_rotte.reset_index()

In [None]:
# Run the stop detection on the prepared track.
df_arrival = arrival_elaboration(df_rotte=df_rotte)

In [None]:
# Sanity check: the records that were written with speed 0.
# (Notes left from an earlier run: 6518 records overall, and none with
# speed == 0 together with status == 2.)
df_arrival.loc[df_arrival['speed'] == 0]

In [None]:
# Load the world ports table (comma separated, parsed in one pass).
world_porti = pd.read_csv(world_porti_filename, low_memory=False)

In [None]:
# Keep only the columns used for port matching.
world_porti = world_porti.loc[:, ['Country', 'Name', 'Latitude', 'Longitude']]

In [None]:
# Renumber the ports 0..n-1 and discard the old labels.
world_porti = world_porti.reset_index(drop=True)

In [None]:
def _first_port_within(loc, ports, max_distance_m):
    """Return the name of the first port closer than ``max_distance_m`` to ``loc``, else 'Not found'."""
    for name, loc_port in ports:
        if hs.haversine(loc, loc_port, unit=Unit.METERS) < max_distance_m:
            return name
    return 'Not found'


def port_assign(df_rotte, df_porti, max_distance_m=3000.0):
    """Label every stop with the port at its position and at its origin.

    For each row of ``df_rotte`` the ports are scanned in table order and the
    first one whose haversine distance is below ``max_distance_m`` is taken,
    once for (``lat``, ``lng``) and once for (``lat_orig``, ``lng_orig``).

    Changes with respect to the previous implementation:

    * the final summary reports the length of the processed frame; it used to
      read the notebook-level variable ``df_arrival``, which made the function
      fail (or report an unrelated length) for any other input;
    * the result is a copy, so the caller's frame is no longer modified;
    * rows are processed by position, so the input no longer needs the
      labels ``0 .. n-1``;
    * the search radius is a parameter (default 3000.0 m, the former constant).

    Parameters
    ----------
    df_rotte : pandas.DataFrame
        Stops with the columns ``lat``, ``lng``, ``lat_orig``, ``lng_orig``.
    df_porti : pandas.DataFrame
        Ports with the columns ``Name``, ``Latitude``, ``Longitude``.
    max_distance_m : float, optional
        Maximum distance in metres for a port to match.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_rotte`` with two extra columns, ``port`` and
        ``port_orig``, holding a port name or ``'Not found'``.
    """
    df = df_rotte.copy()

    # Materialise the ports once instead of indexing the frame in the inner loop.
    ports = list(zip(
        df_porti['Name'].tolist(),
        zip(df_porti['Latitude'].tolist(), df_porti['Longitude'].tolist()),
    ))

    start_time = datetime.now()

    port = []
    port_orig = []
    coordinates = zip(df['lat'].tolist(), df['lng'].tolist(),
                      df['lat_orig'].tolist(), df['lng_orig'].tolist())
    for i, (lat, lng, lat_orig, lng_orig) in enumerate(coordinates):
        port.append(_first_port_within((lat, lng), ports, max_distance_m))
        port_orig.append(_first_port_within((lat_orig, lng_orig), ports, max_distance_m))

        # progress output
        if i % 3000 == 0:
            print(i)

    df['port'] = port
    df['port_orig'] = port_orig

    end_time = datetime.now()
    print('Duration: {}'.format(end_time - start_time))
    print("df_arrival len after: ",len(df))

    return df

In [None]:
# Match every stop (and its origin) against the world ports table.
df_rotte_arrivi_porti = port_assign(df_rotte=df_arrival, df_porti=world_porti)

ADD INFORMATION TO THE DATASET

In [None]:
# MMSI -> attribute lookups taken from the AIS vessel table
# (for an MMSI that occurs more than once, the last row wins).
vessels_mmsi_imo_dict = ais_dataset.set_index('mmsi')['imo'].to_dict()
vessels_mmsi_callsign_dict = ais_dataset.set_index('mmsi')['callsign'].to_dict()
vessels_mmsi_type_dict = ais_dataset.set_index('mmsi')['type'].to_dict()

# The same lookups without the entries whose value is missing.
vessels_mmsi_imo_clean_dict = {
    mmsi: imo for mmsi, imo in vessels_mmsi_imo_dict.items() if pd.notna(imo)
}
vessels_mmsi_callsign_clean_dict = {
    mmsi: callsign for mmsi, callsign in vessels_mmsi_callsign_dict.items() if pd.notna(callsign)
}
vessels_mmsi_type_clean_dict = {
    mmsi: vessel_type for mmsi, vessel_type in vessels_mmsi_type_dict.items() if pd.notna(vessel_type)
}

In [None]:
# Add imo / callsign / type by translating each record's MMSI through the
# lookup dictionaries. An MMSI without an entry is left unchanged, so such rows
# still show the MMSI in these columns (same as before).
#
# The result of `replace` is assigned back to the column: the previous
# `df[col].replace(..., inplace=True)` acted on a temporary Series, which does
# not update the dataframe when pandas copy-on-write is active.
df_rotte_arrivi_porti["imo"] = (
    df_rotte_arrivi_porti["mmsi"].astype(int).replace(vessels_mmsi_imo_clean_dict)
)
df_rotte_arrivi_porti["callsign"] = (
    df_rotte_arrivi_porti["mmsi"].astype(int).replace(vessels_mmsi_callsign_clean_dict)
)
df_rotte_arrivi_porti["type"] = (
    df_rotte_arrivi_porti["mmsi"].astype(int).replace(vessels_mmsi_type_clean_dict)
)

In [None]:
# Write every record to CSV, whether or not a port was matched, then show the frame.
df_rotte_arrivi_porti.to_csv(rotte_arrivi_porti_all_fname, sep=',', index=False)
df_rotte_arrivi_porti

In [None]:
# Keep, and write to CSV, only the records whose arrival position matched a port.
df_rotte_arrivi_porti_world_found = df_rotte_arrivi_porti.loc[
    df_rotte_arrivi_porti['port'].ne('Not found')
]
df_rotte_arrivi_porti_world_found.to_csv(rotte_arrivi_porti_world_fname, sep=',', index=False)

COMPUTE STATISTICS OF ARRIVALS IN THE PORTS

In [None]:
# Per port: count of non-null values in each column over the records with
# status > 0, written as a ';'-separated file.
(
    df_rotte_arrivi_porti_world_found
    .query("status > 0")
    .groupby(['port'])
    .count()
    .to_csv(arrivi_nei_porti_fname, sep=';')
)

In [None]:
# Check: matched records that were written with speed 0.
df_rotte_arrivi_porti_world_found.loc[df_rotte_arrivi_porti_world_found['speed'] == 0]