In [177]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
plt.rcParams.update({'font.size': 7})

In [178]:
sw_data=pd.read_csv("FINAL_EDGE_TABLE.csv",usecols=['Origin_Airport_Code','origin_lat','origin_long','passengers','sum_departures_performed'])
sw_airports_location=sw_data[['Origin_Airport_Code','origin_lat','origin_long']].drop_duplicates(keep='first').reset_index(drop=True)
renaming={'Origin_Airport_Code':'IATA','origin_lat':'LATITUDE','origin_long':'LONGITUDE'}
sw_airports_location.rename(columns=renaming,inplace=True)
sw_airports_location

Unnamed: 0,IATA,LATITUDE,LONGITUDE
0,ABQ,35.040199,-106.609001
1,ALB,42.748299,-73.801697
2,AMA,35.219398,-101.706001
3,ATL,33.636700,-84.428101
4,AUS,30.197535,-97.662015
...,...,...,...
80,SNA,33.675701,-117.867996
81,STL,38.748697,-90.370003
82,TPA,27.975500,-82.533203
83,TUL,36.198399,-95.888100


In [179]:
sw_passengers=sw_data[['Origin_Airport_Code','sum_departures_performed','passengers']]
sw_passengers=sw_passengers.groupby(['Origin_Airport_Code']).mean().reset_index()
sw_passengers.insert(3,"average",sw_passengers['passengers']/sw_passengers['sum_departures_performed'])
sw_passengers

Unnamed: 0,Origin_Airport_Code,sum_departures_performed,passengers,average
0,ABQ,254.888889,29756.222222,116.741935
1,ALB,176.166667,22521.000000,127.839167
2,AMA,254.500000,23229.500000,91.275049
3,ATL,320.166667,36946.388889,115.397362
4,AUS,214.000000,25781.076923,120.472322
...,...,...,...,...
80,SNA,675.428571,79748.142857,118.070431
81,STL,247.976190,29830.880952,120.297360
82,TPA,222.823529,27228.205882,122.196278
83,TUL,171.400000,18482.400000,107.831972


In [180]:
#Function that calculates the haversine distance between two sets of coordinates
from sklearn.metrics.pairwise import haversine_distances
def haversine(x,y):
    R=6371 #Radius of Earth in KM
    x_rd=np.radians(x)
    y_rd=np.radians(y)
    h=(haversine_distances([x_rd,y_rd]))[0][1]
    d=R*h #Haversine Distance
    return d

In [181]:
#Function that runs a k-means algorithm based on haversine distance
def k_means_haversine(coordinates,codes,weights,no_of_clusters):
    #Setting some initial variables
    distance_matrix=np.zeros((codes.size,no_of_clusters))
    counter=0
    
    #Randomly chooses k points that serve as the initial cluster centres
    idx=np.random.choice(codes.size,no_of_clusters,replace=False)
    new_centres=list((codes[idx]))
    initial_centres=set(new_centres)
    old_centres=[]
    centres_coords=coordinates[idx,:]   

    #Running a loop till the cluster centres converge
    while(set(old_centres)!=set(new_centres)):

        #The cluster centres of the previous iteration become old clusters now
        old_centres=new_centres

        #Calculates the distance between each point and every centre and stores in the data matrix
        k,l=0,0
        for i in coordinates:
            for j in centres_coords:
                distance_matrix[k][l]=haversine(list(i),list(j))
                l=l+1
            l=0
            k=k+1

        #Assigns each point to a cluster
        #Labels each point with the cluster centre closest to it and adds it as an attribute to the dataframe
        labels=[]
        for i in distance_matrix:
            i=list(i)
            labels.append(new_centres[i.index(min(i))])
        sw_airports_location['LABELS']=labels

        #For each cluster, finds out the weighted average of all latitudes and longitudes (Virtual point)
        #Makes that point the cluster centre whose haversine distance from the virtual point is least
        #Assigns all new cluster centres coordinates by replacing old ones
        centres_coords=[]
        for centres in np.unique(labels):
            airports=sw_airports_location.loc[sw_airports_location['LABELS']==centres]
            a=airports[['LATITUDE','LONGITUDE']].values
            c_weights=weights[airports.index.tolist()]
            h_distances=np.zeros(a.shape[0])
            for i in range(h_distances.size):
                h_distances[i]=haversine(list(a[i,:]),[0,0])
            j=np.average(h_distances,weights=c_weights)
            centres_coords.append(a[np.argmin(np.abs(h_distances-j)),:])
        centres_coords=np.array(centres_coords)
        
        #Finding the corresponding airports to the newly found cluster centre coordinates
        new_centres=[]
        for i in centres_coords:
            new_centres.append(sw_airports_location.iloc[sw_airports_location.index[(sw_airports_location['LATITUDE']==i[0]) & 
                                                             (sw_airports_location['LONGITUDE']==i[1])].tolist()[0]]['IATA'])

        #Increasing the counter for every iteration
        counter=counter+1

        if(counter==20):
            #Algorithm did not converge
            return ([])

    
    #Algorithm converged
    return (new_centres)

In [182]:
#Defining some pre-requisite values
n=100
airport_coordinates=sw_airports_location.values[:,1:3]
airport_codes=sw_airports_location.values[:,0]
weight=np.array(sw_passengers['average'])
k=7

In [183]:
#List of all airports
sw_airports=sw_airports_location['IATA'].tolist()
#Dictionary that stores the frequency of how many times each airport appeared as a cluster centre
cc_frequency={i:0 for i in sw_airports}
#Counter to check no. of times algorithm converged
converged_counter=0

while(converged_counter!=n):
    obtained_centres=k_means_haversine(airport_coordinates,airport_codes,weight,k)
    if(obtained_centres):
        converged_counter=converged_counter+1
    for i in obtained_centres:
        cc_frequency[i]=cc_frequency[i]+1

In [184]:
sum(cc_frequency.values())

700

In [185]:
print(dict(sorted(cc_frequency.items(),key=lambda x:x[1],reverse=True)))

{'PBI': 78, 'SEA': 64, 'SAT': 56, 'LAS': 52, 'RNO': 46, 'BWI': 29, 'STL': 29, 'MKE': 27, 'ATL': 23, 'ALB': 21, 'LIT': 20, 'CVG': 18, 'ORF': 17, 'PHL': 17, 'HOU': 14, 'ABQ': 11, 'MAF': 11, 'MEM': 10, 'BOI': 9, 'DCA': 9, 'PDX': 9, 'PNS': 9, 'ECP': 8, 'ONT': 8, 'SDF': 7, 'SJU': 7, 'AMA': 6, 'FLL': 6, 'GSP': 6, 'IAD': 6, 'LBB': 6, 'TUL': 5, 'LGA': 4, 'OKC': 4, 'SMF': 4, 'TPA': 4, 'BUF': 3, 'CMH': 3, 'DTW': 3, 'HRL': 3, 'LAX': 3, 'MSP': 3, 'OMA': 3, 'AUS': 2, 'DEN': 2, 'ICT': 2, 'MDW': 2, 'RDU': 2, 'RIC': 2, 'BNA': 1, 'DAL': 1, 'GEG': 1, 'MCI': 1, 'MSY': 1, 'PHX': 1, 'ROC': 1, 'BDL': 0, 'BHM': 0, 'BOS': 0, 'BUR': 0, 'CHS': 0, 'CLE': 0, 'CLT': 0, 'CRP': 0, 'DSM': 0, 'ELP': 0, 'EWR': 0, 'GRR': 0, 'IND': 0, 'ISP': 0, 'JAX': 0, 'LGB': 0, 'MCO': 0, 'MHT': 0, 'OAK': 0, 'PIT': 0, 'PVD': 0, 'PWM': 0, 'RSW': 0, 'SAN': 0, 'SFO': 0, 'SJC': 0, 'SLC': 0, 'SNA': 0, 'TUS': 0}


In [186]:
without_weight={'SEA': 68, 'SAT': 57, 'PBI': 55, 'RNO': 46, 'PHL': 39, 'MKE': 35, 'LAS': 31, 'ATL': 26, 'ORF': 22, 'BWI': 21, 'MCI': 19, 'STL': 18, 'MAF': 17, 'PDX': 17, 'ECP': 16, 'LBB': 16, 'JAX': 15, 'IND': 13, 'PNS': 13, 'MSP': 11, 'MCO': 10, 'SJU': 9, 'BOI': 8, 'MEM': 8, 'ONT': 8, 'SMF': 8, 'RSW': 7, 'ALB': 6, 'IAD': 6, 'SDF': 6, 'CLT': 5, 'LIT': 5, 'AMA': 4, 'CMH': 4, 'CVG': 4, 'DAL': 4, 'ELP': 3, 'LAX': 3, 'PHX': 3, 'TUL': 3, 'ABQ': 2, 'BDL': 2, 'BNA': 2, 'DCA': 2, 'DEN': 2, 'DSM': 2, 'MDW': 2, 'MSY': 2, 'PIT': 2, 'RDU': 2, 'BHM': 1, 'BUF': 1, 'CHS': 1, 'CLE': 1, 'DTW': 1, 'FLL': 1, 'GRR': 1, 'GSP': 1, 'HOU': 1, 'ISP': 1, 'LGA': 1, 'AUS': 0, 'BOS': 0, 'BUR': 0, 'CRP': 0, 'EWR': 0, 'GEG': 0, 'HRL': 0, 'ICT': 0, 'LGB': 0, 'MHT': 0, 'OAK': 0, 'OKC': 0, 'OMA': 0, 'PVD': 0, 'PWM': 0, 'RIC': 0, 'ROC': 0, 'SAN': 0, 'SFO': 0, 'SJC': 0, 'SLC': 0, 'SNA': 0, 'TPA': 0, 'TUS': 0}

In [187]:
with_weight={'PBI': 78, 'SEA': 64, 'SAT': 56, 'LAS': 52, 'RNO': 46, 'BWI': 29, 'STL': 29, 'MKE': 27, 'ATL': 23, 'ALB': 21, 'LIT': 20, 'CVG': 18, 'ORF': 17, 'PHL': 17, 'HOU': 14, 'ABQ': 11, 'MAF': 11, 'MEM': 10, 'BOI': 9, 'DCA': 9, 'PDX': 9, 'PNS': 9, 'ECP': 8, 'ONT': 8, 'SDF': 7, 'SJU': 7, 'AMA': 6, 'FLL': 6, 'GSP': 6, 'IAD': 6, 'LBB': 6, 'TUL': 5, 'LGA': 4, 'OKC': 4, 'SMF': 4, 'TPA': 4, 'BUF': 3, 'CMH': 3, 'DTW': 3, 'HRL': 3, 'LAX': 3, 'MSP': 3, 'OMA': 3, 'AUS': 2, 'DEN': 2, 'ICT': 2, 'MDW': 2, 'RDU': 2, 'RIC': 2, 'BNA': 1, 'DAL': 1, 'GEG': 1, 'MCI': 1, 'MSY': 1, 'PHX': 1, 'ROC': 1, 'BDL': 0, 'BHM': 0, 'BOS': 0, 'BUR': 0, 'CHS': 0, 'CLE': 0, 'CLT': 0, 'CRP': 0, 'DSM': 0, 'ELP': 0, 'EWR': 0, 'GRR': 0, 'IND': 0, 'ISP': 0, 'JAX': 0, 'LGB': 0, 'MCO': 0, 'MHT': 0, 'OAK': 0, 'PIT': 0, 'PVD': 0, 'PWM': 0, 'RSW': 0, 'SAN': 0, 'SFO': 0, 'SJC': 0, 'SLC': 0, 'SNA': 0, 'TUS': 0}

In [188]:
sw_airports_location

Unnamed: 0,IATA,LATITUDE,LONGITUDE,LABELS
0,ABQ,35.040199,-106.609001,SAT
1,ALB,42.748299,-73.801697,BUF
2,AMA,35.219398,-101.706001,SAT
3,ATL,33.636700,-84.428101,PNS
4,AUS,30.197535,-97.662015,SAT
...,...,...,...,...
80,SNA,33.675701,-117.867996,RNO
81,STL,38.748697,-90.370003,PNS
82,TPA,27.975500,-82.533203,PBI
83,TUL,36.198399,-95.888100,SAT


In [None]:
#Plotting the clusters
        #plt.scatter(coordinates[:,1],coordinates[:,0],c=pd.Categorical(labels).codes,cmap=plt.cm.rainbow)
        #plt.scatter(centres_coords[:,1],centres_coords[:,0],c='k',marker='o',s=300,alpha=0.5)
        #for i,txt in enumerate(codes):
        #    plt.annotate(txt,(coordinates[i,1],coordinates[i,0]))
        #plt.show();
        #print('Initial centres =',initial_centres)    
        #print('Final centres =',set(new_centres))
        #print('No. of iterations =',counter)
        
#Plotting the centres        
#plt.scatter(centres_coords[:,1],centres_coords[:,0],c='k',marker='o',s=300,alpha=0.5)
        #for i,txt in enumerate(new_centres):
        #    plt.annotate(txt,(centres_coords[i,1],centres_coords[i,0]))
        #plt.title('Iteration '+str(counter+1))
        #plt.show();