## Importing Libraries

In [None]:
from datetime import datetime

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from folium.plugins import HeatMap

from google.colab import drive

from pyspark import SparkContext

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA
from pyspark.ml.linalg import Vectors

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, when, regexp_replace, monotonically_increasing_id, udf, count, sum
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, TimestampType, IntegerType

## Data Uploading

In [None]:
# Attach Google Drive to the Colab runtime so the files under it can be read.
drive.mount('/content/drive')

# Folder on the mounted Drive; later cells build file names relative to it.
path = (
    "/content/drive/MyDrive/Colab Notebooks/Distributed Data Analysis and Mining/Project/data"
)

Mounted at /content/drive


In [None]:
# Build (or reuse) a local SparkSession that uses every available core,
# with 8g configured for both executor and driver and 200 shuffle partitions.
spark = (
    SparkSession.builder
    .appName("LargeRDDToDF")
    .master("local[*]")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)

In [None]:
# Load the CSV with a header row; column types are inferred by Spark.
df = spark.read.csv(
    path + "/df_final_with_weather.csv",
    sep=",",
    header=True,
    inferSchema=True,
)

## Data Manipulation

In [None]:
# Columns fed to the clustering pipeline (order matters: it is reused when
# labelling the centroid table later on).
cols_cluster = [
    "Actual_Departure_Time",
    "Departure_Delay_Minutes",
    "Taxi_Out_Time",
    "Taxi_In_Time",
    "Arrival_Delay_Minutes",
    "Flight_Cancelled",
    "Flight_Diverted",
    "Actual_Flight_Duration",
    "Airborne_Time",
    "Flight_Distance",
    "population_origin_ok",
    "tavg",
    "wspd",
    "wdir",
    "pres",
]

# Replace missing values with 0, but only in the clustering columns.
df = df.fillna(value=0, subset=cols_cluster)

In [None]:
# Assembling: pack the clustering columns into a single vector column
# named 'features'. Invalid values are kept rather than raising or dropping rows.
vec_assembler = VectorAssembler(outputCol='features', inputCols=cols_cluster, handleInvalid="keep")

df = vec_assembler.transform(df)

In [None]:
# Scaling: divide each feature by its standard deviation (no mean-centering,
# since withMean=False) and store the result in 'scaledFeatures'.
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)

# Fitting computes the summary statistics; transforming applies them.
scalerModel = scaler.fit(df)
df = scalerModel.transform(df)

## KMeans

In [None]:
# Sweep k = 2..9 and record the silhouette score of each KMeans fit on the
# scaled features. The scores are appended in the same order as the k values,
# so the list can be plotted directly against range(2, 10).
silhouette_score = []

evaluator = ClusteringEvaluator(predictionCol='prediction',
                                featuresCol='scaledFeatures',
                                metricName='silhouette',
                                distanceMeasure='squaredEuclidean')

for i in range(2, 10):
    # A fixed seed makes the centroid initialisation, and therefore the
    # reported scores, reproducible across kernel restarts.
    kmeans = KMeans(featuresCol='scaledFeatures', k=i, seed=42)
    model = kmeans.fit(df)
    predictions = model.transform(df)
    score = evaluator.evaluate(predictions)
    silhouette_score.append(score)
    print('Silhouette Score for k =', i, 'is', score)
    # Drop the references so the fitted model / predictions can be reclaimed
    # before the next iteration.
    del model, predictions

In [None]:
# Same k range as the sweep that filled `silhouette_score`.
k_values = range(2, 10)

# Line plot of silhouette score against the number of clusters.
fig, ax = plt.subplots()
ax.plot(k_values, silhouette_score)
ax.set_xlabel('k')
ax.set_ylabel('Silhouette Score')
ax.set_title('Silhouette Score vs. k')
plt.show()

In [None]:
# Final clustering with k=3 on the scaled features, using the parallel
# k-means|| initialisation. The seed is fixed so that cluster ids and
# assignments are the same on every run; without it the downstream
# per-cluster tables and maps change between executions.
kmeans = KMeans(featuresCol='scaledFeatures', k=3, initMode="k-means||", seed=42)
model = kmeans.fit(df)

# `result` is `df` plus a 'prediction' column holding the assigned cluster id.
result = model.transform(df)
result.show(5)

## Visualization

In [None]:
# Get cluster centroids.
# The model was fitted on 'scaledFeatures', so the centre coordinates are in
# the scaled feature space, one coordinate per entry of `cols_cluster`.
centroids = model.clusterCenters()

# One row per centroid; the row position doubles as the cluster id.
centroids_df = pd.DataFrame(centroids, columns=cols_cluster).assign(
    cluster=lambda frame: frame.index
)

centroids_df

### PCA

In [None]:
# Convert the pandas centroid table into a Spark DataFrame so Spark's PCA can
# be applied to it.
spark_df = spark.createDataFrame(centroids_df)

# Vectorize every clustering feature. The 'cluster' id column is not part of
# `cols_cluster`, so no slicing is needed; the previous `cols_cluster[:-1]`
# silently dropped the last real feature ("pres") from the PCA input.
vec_assembler = VectorAssembler(inputCols=cols_cluster, outputCol="features")
spark_df = vec_assembler.transform(spark_df)

In [None]:
# Apply PCA: project the centroid feature vectors onto their first two
# principal components (stored in 'pcaFeatures').
pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")

# Keep the fitted PCA under its own name. Assigning it to `model` would
# overwrite the fitted KMeans model, and re-running the centroid cell
# (`model.clusterCenters()`) afterwards would then fail.
pca_model = pca.fit(spark_df)
rs = pca_model.transform(spark_df)

In [None]:
# Bring the projected centroids to pandas and split the 2-component vector
# into plain x / y columns for plotting.
pca_df = rs.select("pcaFeatures", "cluster").toPandas()
pca_df['x'] = pca_df['pcaFeatures'].apply(lambda vec: vec[0])
pca_df['y'] = pca_df['pcaFeatures'].apply(lambda vec: vec[1])

# One large marker per centroid, coloured by cluster id.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(
    pca_df['x'],
    pca_df['y'],
    c=pca_df['cluster'],
    cmap='viridis',
    alpha=0.7,
    s=1000,
)

ax.set_title("Centroids PCA")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()

## Results

### Analytical Results

In [None]:
# Per-cluster totals and the percentage of flights with Flight_Cancelled == 1
# and Flight_Diverted == 1.
#
# `sum`, `count`, `when` and `col` are the pyspark.sql.functions versions
# imported at the top of the notebook (this `sum` is not the Python builtin).
#
# `.otherwise(0)` matters: `when(cond, 1)` alone yields NULL for non-matching
# rows, and summing only NULLs gives NULL, so a cluster without any cancelled
# (or diverted) flight would show NULL counts and percentages instead of 0.
cluster_stats = (
    result.groupBy("prediction")
    .agg(
        sum(when(col("Flight_Cancelled") == 1, 1).otherwise(0)).alias("Cancelled_Flights"),
        count("*").alias("Total_Flights"),
        sum(when(col("Flight_Diverted") == 1, 1).otherwise(0)).alias("Diverted_Flights"),
    )
    .withColumn("Cancelled_Percentage", (col("Cancelled_Flights") / col("Total_Flights")) * 100)
    .withColumn("Diverted_Percentage", (col("Diverted_Flights") / col("Total_Flights")) * 100)
)

cluster_stats.show()

In [None]:
# Count flights per (airline, cluster) pair in Spark, then move the small
# aggregated table to pandas.
airline_cluster_freq = result.groupBy("Operating_Carrier", "prediction").count()

acf = airline_cluster_freq.toPandas()


def _two_most_frequent(cluster_rows):
    """Return the two rows with the largest 'count' within one cluster's rows."""
    return cluster_rows.nlargest(2, 'count')


# For every cluster keep only its two most frequent airlines.
top_airlines_per_cluster = acf.groupby('prediction').apply(_two_most_frequent)

top_airlines_per_cluster

### Geographical Analysis

In [None]:
# Count flights per (origin, destination, cluster) combination in Spark,
# carrying the coordinates of both endpoints along.
grouped_df = result.groupBy("Origin_Airport", "latitude", "longitude", "Destination_Airport", "latitude_dest", "longitude_dest", "prediction").count()

# `airport_df` was used below (and in later cells) without ever being
# defined, which raises NameError on a fresh kernel. The aggregated route
# table is collected to pandas here; it has exactly the columns the pandas
# code below groups on.
airport_df = grouped_df.toPandas()

# Collapse destinations: total count per (origin airport, cluster).
sorted_df = airport_df.groupby(['Origin_Airport', 'latitude', 'longitude', 'prediction'], as_index=False).agg({
    'count': 'sum'})

# Order each airport's clusters by descending count, then keep the first row
# per airport, i.e. the cluster with the highest count for that origin.
sorted_df = sorted_df.sort_values(by=['Origin_Airport', 'count'], ascending=[True, False])
sorted_df = sorted_df.drop_duplicates(subset=['Origin_Airport'], keep='first')

In [None]:
# Aggregating and separating data into clusters

# NOTE(review): this recomputes `sorted_df` without the sort / drop_duplicates
# steps applied to it in the previous cell, so the de-duplicated version is
# overwritten here. Confirm this is intended.
sorted_df = airport_df.groupby(
    ['Origin_Airport', 'latitude', 'longitude', 'prediction'], as_index=False
).agg({'count': 'sum'})


def _top_rows_for_cluster(frame, cluster_id, n=500):
    """Return the `n` rows with the largest 'count' among the rows of `frame`
    whose 'prediction' equals `cluster_id`."""
    only_cluster = frame[frame['prediction'] == cluster_id]
    return (
        only_cluster
        .groupby('prediction', group_keys=False)
        .apply(lambda grp: grp.nlargest(n, 'count'))
    )


# One table per cluster, each holding that cluster's 500 highest-count rows.
repr_airports0 = _top_rows_for_cluster(airport_df, 0)

repr_airports1 = _top_rows_for_cluster(airport_df, 1)

repr_airports2 = _top_rows_for_cluster(airport_df, 2)

### Top Airports per Cluster

In [None]:
# Fresh base map with a fixed centre and zoom level.
m = folium.Map(zoom_start=4, location=[39.8283, -98.5795])

# [latitude, longitude] pairs of the rows selected for cluster 2.
coord_frame = repr_airports2[['latitude', 'longitude']]
airport_coords = coord_frame.values.tolist()

# Add the data points to the map as a heat-map layer.
heat_layer = HeatMap(airport_coords)
heat_layer.add_to(m)

In [None]:
# Bare expression as the last line of the cell: renders the heat map inline.
m

### Top Routes per Cluster

In [None]:
# NOTE(review): `top_airports_by_prediction` is not defined in any visible
# cell; it must be created before this cell runs. From its use below it needs
# the columns latitude, longitude, latitude_dest, longitude_dest, count and
# prediction.
m = folium.Map(location=[39.8283, -98.5795], zoom_start=4)

cluster_colors = ['blue', 'red', 'green']

# `min_counts` / `max_counts` were referenced without being defined anywhere.
# They are taken from the routes being plotted so that the log-scaled line
# weight spans 2.5 (least frequent route) to 7.5 (most frequent route).
min_counts = top_airports_by_prediction['count'].min()
max_counts = top_airports_by_prediction['count'].max()
log_min = np.log(min_counts)
log_span = np.log(max_counts) - log_min

# Iterate over the most frequent routes.
for _, row in top_airports_by_prediction.iterrows():
    origin = (row['latitude'], row['longitude'])
    destination = (row['latitude_dest'], row['longitude_dest'])
    frequency = row['count']
    cluster_origin = int(row['prediction'])
    color_origin = cluster_colors[cluster_origin % len(cluster_colors)]

    # When every route has the same count the span is 0; fall back to the
    # minimum weight instead of dividing by zero.
    relative = (np.log(frequency) - log_min) / log_span if log_span > 0 else 0.0

    folium.PolyLine(  # Line thickness grows with the route's frequency
        locations=[origin, destination],
        color='blue',
        weight=2.5 + 5 * relative,
        opacity=0.7
    ).add_to(m)

    # Both endpoints are marked in the colour of the route's cluster.
    folium.Marker(location=origin, popup="Origin", icon=folium.Icon(color=color_origin)).add_to(m)
    folium.Marker(location=destination, popup="Destination", icon=folium.Icon(color=color_origin)).add_to(m)

In [None]:
# Bare expression as the last line of the cell: renders the routes map inline.
m