# Hierarchical clustering

In [2]:
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering

### Data Preprocessing

The dataset contains various columns such as Passenger ID, First Name, Last Name, Gender, Age, Nationality, information about the airports, departure date, pilot name, and flight status. For hierarchical clustering, we need to select numerical or categorical features that are relevant.

In [3]:
# Read the airline passenger CSV into a DataFrame.
# NOTE(review): the path is absolute and looks Colab-specific ('/content/...');
# confirm the file location before running elsewhere.
file_path = '/content/Airline Dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to check that the columns parsed as expected
data.head(n=5)


Unnamed: 0,Passenger ID,First Name,Last Name,Gender,Age,Nationality,Airport Name,Airport Country Code,Country Name,Airport Continent,Continents,Departure Date,Arrival Airport,Pilot Name,Flight Status
0,10856,Edithe,Leggis,Female,62,Japan,Coldfoot Airport,US,United States,NAM,North America,6/28/2022,CXF,Edithe Leggis,On Time
1,43872,Elwood,Catt,Male,62,Nicaragua,Kugluktuk Airport,CA,Canada,NAM,North America,12/26/2022,YCO,Elwood Catt,On Time
2,42633,Darby,Felgate,Male,67,Russia,Grenoble-Isère Airport,FR,France,EU,Europe,1/18/2022,GNB,Darby Felgate,On Time
3,78493,Dominica,Pyle,Female,71,China,Ottawa / Gatineau Airport,CA,Canada,NAM,North America,9/16/2022,YND,Dominica Pyle,Delayed
4,82072,Bay,Pencost,Male,21,China,Gillespie Field,US,United States,NAM,North America,2/25/2022,SEE,Bay Pencost,On Time


### Hierarchical clustering
We will use the scipy library to perform this task. The process involves:

1. Calculating a distance matrix to understand the distance between each pair of points.
2. Using an agglomerative approach to cluster the data points hierarchically.
3. Creating a dendrogram to visualize these hierarchical clusters.

In [4]:
# Pick the two numeric columns that feed the clustering.
# NOTE(review): 'Passenger ID' looks like an identifier rather than a meaningful
# feature -- confirm it should influence the distance between passengers.
selected_columns = data.loc[:, ['Age', 'Passenger ID']]

# Standardize each column (zero mean, unit variance) so that neither one
# dominates the distance computation purely because of its scale.
scaler = StandardScaler().fit(selected_columns)
scaled_data = scaler.transform(selected_columns)

In [None]:
# Hierarchical linkage needs the distance between every pair of rows, so memory
# grows quadratically with the row count. Cap how many rows feed the dendrogram;
# inputs at or below the cap are used unchanged.
MAX_DENDROGRAM_ROWS = 5000    # 5000 rows -> ~12.5M pairwise distances (~100 MB as float64)
MAX_DENDROGRAM_LEAVES = 50    # leaves drawn on the plot; deeper merges are collapsed

observations = np.asarray(scaled_data)
if len(observations) > MAX_DENDROGRAM_ROWS:
    # Fixed seed: the same rows are drawn on every Restart & Run All.
    rng = np.random.default_rng(42)
    keep = np.sort(rng.choice(len(observations), size=MAX_DENDROGRAM_ROWS, replace=False))
    # After sampling, leaf indices on the plot refer to positions in this sample,
    # not to row positions in the full dataset.
    observations = observations[keep]

# Compute the linkage matrix
linked = linkage(observations, method='ward')

# Plot the dendrogram. `truncate_mode='lastp'` keeps only the last
# MAX_DENDROGRAM_LEAVES merged clusters as leaves so the x-axis stays readable;
# `show_leaf_counts` then labels each collapsed leaf with its size in parentheses.
# When there are fewer observations than MAX_DENDROGRAM_LEAVES the full tree is drawn.
plt.figure(figsize=(10, 7))
dendrogram(
    linked,
    orientation='top',
    distance_sort='descending',
    show_leaf_counts=True,
    truncate_mode='lastp',
    p=MAX_DENDROGRAM_LEAVES,
)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample Index or (Cluster Size)')
plt.ylabel('Distance')
plt.show()


### Analysis and Interpretation
**Cluster identification**: The dendrogram shows how each cluster is formed by merging smaller clusters. You can decide on the number of clusters by setting a distance threshold and cutting the dendrogram at that height.

**Interpretation**: The clusters formed can be analyzed to understand common characteristics or patterns within each cluster. Because only `Age` and `Passenger ID` are used as features here, the clusters reflect similarity in those two columns; to obtain groups of passengers with similar ages, genders, and nationalities, the categorical columns would first need to be encoded and added to the feature set.

In [None]:
# Choose the number of clusters (e.g. read off the dendrogram by picking a cut height)
num_clusters = 3

# Ward linkage only supports Euclidean distance, which is the estimator's default,
# so no distance argument is passed. The former `affinity=` keyword was deprecated
# in scikit-learn 1.2 and removed in 1.4, where passing it raises a TypeError.
cluster = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')

# fit_predict fits the model and returns one label per row of `scaled_data`
cluster_labels = cluster.fit_predict(scaled_data)

# Attach the labels to the original rows; this relies on `scaled_data` having
# been built from `data` without dropping or reordering rows.
data['Cluster_Labels'] = cluster_labels

# Summarize each cluster by the mean of its numeric columns. `numeric_only=True`
# is required on pandas >= 2.0, where averaging the string columns
# (names, nationality, ...) would otherwise raise a TypeError.
print(data.groupby('Cluster_Labels').mean(numeric_only=True))