<a href="https://colab.research.google.com/github/hifsakazmi/xai_ml_robot_telemetry/blob/main/AAI_ML_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Data Preprocessing


### 1.1. Loading Data and Initial Exploration

#### Download Dataset

In [1]:
import os
import zipfile
import gdown

In [2]:
def download_dataset(dataset_link, local_path="/content", dataset_name="drone_dataset"):
  """Download and unpack the dataset archive unless it is already on disk.

  Args:
    dataset_link: URL handed to ``gdown.download``.
    local_path: Directory that receives the zip file and into which the
      archive is extracted. Defaults to the Colab working directory.
    dataset_name: Name of the dataset folder inside ``local_path``; the
      temporary archive is stored as ``<dataset_name>.zip``.
      NOTE(review): assumes the archive unpacks into a top-level folder with
      this exact name - confirm against the actual zip layout.

  Returns:
    The path ``<local_path>/<dataset_name>``.

  Raises:
    FileNotFoundError: If no zip file exists at the expected location after
      the download call.
  """
  zip_path = os.path.join(local_path, f"{dataset_name}.zip")
  dataset_path = os.path.join(local_path, dataset_name)

  # Nothing to do when a previous run already extracted the data.
  if os.path.exists(dataset_path):
    print("Dataset already exists!")
    return dataset_path

  print("Downloading dataset from Google Drive...")

  # Download the file
  print("Downloading zip file...")
  gdown.download(dataset_link, zip_path, quiet=False)

  # Check if download was successful
  if not os.path.exists(zip_path):
    raise FileNotFoundError(f"Download failed! Zip file not found at {zip_path}")

  # Extract the zip file; the archive is removed even when extraction fails so
  # a corrupt download is never left behind.
  print("Extracting dataset...")
  try:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
      zip_ref.extractall(local_path)
  finally:
    os.remove(zip_path)

  print("Dataset downloaded and extracted successfully!")
  return dataset_path

In [3]:
# Fetch the archive (download_dataset skips the download when the dataset
# folder already exists) and list the extracted top-level entries.
dataset_link = 'https://drive.google.com/uc?id=1RLkIm9UwxqQiFXrR96ixikb4jYCIBQau'
dataset_path = download_dataset(dataset_link)
os.listdir(dataset_path)



Downloading dataset from Google Drive...
Downloading zip file...


Downloading...
From: https://drive.google.com/uc?id=1RLkIm9UwxqQiFXrR96ixikb4jYCIBQau
To: /content/drone_dataset.zip
100%|██████████| 1.87M/1.87M [00:00<00:00, 116MB/s]

Extracting dataset...
Dataset downloaded and extracted successfully!





['Malfunction-Drone', 'Dos-Drone', 'ReadMeForDataSet.txt', 'NormalFlight']

#### Load Data in Dataframes

In [4]:
import pandas as pd
import numpy as np

In [5]:
def load_all_csvs_from_dir(directory_path):
  """Read every ``.csv`` file directly inside a directory into one DataFrame.

  Files are read in sorted filename order so the row order of the result is
  the same on every run (``os.listdir`` itself returns entries in arbitrary
  order). A file that cannot be read is reported with ``print`` and skipped.

  Args:
    directory_path: Directory to scan (not recursive).

  Returns:
    The row-wise concatenation of all readable CSVs with a fresh
    ``RangeIndex``, or an empty DataFrame when none could be loaded.
  """
  all_dataframes = []
  for filename in sorted(os.listdir(directory_path)):
    if not filename.endswith('.csv'):
      continue
    filepath = os.path.join(directory_path, filename)
    try:
      all_dataframes.append(pd.read_csv(filepath))
    except Exception as e:
      # Best effort: one unreadable file should not abort the whole load.
      print(f"Error reading {filename}: {e}")

  if not all_dataframes:
    print(f"No CSV files found in '{directory_path}'. Returning empty DataFrame.")
    return pd.DataFrame()

  return pd.concat(all_dataframes, ignore_index=True)

#### Load Normal Flight Data

In [6]:
# Gather every CSV under the 'NormalFlight' folder into a single frame.
normal_flight_path = os.path.join(dataset_path, 'NormalFlight')
print(f"Loading CSVs from: {normal_flight_path}")
df_normal = load_all_csvs_from_dir(normal_flight_path)

# Preview the first rows, or report that nothing was loaded.
if df_normal.empty:
  print("No CSVs were loaded or the DataFrame is empty.")
else:
  display(df_normal.head())

Loading CSVs from: /content/drone_dataset/NormalFlight


Unnamed: 0,S.No,setpoint_raw-global_Time,setpoint_raw-global_header.seq,setpoint_raw-global_header.stamp.secs,setpoint_raw-global_latitude,setpoint_raw-global_longitude,setpoint_raw-global_altitude,battery_Time,battery_header.seq,battery_header.stamp.secs,...,state_guided,state_manual_input,state_system_status,RSSI_Time,RSSI_Quality,RSSI_Signal,CPU_Time,CPU_Percent,RAM_Time,Used_RAM_MB
0,0,1729315000.0,137,0,33.707308,73.019849,1.25,1729315000.0,14916.0,1729315000.0,...,0.0,1.0,5.0,1729315000.0,1.0,-40.0,1729315000.0,9.6,1729315000.0,5.9
1,1,1729315000.0,139,0,33.707308,73.019849,1.25,,,,...,,,,,,,,,,
2,2,1729315000.0,141,0,33.707308,73.019849,1.25,,,,...,,,,,,,,,,
3,3,1729315000.0,147,0,33.707308,73.019849,1.25,,,,...,,,,,,,,,,
4,4,1729315000.0,155,0,33.707308,73.019849,1.25,,,,...,,,,,,,,,,


In [7]:
# (rows, columns) of the combined NormalFlight frame.
df_normal.shape

(49800, 79)

#### Load Malfunction Drone Data

In [8]:
# Gather every CSV under the 'Malfunction-Drone' folder into a single frame.
malfunc_path = os.path.join(dataset_path, 'Malfunction-Drone')
print(f"Loading CSVs from: {malfunc_path}")
df_malfunction = load_all_csvs_from_dir(malfunc_path)

# Preview the first rows, or report that nothing was loaded.
if df_malfunction.empty:
  print("No CSVs were loaded or the DataFrame is empty.")
else:
  display(df_malfunction.head())

Loading CSVs from: /content/drone_dataset/Malfunction-Drone


Unnamed: 0,S.No,setpoint_raw-global_Time,setpoint_raw-global_header.seq,setpoint_raw-global_header.stamp.secs,setpoint_raw-global_latitude,setpoint_raw-global_longitude,setpoint_raw-global_altitude,battery_Time,battery_header.seq,battery_header.stamp.secs,...,state_guided,state_manual_input,state_system_status,RSSI_Time,RSSI_Quality,RSSI_Signal,CPU_Time,CPU_Percent,RAM_Time,Used_RAM_MB
0,0,1729688000.0,59,0,33.63709,72.991302,1.25,1729688000.0,21351.0,1729688000.0,...,0.0,1.0,5.0,1729688000.0,0.8,-54.0,1729688000.0,7.3,1729688000.0,7.6
1,1,1729688000.0,66,0,33.63709,72.991303,1.25,,,,...,,,,,,,,,,
2,2,1729688000.0,369,0,33.63709,72.991303,1.25,,,,...,,,,,,,,,,
3,3,1729688000.0,2111,0,33.63709,72.991303,1.25,,,,...,,,,,,,,,,
4,4,1729688000.0,2112,0,33.63709,72.991303,1.25,,,,...,,,,,,,,,,


In [9]:
# (rows, columns) of the combined Malfunction-Drone frame.
df_malfunction.shape

(18030, 79)

#### Load DoS Drone Data

In [10]:
# Gather every CSV under the 'Dos-Drone' folder into a single frame.
dos_drone_path = os.path.join(dataset_path, 'Dos-Drone')
print(f"Loading CSVs from: {dos_drone_path}")
df_dos = load_all_csvs_from_dir(dos_drone_path)

# Preview the first rows, or report that nothing was loaded.
if df_dos.empty:
  print("No CSVs were loaded or the DataFrame is empty.")
else:
  display(df_dos.head())

Loading CSVs from: /content/drone_dataset/Dos-Drone


Unnamed: 0,S.No,setpoint_raw-global_Time,setpoint_raw-global_header.seq,setpoint_raw-global_header.stamp.secs,setpoint_raw-global_latitude,setpoint_raw-global_longitude,setpoint_raw-global_altitude,battery_Time,battery_header.seq,battery_header.stamp.secs,...,state_guided,state_manual_input,state_system_status,RSSI_Time,RSSI_Quality,RSSI_Signal,CPU_Time,CPU_Percent,RAM_Time,Used_RAM_MB
0,0,1729405000.0,1,0,33.69785,73.016496,1.25,1729405000.0,4605.0,1729405000.0,...,1.0,1.0,5.0,1729405000.0,1.0,-39.0,1729405000.0,15.2,1729405000.0,5.6
1,1,1729405000.0,2,0,33.69785,73.016496,1.25,,,,...,,,,,,,,,,
2,2,1729405000.0,3,0,33.69785,73.016496,1.25,,,,...,,,,,,,,,,
3,3,1729405000.0,4,0,33.69785,73.016496,1.25,,,,...,,,,,,,,,,
4,4,1729405000.0,5,0,33.69785,73.016496,1.25,,,,...,,,,,,,,,,


In [11]:
# (rows, columns) of the combined Dos-Drone frame.
df_dos.shape

(19587, 79)

#### Observations
* The dataset contains three subfolders **NormalFlight**, **Malfunction-Drone** and **Dos-Drone**
* Each subfolder contains raw data from drones in CSV format.
* Data in each CSV has **79 columns** documenting the following information from different drone sensors
  * Setpoint: Where the drone is supposed to go (target GPS, altitude).
  * Battery: voltage, current, percentage, temperature.
  * Global Position: Where the drone actually is (local X,Y,Z and GPS lat/long) and its speed.
  * IMU Data: How the drone is tilted (orientation) and its rotation speed.
  * RC Out: Control signals sent to the motors.
  * VFR HUD: Pilot display info: speed, altitude, heading, climb rate.
  * State: System status: armed, guided, connected, etc. (1=True, 0=False).
  * RSSI: Signal strength for the radio control link.
  * System Resources: Computer CPU and RAM usage.
* Number of rows in each category of drone data:
  * **Normal Flight**: 49,800
  * **Malfunction Drone**: 18,030
  * **Dos Drone**: 19,587   

### Data Preprocessing

#### Add Label to Dataframes

In [12]:
# Tag every row with the flight category of the frame it belongs to.
for _frame, _category in (
    (df_normal, 'normal'),
    (df_dos, 'dos'),
    (df_malfunction, 'malfunction'),
):
  _frame['label'] = _category

#### Combine Dataframes


In [17]:
# Stack the three labelled frames row-wise under a fresh 0..n-1 index.
df_drones_all = pd.concat(
    [df_normal, df_malfunction, df_dos],
    ignore_index=True,
)

In [18]:
# Preview the first ten rows of the combined, labelled frame.
df_drones_all.head(10)

Unnamed: 0,S.No,setpoint_raw-global_Time,setpoint_raw-global_header.seq,setpoint_raw-global_header.stamp.secs,setpoint_raw-global_latitude,setpoint_raw-global_longitude,setpoint_raw-global_altitude,battery_Time,battery_header.seq,battery_header.stamp.secs,...,state_manual_input,state_system_status,RSSI_Time,RSSI_Quality,RSSI_Signal,CPU_Time,CPU_Percent,RAM_Time,Used_RAM_MB,label
0,0,1729315000.0,137,0,33.707308,73.019849,1.25,1729315000.0,14916.0,1729315000.0,...,1.0,5.0,1729315000.0,1.0,-40.0,1729315000.0,9.6,1729315000.0,5.9,normal
1,1,1729315000.0,139,0,33.707308,73.019849,1.25,,,,...,,,,,,,,,,normal
2,2,1729315000.0,141,0,33.707308,73.019849,1.25,,,,...,,,,,,,,,,normal
3,3,1729315000.0,147,0,33.707308,73.019849,1.25,,,,...,,,,,,,,,,normal
4,4,1729315000.0,155,0,33.707308,73.019849,1.25,,,,...,,,,,,,,,,normal
5,5,1729315000.0,170,0,33.707308,73.019849,1.25,,,,...,,,,,,,,,,normal
6,6,1729315000.0,198,0,33.707308,73.019849,1.25,,,,...,,,,,,,,,,normal
7,7,1729315000.0,208,0,33.707308,73.019849,1.25,,,,...,,,,,,,,,,normal
8,8,1729315000.0,214,0,33.707308,73.019849,1.25,,,,...,,,,,,,,,,normal
9,9,1729315000.0,218,0,33.707308,73.019849,1.25,,,,...,,,,,,,,,,normal


### Format Timestamps
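The timestamps in this dataset are stored as Unix epoch seconds. We convert every column whose name ends in `Time` or `secs` to pandas datetime values; entries below 1 second (for example the all-zero `setpoint_raw-global_header.stamp.secs` column) are treated as missing and become `NaT`.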

In [20]:
def format_timestamp_cols(df, min_valid_epoch=1):
  """Convert epoch-second columns of ``df`` to datetimes, in place.

  Every column whose name ends in ``'Time'`` or ``'secs'`` is parsed as Unix
  epoch seconds. Values that are missing, non-numeric, or smaller than
  ``min_valid_epoch`` become ``NaT``. Columns that already hold datetimes are
  left untouched, so calling the function again on the same frame is a no-op.

  Args:
    df: DataFrame to modify in place.
    min_valid_epoch: Smallest epoch value (in seconds) accepted as a real
      timestamp; anything below it is treated as a placeholder.

  Returns:
    None. ``df`` is modified in place.
  """
  for col in df.columns:
    if not str(col).endswith(('Time', 'secs')):
      continue
    # Already converted (e.g. the cell was re-run): casting datetimes back to
    # float would fail, so skip the column.
    if pd.api.types.is_datetime64_any_dtype(df[col]):
      continue

    # Coerce instead of astype(float) so stray non-numeric entries become NaN
    # rather than aborting the whole conversion.
    seconds = pd.to_numeric(df[col], errors='coerce')
    placeholder_mask = seconds < min_valid_epoch

    df[col] = pd.to_datetime(seconds, unit='s', errors='coerce')
    # Zero / near-zero stamps would otherwise show up as 1970-01-01.
    df.loc[placeholder_mask, col] = pd.NaT

In [21]:
# Converts the timestamp columns of df_drones_all in place (nothing is returned).
format_timestamp_cols(df_drones_all)

### Exploratory Data Analysis

In [23]:
# Preview the first rows after timestamp conversion.
df_drones_all.head()

Unnamed: 0,S.No,setpoint_raw-global_Time,setpoint_raw-global_header.seq,setpoint_raw-global_header.stamp.secs,setpoint_raw-global_latitude,setpoint_raw-global_longitude,setpoint_raw-global_altitude,battery_Time,battery_header.seq,battery_header.stamp.secs,...,state_manual_input,state_system_status,RSSI_Time,RSSI_Quality,RSSI_Signal,CPU_Time,CPU_Percent,RAM_Time,Used_RAM_MB,label
0,0,2024-10-19 05:19:29.110145092,137,NaT,33.707308,73.019849,1.25,2024-10-19 05:19:06.615720987,14916.0,2024-10-19 05:19:06,...,1.0,5.0,2024-10-19 05:19:12.620326996,1.0,-40.0,2024-10-19 05:19:13,9.6,2024-10-19 05:19:16.838260889,5.9,normal
1,1,2024-10-19 05:19:29.122611761,139,NaT,33.707308,73.019849,1.25,NaT,,NaT,...,,,NaT,,,NaT,,NaT,,normal
2,2,2024-10-19 05:19:29.122659206,141,NaT,33.707308,73.019849,1.25,NaT,,NaT,...,,,NaT,,,NaT,,NaT,,normal
3,3,2024-10-19 05:19:29.123682499,147,NaT,33.707308,73.019849,1.25,NaT,,NaT,...,,,NaT,,,NaT,,NaT,,normal
4,4,2024-10-19 05:19:29.123990297,155,NaT,33.707308,73.019849,1.25,NaT,,NaT,...,,,NaT,,,NaT,,NaT,,normal


In [22]:
# Per-column dtype and non-null count for the combined frame.
df_drones_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87417 entries, 0 to 87416
Data columns (total 80 columns):
 #   Column                                         Non-Null Count  Dtype         
---  ------                                         --------------  -----         
 0   S.No                                           87417 non-null  int64         
 1   setpoint_raw-global_Time                       87417 non-null  datetime64[ns]
 2   setpoint_raw-global_header.seq                 87417 non-null  int64         
 3   setpoint_raw-global_header.stamp.secs          0 non-null      datetime64[ns]
 4   setpoint_raw-global_latitude                   87417 non-null  float64       
 5   setpoint_raw-global_longitude                  87417 non-null  float64       
 6   setpoint_raw-global_altitude                   87417 non-null  float64       
 7   battery_Time                                   4349 non-null   datetime64[ns]
 8   battery_header.seq                             4349 non-