## Incident Dataset Cleaning & Normalizing Process

In [1]:
import re

import pandas as pd


In [2]:
# Read the raw incident records from the CSV file
raw_csv_path = 'Aircraft_Incident_Dataset.csv'
df = pd.read_csv(raw_csv_path)

# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,Incident_Date,Aircaft_Model,Aircaft_Registration,Aircaft_Operator,Aircaft_Nature,Incident_Category,Incident_Cause(es),Incident_Location,Aircaft_Damage_Type,Date,...,Onboard_Crew,Onboard_Passengers,Onboard_Total,Fatalities,Aircaft_First_Flight,Aircraft_Phase,Departure_Airport,Destination_Airport,Ground_Casualties,Collision_Casualties
0,03-JAN-2022,British Aerospace 4121 Jetstream 41,ZS-NRJ,SA Airlink,Domestic Non Scheduled Passenger,Accident | repairable-damage,"Airplane - Engines, Airplane - Engines - Prop/...",near Venetia Mine...,Substantial,Monday 3 January 2022,...,Fatalities: 0 / Occupants: 3,Fatalities: 0 / Occupants: 4,Fatalities: 0 / Occupants: 7,0,1995-05-19 (26 years 8 months),Landing (LDG),Johannesburg-O.R. Tambo International Airport ...,"Venetia Mine Airport (FAVM) , South Africa",,
1,04-JAN-2022,British Aerospace 3101 Jetstream 31,HR-AYY,LANHSA,Domestic Scheduled Passenger,Accident | repairable-damage,"Airplane - Undercarriage, Airplane - Undercarr...",Roatán-Juan ...,Substantial,Tuesday 4 January 2022,...,Fatalities: 0 / Occupants:,Fatalities: 0 / Occupants:,Fatalities: 0 / Occupants: 19,0,1985,Landing (LDG),La Ceiba-Goloson International Airport (LCE/MH...,Roatán-Juan Manuel Gálvez International Airpor...,,
2,05-JAN-2022,Boeing 737-4H6,EP-CAP,Caspian Airlines,Domestic Scheduled Passenger,Accident | repairable-damage,"Airplane - Undercarriage, Airplane - Undercarr...",Isfahan-Shah...,Substantial,Wednesday 5 January 2022,...,Fatalities: 0 / Occupants:,Fatalities: 0 / Occupants:,Fatalities: 0 / Occupants: 116,0,1992-09-18 (29 years 4 months),Landing (LDG),"Mashhad Airport (MHD/OIMM) , Iran","Isfahan-Shahid Beheshti Airport (IFN/OIFM) , Iran",,
3,08-JAN-2022,Tupolev Tu-204-100C,RA-64032,"Cainiao, opb Aviastar-TU",Cargo,Accident | hull-loss,"Cargo - Fire/smoke, Result - Damaged on the gr...",Hangzhou-Xia...,Destroyed,Saturday 8 January 2022,...,Fatalities: 0 / Occupants: 8,Fatalities: 0 / Occupants: 0,Fatalities: 0 / Occupants: 8,0,2002-07-18 (19 years 6 months),Standing (STD),Hangzhou-Xiaoshan International Airport (HGH/Z...,"Novosibirsk-Tolmachevo Airport (OVB/UNNT) , Ru...",,
4,12-JAN-2022,Beechcraft 200 Super King Air,,private,Illegal Flight,"Criminal occurrence (sabotage, shoot down) | h...",Result - Damaged on the ground,"Machakilha, ...",Damaged beyond repair,Wednesday 12 January 2022,...,Fatalities: 0 / Occupants: 0,Fatalities: 0 / Occupants: 0,Fatalities: 0 / Occupants: 0,0,,Standing (STD),?,?,,


In [3]:
# Keep only the rows in which every cell, rendered as text, is pure ASCII.
# NOTE(review): this discards whole records that contain any non-ASCII
# character (e.g. accented place names) - confirm that this is intended.

def _row_is_ascii(row):
    """Return True when the string form of every cell in `row` is ASCII-only."""
    return all(str(cell).isascii() for cell in row)

ascii_only_rows = df.apply(_row_is_ascii, axis=1)
df = df[ascii_only_rows]

# Number of records that passed the filter
num_records = len(df)
num_records


20536

In [4]:
# Columns that are discarded before any further processing
columns_to_remove = [
    # --- redundant / identifying columns ---
    "Date",  # Duplicate of Incident Date
    "Arit",  # Duplicate of Incident Date
    "Aircaft_Registration",  # Aircraft Registration
    # --- mostly empty columns ---
    "Ground_Casualties",  # 99% null
    "Collision_Casualties",  # 99% null
    "Time",  # 61% null
    "Aircaft_Engines",  # 55% null
    # --- columns not required for this analysis ---
    "Incident_Cause(es)",
    "Onboard_Crew",
    "Onboard_Passengers",
    "Aircaft_First_Flight",
]

# Continue with a frame that no longer carries those columns
df = df.drop(columns=columns_to_remove)

In [5]:
# Keep only the records whose 'Aircaft_Nature' mentions "Passenger"
# (missing values count as non-matching), then discard that column.
is_passenger_flight = df['Aircaft_Nature'].str.contains("Passenger", na=False)
df = df[is_passenger_flight].drop(columns=['Aircaft_Nature'])

In [6]:
# Impute missing text values with the sentinel 'Unknown'.
# The filled frame is assigned back to `df` instead of calling
# `df[column].fillna(..., inplace=True)`: that form is a chained in-place
# assignment, which emits a FutureWarning on pandas 2.x and leaves `df`
# untouched once Copy-on-Write is active.
object_columns = df.select_dtypes(include='object').columns
df = df.fillna({column: 'Unknown' for column in object_columns})

# Numeric columns are deliberately left alone: pandas already represents
# their missing entries as NaN, so there is nothing to impute for them.


In [7]:
# Extract only the country name from an airport description; used to build
# the departure-country and destination-country columns.
def extract_country_from_airport(airport_str):
    """Return the country part of an airport description.

    The country is taken to be the text after the last comma, e.g.
    "Mashhad Airport (MHD/OIMM) , Iran" -> "Iran".

    Parameters
    ----------
    airport_str : object
        Airport description; normally a string, but missing values
        (NaN / None) and other non-string objects are tolerated.

    Returns
    -------
    str
        The country with surrounding whitespace removed, or "Unknown" when
        the input is not a string, contains no comma, or has nothing after
        its last comma.
    """
    # Non-text input (NaN, None, numbers) cannot carry a country name.
    if not isinstance(airport_str, str) or ',' not in airport_str:
        return "Unknown"
    # Split on the bare comma (the same character the guard checks for) so a
    # missing space after the comma cannot defeat the split.
    country = airport_str.rsplit(',', 1)[-1].strip()
    return country or "Unknown"

# Derive one country column from each airport column
country_source_columns = {
    'Departure_Country': 'Departure_Airport',
    'Destination_Country': 'Destination_Airport',
}
for country_column, airport_column in country_source_columns.items():
    df[country_column] = df[airport_column].apply(extract_country_from_airport)

df.head()

Unnamed: 0,Incident_Date,Aircaft_Model,Aircaft_Operator,Incident_Category,Incident_Location,Aircaft_Damage_Type,Onboard_Total,Fatalities,Aircraft_Phase,Departure_Airport,Destination_Airport,Departure_Country,Destination_Country
0,03-JAN-2022,British Aerospace 4121 Jetstream 41,SA Airlink,Accident | repairable-damage,near Venetia Mine...,Substantial,Fatalities: 0 / Occupants: 7,0,Landing (LDG),Johannesburg-O.R. Tambo International Airport ...,"Venetia Mine Airport (FAVM) , South Africa",South Africa,South Africa
2,05-JAN-2022,Boeing 737-4H6,Caspian Airlines,Accident | repairable-damage,Isfahan-Shah...,Substantial,Fatalities: 0 / Occupants: 116,0,Landing (LDG),"Mashhad Airport (MHD/OIMM) , Iran","Isfahan-Shahid Beheshti Airport (IFN/OIFM) , Iran",Iran,Iran
5,22-JAN-2022,Airbus A320-232,JetBlue Airways,Accident | repairable-damage,Hayden-Yampa...,Substantial,Fatalities: 0 / Occupants:,0,Takeoff (TOF),"Hayden-Yampa Valley Airport, CO (HDN/KHDN) , U...","Fort Lauderdale International Airport, FL (FLL...",United States of America,United States of America
11,15-FEB-2022,Britten-Norman BN-2A-9 Islander,Air Flamenco,Accident | repairable-damage,Culebra Airp...,Substantial,Fatalities: 0 / Occupants: 3,0,Landing (LDG),San Juan-Fernando Luis Ribas Dominicci Airport...,"Culebra Airport (CPX/TJCP) , Puerto Rico",Puerto Rico,Puerto Rico
14,09-JAN-2021,Boeing 737-524 (WL),Sriwijaya Air,Accident | hull-loss,near Jakarta-Soek...,Destroyed,Fatalities: 62 / Occupants: 62,62,En route (ENR),Jakarta-Soekarno-Hatta International Airport (...,"Pontianak Airport (PNK/WIOO) , Indonesia",Indonesia,Indonesia


In [8]:
# Remove the 'Departure_Airport' and 'Destination_Airport' columns
df = df.drop(columns=['Departure_Airport', 'Destination_Airport'])
df.head()

Unnamed: 0,Incident_Date,Aircaft_Model,Aircaft_Operator,Incident_Category,Incident_Location,Aircaft_Damage_Type,Onboard_Total,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country
0,03-JAN-2022,British Aerospace 4121 Jetstream 41,SA Airlink,Accident | repairable-damage,near Venetia Mine...,Substantial,Fatalities: 0 / Occupants: 7,0,Landing (LDG),South Africa,South Africa
2,05-JAN-2022,Boeing 737-4H6,Caspian Airlines,Accident | repairable-damage,Isfahan-Shah...,Substantial,Fatalities: 0 / Occupants: 116,0,Landing (LDG),Iran,Iran
5,22-JAN-2022,Airbus A320-232,JetBlue Airways,Accident | repairable-damage,Hayden-Yampa...,Substantial,Fatalities: 0 / Occupants:,0,Takeoff (TOF),United States of America,United States of America
11,15-FEB-2022,Britten-Norman BN-2A-9 Islander,Air Flamenco,Accident | repairable-damage,Culebra Airp...,Substantial,Fatalities: 0 / Occupants: 3,0,Landing (LDG),Puerto Rico,Puerto Rico
14,09-JAN-2021,Boeing 737-524 (WL),Sriwijaya Air,Accident | hull-loss,near Jakarta-Soek...,Destroyed,Fatalities: 62 / Occupants: 62,62,En route (ENR),Indonesia,Indonesia


In [9]:
# Classify a record as a domestic, international, or unknown flight
def determine_flight_type(row):
    """Return the flight type for one record.

    `row` must support lookup by the keys 'Departure_Country' and
    'Destination_Country'. The result is 'Unknown' if either country is
    'Unknown', 'Domestic' if both countries are equal, and 'International'
    otherwise.
    """
    departure = row['Departure_Country']
    destination = row['Destination_Country']

    # An unknown endpoint makes the classification impossible.
    if departure == 'Unknown' or destination == 'Unknown':
        return 'Unknown'

    return 'Domestic' if departure == destination else 'International'

# Build the 'Flight_Type' column by classifying each record row by row
flight_types = df.apply(determine_flight_type, axis='columns')
df['Flight_Type'] = flight_types


In [10]:
# Preview the first rows of the frame
df.head()

Unnamed: 0,Incident_Date,Aircaft_Model,Aircaft_Operator,Incident_Category,Incident_Location,Aircaft_Damage_Type,Onboard_Total,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type
0,03-JAN-2022,British Aerospace 4121 Jetstream 41,SA Airlink,Accident | repairable-damage,near Venetia Mine...,Substantial,Fatalities: 0 / Occupants: 7,0,Landing (LDG),South Africa,South Africa,Domestic
2,05-JAN-2022,Boeing 737-4H6,Caspian Airlines,Accident | repairable-damage,Isfahan-Shah...,Substantial,Fatalities: 0 / Occupants: 116,0,Landing (LDG),Iran,Iran,Domestic
5,22-JAN-2022,Airbus A320-232,JetBlue Airways,Accident | repairable-damage,Hayden-Yampa...,Substantial,Fatalities: 0 / Occupants:,0,Takeoff (TOF),United States of America,United States of America,Domestic
11,15-FEB-2022,Britten-Norman BN-2A-9 Islander,Air Flamenco,Accident | repairable-damage,Culebra Airp...,Substantial,Fatalities: 0 / Occupants: 3,0,Landing (LDG),Puerto Rico,Puerto Rico,Domestic
14,09-JAN-2021,Boeing 737-524 (WL),Sriwijaya Air,Accident | hull-loss,near Jakarta-Soek...,Destroyed,Fatalities: 62 / Occupants: 62,62,En route (ENR),Indonesia,Indonesia,Domestic


In [11]:
# Number of records currently in the filtered dataset
num_records = len(df)
num_records

5537

In [12]:
# Aircraft model families of interest
aircraft_models = [
    "Boeing 777",
    "Boeing 767",
    "Sukhoi Superjet 100",
    "Airbus A320",
    "Airbus A321",
    "Airbus A319",
    "Boeing 737",
    "Cessna 208 Caravan",
    "Bombardier CRJ",
]

# Check whether a model name belongs to one of the families above
def is_relevant_model(model):
    """Return True if any entry of `aircraft_models` is a substring of `model`."""
    for model_str in aircraft_models:
        if model_str in model:
            return True
    return False

# Keep only the rows whose 'Aircaft_Model' matches one of the specified models
relevant_model_rows = df['Aircaft_Model'].apply(is_relevant_model)
df = df[relevant_model_rows]

# Preview the first rows of the filtered frame
df.head()


Unnamed: 0,Incident_Date,Aircaft_Model,Aircaft_Operator,Incident_Category,Incident_Location,Aircaft_Damage_Type,Onboard_Total,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type
2,05-JAN-2022,Boeing 737-4H6,Caspian Airlines,Accident | repairable-damage,Isfahan-Shah...,Substantial,Fatalities: 0 / Occupants: 116,0,Landing (LDG),Iran,Iran,Domestic
5,22-JAN-2022,Airbus A320-232,JetBlue Airways,Accident | repairable-damage,Hayden-Yampa...,Substantial,Fatalities: 0 / Occupants:,0,Takeoff (TOF),United States of America,United States of America,Domestic
14,09-JAN-2021,Boeing 737-524 (WL),Sriwijaya Air,Accident | hull-loss,near Jakarta-Soek...,Destroyed,Fatalities: 62 / Occupants: 62,62,En route (ENR),Indonesia,Indonesia,Domestic
30,10-FEB-2021,Airbus A320-214 (WL),Flyadeal,"Criminal occurrence (sabotage, shoot down) | r...",Abha Interna...,Substantial,Fatalities: 0 / Occupants:,0,Standing (STD),Unknown,Unknown,Unknown
49,18-MAR-2021,Airbus A320-232,Viva Aerobus,Accident | repairable-damage,Puerto Valla...,Substantial,Fatalities: 0 / Occupants: 127,0,Taxi (TXI),Mexico,Mexico,Domestic


In [13]:
# Number of records currently in the filtered dataset
num_records = len(df)
num_records

566

Since the number of records is small, how about linking the datasets through the aircraft model?

In [14]:
# Collapse a detailed model name to its model family
def rename_aircraft_model(model):
    """Return the first entry of `aircraft_models` contained in `model`.

    If no entry is a substring of `model`, `model` is returned unchanged.
    """
    return next(
        (model_str for model_str in aircraft_models if model_str in model),
        model,
    )

# Replace every value of 'Aircaft_Model' with its renamed form
renamed_models = df['Aircaft_Model'].apply(rename_aircraft_model)
df['Aircaft_Model'] = renamed_models

# Preview the first rows of the frame
df.head()

Unnamed: 0,Incident_Date,Aircaft_Model,Aircaft_Operator,Incident_Category,Incident_Location,Aircaft_Damage_Type,Onboard_Total,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type
2,05-JAN-2022,Boeing 737,Caspian Airlines,Accident | repairable-damage,Isfahan-Shah...,Substantial,Fatalities: 0 / Occupants: 116,0,Landing (LDG),Iran,Iran,Domestic
5,22-JAN-2022,Airbus A320,JetBlue Airways,Accident | repairable-damage,Hayden-Yampa...,Substantial,Fatalities: 0 / Occupants:,0,Takeoff (TOF),United States of America,United States of America,Domestic
14,09-JAN-2021,Boeing 737,Sriwijaya Air,Accident | hull-loss,near Jakarta-Soek...,Destroyed,Fatalities: 62 / Occupants: 62,62,En route (ENR),Indonesia,Indonesia,Domestic
30,10-FEB-2021,Airbus A320,Flyadeal,"Criminal occurrence (sabotage, shoot down) | r...",Abha Interna...,Substantial,Fatalities: 0 / Occupants:,0,Standing (STD),Unknown,Unknown,Unknown
49,18-MAR-2021,Airbus A320,Viva Aerobus,Accident | repairable-damage,Puerto Valla...,Substantial,Fatalities: 0 / Occupants: 127,0,Taxi (TXI),Mexico,Mexico,Domestic


In [15]:
# Preview the first rows of the frame
df.head()

Unnamed: 0,Incident_Date,Aircaft_Model,Aircaft_Operator,Incident_Category,Incident_Location,Aircaft_Damage_Type,Onboard_Total,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type
2,05-JAN-2022,Boeing 737,Caspian Airlines,Accident | repairable-damage,Isfahan-Shah...,Substantial,Fatalities: 0 / Occupants: 116,0,Landing (LDG),Iran,Iran,Domestic
5,22-JAN-2022,Airbus A320,JetBlue Airways,Accident | repairable-damage,Hayden-Yampa...,Substantial,Fatalities: 0 / Occupants:,0,Takeoff (TOF),United States of America,United States of America,Domestic
14,09-JAN-2021,Boeing 737,Sriwijaya Air,Accident | hull-loss,near Jakarta-Soek...,Destroyed,Fatalities: 62 / Occupants: 62,62,En route (ENR),Indonesia,Indonesia,Domestic
30,10-FEB-2021,Airbus A320,Flyadeal,"Criminal occurrence (sabotage, shoot down) | r...",Abha Interna...,Substantial,Fatalities: 0 / Occupants:,0,Standing (STD),Unknown,Unknown,Unknown
49,18-MAR-2021,Airbus A320,Viva Aerobus,Accident | repairable-damage,Puerto Valla...,Substantial,Fatalities: 0 / Occupants: 127,0,Taxi (TXI),Mexico,Mexico,Domestic


In [17]:
def extract_occupant_number(onboard_str):
    """Parse the occupant count from an onboard summary string.

    Parameters
    ----------
    onboard_str : object
        Text such as "Fatalities: 0 / Occupants: 116". Non-string input
        (NaN, None, numbers) is tolerated.

    Returns
    -------
    int or None
        The number that follows "Occupants: ", or None when the input is not
        a string or carries no such number (e.g. "Fatalities: 0 / Occupants:").
    """
    # re.search raises TypeError on non-string input such as NaN, so guard first.
    if not isinstance(onboard_str, str):
        return None
    match = re.search(r'Occupants: (\d+)', onboard_str)
    return int(match.group(1)) if match else None

# Parse the occupant count out of each 'Onboard_Total' entry
occupant_counts = df['Onboard_Total'].apply(extract_occupant_number)
df['Occupants'] = occupant_counts


In [18]:
# Remove the 'Onboard_Total' column and preview the result
df = df.drop(columns=['Onboard_Total'])
df.head()

Unnamed: 0,Incident_Date,Aircaft_Model,Aircaft_Operator,Incident_Category,Incident_Location,Aircaft_Damage_Type,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type,Occupants
2,05-JAN-2022,Boeing 737,Caspian Airlines,Accident | repairable-damage,Isfahan-Shah...,Substantial,0,Landing (LDG),Iran,Iran,Domestic,116.0
5,22-JAN-2022,Airbus A320,JetBlue Airways,Accident | repairable-damage,Hayden-Yampa...,Substantial,0,Takeoff (TOF),United States of America,United States of America,Domestic,
14,09-JAN-2021,Boeing 737,Sriwijaya Air,Accident | hull-loss,near Jakarta-Soek...,Destroyed,62,En route (ENR),Indonesia,Indonesia,Domestic,62.0
30,10-FEB-2021,Airbus A320,Flyadeal,"Criminal occurrence (sabotage, shoot down) | r...",Abha Interna...,Substantial,0,Standing (STD),Unknown,Unknown,Unknown,
49,18-MAR-2021,Airbus A320,Viva Aerobus,Accident | repairable-damage,Puerto Valla...,Substantial,0,Taxi (TXI),Mexico,Mexico,Domestic,127.0


In [19]:
# Split 'Incident_Category' ("<type> | <damage>") into two new columns.
# Splitting on the first '|' only (n=1) yields at most two parts per value,
# and stripping removes the blanks around the separator so the stored values
# are e.g. 'Accident' / 'hull-loss' instead of 'Accident ' / ' hull-loss'.
category_parts = df['Incident_Category'].str.split('|', n=1)
df['Incident_Type'] = category_parts.str[0].str.strip()
# Values without a '|' have no second part and get a missing value here.
df['Incident_Damage'] = category_parts.str[1].str.strip()

# Display the first few rows of the updated dataframe to verify the changes
df.head()

Unnamed: 0,Incident_Date,Aircaft_Model,Aircaft_Operator,Incident_Category,Incident_Location,Aircaft_Damage_Type,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type,Occupants,Incident_Type,Incident_Damage
2,05-JAN-2022,Boeing 737,Caspian Airlines,Accident | repairable-damage,Isfahan-Shah...,Substantial,0,Landing (LDG),Iran,Iran,Domestic,116.0,Accident,repairable-damage
5,22-JAN-2022,Airbus A320,JetBlue Airways,Accident | repairable-damage,Hayden-Yampa...,Substantial,0,Takeoff (TOF),United States of America,United States of America,Domestic,,Accident,repairable-damage
14,09-JAN-2021,Boeing 737,Sriwijaya Air,Accident | hull-loss,near Jakarta-Soek...,Destroyed,62,En route (ENR),Indonesia,Indonesia,Domestic,62.0,Accident,hull-loss
30,10-FEB-2021,Airbus A320,Flyadeal,"Criminal occurrence (sabotage, shoot down) | r...",Abha Interna...,Substantial,0,Standing (STD),Unknown,Unknown,Unknown,,"Criminal occurrence (sabotage, shoot down)",repairable-damage
49,18-MAR-2021,Airbus A320,Viva Aerobus,Accident | repairable-damage,Puerto Valla...,Substantial,0,Taxi (TXI),Mexico,Mexico,Domestic,127.0,Accident,repairable-damage


In [20]:
# 'Incident_Category' is removed from the frame
df = df.drop(columns='Incident_Category')

In [21]:
# Treat the 'Unknown' sentinel as a missing value ...
with_missing_marked = df.replace('Unknown', pd.NA)
# ... and discard every row that has a missing value in any column
df = with_missing_marked.dropna()

# Preview the first rows of the cleaned frame
df.head()

Unnamed: 0,Incident_Date,Aircaft_Model,Aircaft_Operator,Incident_Location,Aircaft_Damage_Type,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type,Occupants,Incident_Type,Incident_Damage
2,05-JAN-2022,Boeing 737,Caspian Airlines,Isfahan-Shah...,Substantial,0,Landing (LDG),Iran,Iran,Domestic,116.0,Accident,repairable-damage
14,09-JAN-2021,Boeing 737,Sriwijaya Air,near Jakarta-Soek...,Destroyed,62,En route (ENR),Indonesia,Indonesia,Domestic,62.0,Accident,hull-loss
49,18-MAR-2021,Airbus A320,Viva Aerobus,Puerto Valla...,Substantial,0,Taxi (TXI),Mexico,Mexico,Domestic,127.0,Accident,repairable-damage
117,23-SEP-2021,Airbus A321,Hawaiian Airlines,Honolulu-Dan...,Substantial,0,Landing (LDG),United States of America,United States of America,Domestic,134.0,Accident,repairable-damage
165,08-JAN-2020,Boeing 737,Ukraine International Airlines,near Sabashahr,Destroyed,176,En route (ENR),Iran,Ukraine,International,176.0,"Criminal occurrence (sabotage, shoot down)",hull-loss


This is the clean dataset (a CSV export might also be added at the very end). The process of normalizing the dataset follows below.

In [22]:
# Export the cleaned dataset to CSV; index=False leaves the row index out of the file
df.to_csv('Clean Incident Dataset.csv', index=False)

In [23]:
# Total number of records in the clean dataset
total_records_cleaned = len(df)
total_records_cleaned

399

In [24]:
# Number of records whose flight type is 'International'
is_international = df['Flight_Type'] == 'International'
international_flights_count = int(is_international.sum())
international_flights_count


146

In [25]:
# Parse 'Incident_Date' (day-monthname-year text) into datetimes;
# values that do not match the format become NaT (errors='coerce')
parsed_dates = pd.to_datetime(df['Incident_Date'], format='%d-%b-%Y', errors='coerce')
df['Incident_Date'] = parsed_dates

# Break the date into separate year / month / day columns
df['Incident_Year'] = parsed_dates.dt.year
df['Incident_Month'] = parsed_dates.dt.month
df['Incident_Day'] = parsed_dates.dt.day

# Preview the first rows of the updated frame
df.head()

Unnamed: 0,Incident_Date,Aircaft_Model,Aircaft_Operator,Incident_Location,Aircaft_Damage_Type,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type,Occupants,Incident_Type,Incident_Damage,Incident_Year,Incident_Month,Incident_Day
2,2022-01-05,Boeing 737,Caspian Airlines,Isfahan-Shah...,Substantial,0,Landing (LDG),Iran,Iran,Domestic,116.0,Accident,repairable-damage,2022,1,5
14,2021-01-09,Boeing 737,Sriwijaya Air,near Jakarta-Soek...,Destroyed,62,En route (ENR),Indonesia,Indonesia,Domestic,62.0,Accident,hull-loss,2021,1,9
49,2021-03-18,Airbus A320,Viva Aerobus,Puerto Valla...,Substantial,0,Taxi (TXI),Mexico,Mexico,Domestic,127.0,Accident,repairable-damage,2021,3,18
117,2021-09-23,Airbus A321,Hawaiian Airlines,Honolulu-Dan...,Substantial,0,Landing (LDG),United States of America,United States of America,Domestic,134.0,Accident,repairable-damage,2021,9,23
165,2020-01-08,Boeing 737,Ukraine International Airlines,near Sabashahr,Destroyed,176,En route (ENR),Iran,Ukraine,International,176.0,"Criminal occurrence (sabotage, shoot down)",hull-loss,2020,1,8


In [26]:
# Assign a sequential numeric ID to every distinct incident date

# Order the records chronologically so that IDs grow with the date
df_sorted = df.sort_values(by='Incident_Date')

# Number the distinct dates starting from 1
unique_sorted_dates = df_sorted['Incident_Date'].unique()
date_numeric_id_mapping = {
    incident_date: date_id
    for date_id, incident_date in enumerate(unique_sorted_dates, start=1)
}

# Look up the ID of each record's date
df_sorted['Date_ID'] = df_sorted['Incident_Date'].map(date_numeric_id_mapping)

# Continue with the sorted frame (both names refer to the same object)
df = df_sorted

# Remove the 'Incident_Date' column from that frame
df.drop(columns=['Incident_Date'], inplace=True)

# Preview the first rows with the new Date_ID
df.head()

Unnamed: 0,Aircaft_Model,Aircaft_Operator,Incident_Location,Aircaft_Damage_Type,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type,Occupants,Incident_Type,Incident_Damage,Incident_Year,Incident_Month,Incident_Day,Date_ID
12069,Boeing 737,United Airlines,Philadelphia...,Damaged beyond repair,0,Takeoff (TOF),United States of America,United States of America,Domestic,61.0,Accident,hull-loss,1970,7,19,1
11571,Boeing 737,United Airlines,near Chicago-Midw...,Damaged beyond repair,45,Approach (APR),United States of America,United States of America,Domestic,61.0,Accident,hull-loss,1972,12,8,2
11073,Boeing 737,Indian Airlines,near Delhi-Indira...,Destroyed,48,Approach (APR),India,India,Domestic,65.0,Accident,hull-loss,1973,5,31,3
11200,Boeing 737,Piedmont Airlines,Greensboro/H...,Substantial,0,Landing (LDG),United States of America,United States of America,Domestic,96.0,Accident,repairable-damage,1973,10,28,4
10485,Boeing 737,Indian Airlines,Bangalore-Hi...,Substantial,0,Landing (LDG),India,India,Domestic,81.0,Accident,repairable-damage,1975,2,18,5


In [27]:
# Attach an Aircraft_Code to each Aircaft_Model, referencing the
# aircrafts_data in the airlines dataset

model_to_code = {
    'Boeing 777':          '773',
    'Boeing 767':          '763',
    'Sukhoi Superjet 100': 'SU9',
    'Airbus A320':         '320',
    'Airbus A321':         '321',
    'Airbus A319':         '319',
    'Boeing 737':          '733',
    'Cessna 208 Caravan':  'CN1',
    'Bombardier CRJ':      'CR2',
}

# Models that are not keys of the table receive a missing value
aircraft_codes = df['Aircaft_Model'].map(model_to_code)
df['Aircraft_Code'] = aircraft_codes

df.head()

Unnamed: 0,Aircaft_Model,Aircaft_Operator,Incident_Location,Aircaft_Damage_Type,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type,Occupants,Incident_Type,Incident_Damage,Incident_Year,Incident_Month,Incident_Day,Date_ID,Aircraft_Code
12069,Boeing 737,United Airlines,Philadelphia...,Damaged beyond repair,0,Takeoff (TOF),United States of America,United States of America,Domestic,61.0,Accident,hull-loss,1970,7,19,1,733
11571,Boeing 737,United Airlines,near Chicago-Midw...,Damaged beyond repair,45,Approach (APR),United States of America,United States of America,Domestic,61.0,Accident,hull-loss,1972,12,8,2,733
11073,Boeing 737,Indian Airlines,near Delhi-Indira...,Destroyed,48,Approach (APR),India,India,Domestic,65.0,Accident,hull-loss,1973,5,31,3,733
11200,Boeing 737,Piedmont Airlines,Greensboro/H...,Substantial,0,Landing (LDG),United States of America,United States of America,Domestic,96.0,Accident,repairable-damage,1973,10,28,4,733
10485,Boeing 737,Indian Airlines,Bangalore-Hi...,Substantial,0,Landing (LDG),India,India,Domestic,81.0,Accident,repairable-damage,1975,2,18,5,733


In [28]:
# Assign a sequential numeric ID to every distinct aircraft operator

# Distinct operators, numbered from 1 in the order unique() returns them
unique_operators = df['Aircaft_Operator'].unique()
operator_numeric_id_mapping = {
    operator_name: operator_id
    for operator_id, operator_name in enumerate(unique_operators, start=1)
}

# Look up the ID of each record's operator
df['Operator_ID'] = df['Aircaft_Operator'].map(operator_numeric_id_mapping)

# Preview the first rows to verify the change
df.head()


Unnamed: 0,Aircaft_Model,Aircaft_Operator,Incident_Location,Aircaft_Damage_Type,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type,Occupants,Incident_Type,Incident_Damage,Incident_Year,Incident_Month,Incident_Day,Date_ID,Aircraft_Code,Operator_ID
12069,Boeing 737,United Airlines,Philadelphia...,Damaged beyond repair,0,Takeoff (TOF),United States of America,United States of America,Domestic,61.0,Accident,hull-loss,1970,7,19,1,733,1
11571,Boeing 737,United Airlines,near Chicago-Midw...,Damaged beyond repair,45,Approach (APR),United States of America,United States of America,Domestic,61.0,Accident,hull-loss,1972,12,8,2,733,1
11073,Boeing 737,Indian Airlines,near Delhi-Indira...,Destroyed,48,Approach (APR),India,India,Domestic,65.0,Accident,hull-loss,1973,5,31,3,733,2
11200,Boeing 737,Piedmont Airlines,Greensboro/H...,Substantial,0,Landing (LDG),United States of America,United States of America,Domestic,96.0,Accident,repairable-damage,1973,10,28,4,733,3
10485,Boeing 737,Indian Airlines,Bangalore-Hi...,Substantial,0,Landing (LDG),India,India,Domestic,81.0,Accident,repairable-damage,1975,2,18,5,733,2


In [29]:
# Assign a sequential numeric ID to every distinct incident location

# Distinct locations, numbered from 1 in the order unique() returns them
unique_location = df['Incident_Location'].unique()
location_numeric_id_mapping = {
    location: location_id
    for location_id, location in enumerate(unique_location, start=1)
}

# Look up the ID of each record's location
df['Location_ID'] = df['Incident_Location'].map(location_numeric_id_mapping)

# Preview the first rows to verify the change
df.head()

Unnamed: 0,Aircaft_Model,Aircaft_Operator,Incident_Location,Aircaft_Damage_Type,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type,Occupants,Incident_Type,Incident_Damage,Incident_Year,Incident_Month,Incident_Day,Date_ID,Aircraft_Code,Operator_ID,Location_ID
12069,Boeing 737,United Airlines,Philadelphia...,Damaged beyond repair,0,Takeoff (TOF),United States of America,United States of America,Domestic,61.0,Accident,hull-loss,1970,7,19,1,733,1,1
11571,Boeing 737,United Airlines,near Chicago-Midw...,Damaged beyond repair,45,Approach (APR),United States of America,United States of America,Domestic,61.0,Accident,hull-loss,1972,12,8,2,733,1,2
11073,Boeing 737,Indian Airlines,near Delhi-Indira...,Destroyed,48,Approach (APR),India,India,Domestic,65.0,Accident,hull-loss,1973,5,31,3,733,2,3
11200,Boeing 737,Piedmont Airlines,Greensboro/H...,Substantial,0,Landing (LDG),United States of America,United States of America,Domestic,96.0,Accident,repairable-damage,1973,10,28,4,733,3,4
10485,Boeing 737,Indian Airlines,Bangalore-Hi...,Substantial,0,Landing (LDG),India,India,Domestic,81.0,Accident,repairable-damage,1975,2,18,5,733,2,5


In [30]:
# Assign a sequential numeric ID to every distinct Aircaft_Damage_Type

# Distinct damage types, numbered from 1 in the order unique() returns them
unique_Aircaft_Damage_Type = df['Aircaft_Damage_Type'].unique()
Aircaft_Damage_Type_numeric_id_mapping = {
    damage_type: damage_type_id
    for damage_type_id, damage_type in enumerate(unique_Aircaft_Damage_Type, start=1)
}

# Look up the ID of each record's damage type
df['Damage Type ID'] = df['Aircaft_Damage_Type'].map(Aircaft_Damage_Type_numeric_id_mapping)

# Preview the first rows to verify the change
df.head()

Unnamed: 0,Aircaft_Model,Aircaft_Operator,Incident_Location,Aircaft_Damage_Type,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type,Occupants,Incident_Type,Incident_Damage,Incident_Year,Incident_Month,Incident_Day,Date_ID,Aircraft_Code,Operator_ID,Location_ID,Damage Type ID
12069,Boeing 737,United Airlines,Philadelphia...,Damaged beyond repair,0,Takeoff (TOF),United States of America,United States of America,Domestic,61.0,Accident,hull-loss,1970,7,19,1,733,1,1,1
11571,Boeing 737,United Airlines,near Chicago-Midw...,Damaged beyond repair,45,Approach (APR),United States of America,United States of America,Domestic,61.0,Accident,hull-loss,1972,12,8,2,733,1,2,1
11073,Boeing 737,Indian Airlines,near Delhi-Indira...,Destroyed,48,Approach (APR),India,India,Domestic,65.0,Accident,hull-loss,1973,5,31,3,733,2,3,2
11200,Boeing 737,Piedmont Airlines,Greensboro/H...,Substantial,0,Landing (LDG),United States of America,United States of America,Domestic,96.0,Accident,repairable-damage,1973,10,28,4,733,3,4,3
10485,Boeing 737,Indian Airlines,Bangalore-Hi...,Substantial,0,Landing (LDG),India,India,Domestic,81.0,Accident,repairable-damage,1975,2,18,5,733,2,5,3


In [31]:
# Assign a sequential numeric ID to every distinct Aircraft_Phase

# Distinct phases, numbered from 1 in the order unique() returns them
unique_Aircraft_Phase = df['Aircraft_Phase'].unique()
Aircraft_Phase_numeric_id_mapping = {
    phase: phase_id
    for phase_id, phase in enumerate(unique_Aircraft_Phase, start=1)
}

# Look up the ID of each record's phase
df['Phase ID'] = df['Aircraft_Phase'].map(Aircraft_Phase_numeric_id_mapping)

# Preview the first rows to verify the change
df.head()

Unnamed: 0,Aircaft_Model,Aircaft_Operator,Incident_Location,Aircaft_Damage_Type,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type,Occupants,...,Incident_Damage,Incident_Year,Incident_Month,Incident_Day,Date_ID,Aircraft_Code,Operator_ID,Location_ID,Damage Type ID,Phase ID
12069,Boeing 737,United Airlines,Philadelphia...,Damaged beyond repair,0,Takeoff (TOF),United States of America,United States of America,Domestic,61.0,...,hull-loss,1970,7,19,1,733,1,1,1,1
11571,Boeing 737,United Airlines,near Chicago-Midw...,Damaged beyond repair,45,Approach (APR),United States of America,United States of America,Domestic,61.0,...,hull-loss,1972,12,8,2,733,1,2,1,2
11073,Boeing 737,Indian Airlines,near Delhi-Indira...,Destroyed,48,Approach (APR),India,India,Domestic,65.0,...,hull-loss,1973,5,31,3,733,2,3,2,2
11200,Boeing 737,Piedmont Airlines,Greensboro/H...,Substantial,0,Landing (LDG),United States of America,United States of America,Domestic,96.0,...,repairable-damage,1973,10,28,4,733,3,4,3,3
10485,Boeing 737,Indian Airlines,Bangalore-Hi...,Substantial,0,Landing (LDG),India,India,Domestic,81.0,...,repairable-damage,1975,2,18,5,733,2,5,3,3


In [32]:
# Assign a sequential numeric ID to every distinct Flight_Type

# Distinct flight types, numbered from 1 in the order unique() returns them
unique_Flight_Type = df['Flight_Type'].unique()
Flight_Type_numeric_id_mapping = {
    flight_type: flight_type_id
    for flight_type_id, flight_type in enumerate(unique_Flight_Type, start=1)
}

# Look up the ID of each record's flight type
df['Flight Type ID'] = df['Flight_Type'].map(Flight_Type_numeric_id_mapping)

# Preview the first rows to verify the change
df.head()

Unnamed: 0,Aircaft_Model,Aircaft_Operator,Incident_Location,Aircaft_Damage_Type,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type,Occupants,...,Incident_Year,Incident_Month,Incident_Day,Date_ID,Aircraft_Code,Operator_ID,Location_ID,Damage Type ID,Phase ID,Flight Type ID
12069,Boeing 737,United Airlines,Philadelphia...,Damaged beyond repair,0,Takeoff (TOF),United States of America,United States of America,Domestic,61.0,...,1970,7,19,1,733,1,1,1,1,1
11571,Boeing 737,United Airlines,near Chicago-Midw...,Damaged beyond repair,45,Approach (APR),United States of America,United States of America,Domestic,61.0,...,1972,12,8,2,733,1,2,1,2,1
11073,Boeing 737,Indian Airlines,near Delhi-Indira...,Destroyed,48,Approach (APR),India,India,Domestic,65.0,...,1973,5,31,3,733,2,3,2,2,1
11200,Boeing 737,Piedmont Airlines,Greensboro/H...,Substantial,0,Landing (LDG),United States of America,United States of America,Domestic,96.0,...,1973,10,28,4,733,3,4,3,3,1
10485,Boeing 737,Indian Airlines,Bangalore-Hi...,Substantial,0,Landing (LDG),India,India,Domestic,81.0,...,1975,2,18,5,733,2,5,3,3,1


In [33]:
# Assign a sequential numeric ID to every distinct Incident_Type

# Distinct incident types, numbered from 1 in the order unique() returns them
unique_Incident_Type = df['Incident_Type'].unique()
Incident_Type_numeric_id_mapping = {
    incident_type: incident_type_id
    for incident_type_id, incident_type in enumerate(unique_Incident_Type, start=1)
}

# Look up the ID of each record's incident type
df['Incident Type ID'] = df['Incident_Type'].map(Incident_Type_numeric_id_mapping)

# Preview the first rows to verify the change
df.head()

Unnamed: 0,Aircaft_Model,Aircaft_Operator,Incident_Location,Aircaft_Damage_Type,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type,Occupants,...,Incident_Month,Incident_Day,Date_ID,Aircraft_Code,Operator_ID,Location_ID,Damage Type ID,Phase ID,Flight Type ID,Incident Type ID
12069,Boeing 737,United Airlines,Philadelphia...,Damaged beyond repair,0,Takeoff (TOF),United States of America,United States of America,Domestic,61.0,...,7,19,1,733,1,1,1,1,1,1
11571,Boeing 737,United Airlines,near Chicago-Midw...,Damaged beyond repair,45,Approach (APR),United States of America,United States of America,Domestic,61.0,...,12,8,2,733,1,2,1,2,1,1
11073,Boeing 737,Indian Airlines,near Delhi-Indira...,Destroyed,48,Approach (APR),India,India,Domestic,65.0,...,5,31,3,733,2,3,2,2,1,1
11200,Boeing 737,Piedmont Airlines,Greensboro/H...,Substantial,0,Landing (LDG),United States of America,United States of America,Domestic,96.0,...,10,28,4,733,3,4,3,3,1,1
10485,Boeing 737,Indian Airlines,Bangalore-Hi...,Substantial,0,Landing (LDG),India,India,Domestic,81.0,...,2,18,5,733,2,5,3,3,1,1


In [34]:
# Give every distinct Incident_Damage value a 1-based integer ID, numbered in
# the order in which pandas' unique() returns the values.
unique_Incident_Damage = df['Incident_Damage'].unique()
Incident_Damage_numeric_id_mapping = {
    incident_damage: damage_id
    for damage_id, incident_damage in enumerate(unique_Incident_Damage, start=1)
}

# Store each row's Incident_Damage ID in a new 'Incident Damage ID' column
df['Incident Damage ID'] = df['Incident_Damage'].map(Incident_Damage_numeric_id_mapping)

# Preview the dataframe to check the new column
df.head()

Unnamed: 0,Aircaft_Model,Aircaft_Operator,Incident_Location,Aircaft_Damage_Type,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type,Occupants,...,Incident_Day,Date_ID,Aircraft_Code,Operator_ID,Location_ID,Damage Type ID,Phase ID,Flight Type ID,Incident Type ID,Incident Damage ID
12069,Boeing 737,United Airlines,Philadelphia...,Damaged beyond repair,0,Takeoff (TOF),United States of America,United States of America,Domestic,61.0,...,19,1,733,1,1,1,1,1,1,1
11571,Boeing 737,United Airlines,near Chicago-Midw...,Damaged beyond repair,45,Approach (APR),United States of America,United States of America,Domestic,61.0,...,8,2,733,1,2,1,2,1,1,1
11073,Boeing 737,Indian Airlines,near Delhi-Indira...,Destroyed,48,Approach (APR),India,India,Domestic,65.0,...,31,3,733,2,3,2,2,1,1,1
11200,Boeing 737,Piedmont Airlines,Greensboro/H...,Substantial,0,Landing (LDG),United States of America,United States of America,Domestic,96.0,...,28,4,733,3,4,3,3,1,1,2
10485,Boeing 737,Indian Airlines,Bangalore-Hi...,Substantial,0,Landing (LDG),India,India,Domestic,81.0,...,18,5,733,2,5,3,3,1,1,2


In [35]:
# Build a country lookup table from every value that appears in either the
# 'Departure_Country' or the 'Destination_Country' column.

# Stack the two columns on top of each other and keep the first occurrence of
# each country
combined_countries = pd.concat(
    [df['Departure_Country'], df['Destination_Country']]
).drop_duplicates()

# Turn the unique countries into a single-column dataframe with a fresh
# 0..n-1 index
df_countries = combined_countries.to_frame(name='Country').reset_index(drop=True)

# Put a 1-based 'ID' column in front of 'Country'
df_countries.insert(0, 'ID', df_countries.index + 1)

# Preview the lookup table
df_countries.head()


Unnamed: 0,ID,Country
0,1,United States of America
1,2,India
2,3,Spain
3,4,Malaysia
4,5,Canada


In [36]:
# Dictionary from country name to its 'ID' in the df_countries lookup table
country_index_mapping = dict(zip(df_countries['Country'], df_countries['ID']))

# Translate the departure and destination country names into their IDs
df['Departure_Country_ID'] = df['Departure_Country'].map(country_index_mapping)
df['Destination_Country_ID'] = df['Destination_Country'].map(country_index_mapping)

# Preview the dataframe to check the two new columns
df.head()


Unnamed: 0,Aircaft_Model,Aircaft_Operator,Incident_Location,Aircaft_Damage_Type,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type,Occupants,...,Aircraft_Code,Operator_ID,Location_ID,Damage Type ID,Phase ID,Flight Type ID,Incident Type ID,Incident Damage ID,Departure_Country_ID,Destination_Country_ID
12069,Boeing 737,United Airlines,Philadelphia...,Damaged beyond repair,0,Takeoff (TOF),United States of America,United States of America,Domestic,61.0,...,733,1,1,1,1,1,1,1,1,1
11571,Boeing 737,United Airlines,near Chicago-Midw...,Damaged beyond repair,45,Approach (APR),United States of America,United States of America,Domestic,61.0,...,733,1,2,1,2,1,1,1,1,1
11073,Boeing 737,Indian Airlines,near Delhi-Indira...,Destroyed,48,Approach (APR),India,India,Domestic,65.0,...,733,2,3,2,2,1,1,1,2,2
11200,Boeing 737,Piedmont Airlines,Greensboro/H...,Substantial,0,Landing (LDG),United States of America,United States of America,Domestic,96.0,...,733,3,4,3,3,1,1,2,1,1
10485,Boeing 737,Indian Airlines,Bangalore-Hi...,Substantial,0,Landing (LDG),India,India,Domestic,81.0,...,733,2,5,3,3,1,1,2,2,2


In [37]:
# Operator lookup table: one row per distinct (Operator_ID, Aircaft_Operator)
# pair, with the ID as the first column.
df_Aircaft_Operator = df[['Operator_ID', 'Aircaft_Operator']].drop_duplicates()

# Export the lookup table without the dataframe index
df_Aircaft_Operator.to_csv('Aircraft Operator Dataset.csv', index=False)

# Preview the lookup table
df_Aircaft_Operator.head()

Unnamed: 0,Operator_ID,Aircaft_Operator
12069,1,United Airlines
11073,2,Indian Airlines
11200,3,Piedmont Airlines
10521,4,Western Air Lines
10157,5,Lufthansa


In [38]:
# Location lookup table: one row per distinct (Location_ID, Incident_Location)
# pair, with the ID as the first column.
df_Incident_Location = df[['Location_ID', 'Incident_Location']].drop_duplicates()

# Export the lookup table without the dataframe index
df_Incident_Location.to_csv('Incident Location Dataset.csv', index=False)

# Preview the lookup table
df_Incident_Location.head()

Unnamed: 0,Location_ID,Incident_Location
12069,1,Philadelphia...
11571,2,near Chicago-Midw...
11073,3,near Delhi-Indira...
11200,4,Greensboro/H...
10485,5,Bangalore-Hi...


In [39]:
# Damage type lookup table: one row per distinct
# (Damage Type ID, Aircaft_Damage_Type) pair, with the ID as the first column.
df_Aircaft_Damage_Type = df[['Damage Type ID', 'Aircaft_Damage_Type']].drop_duplicates()

# Export the lookup table without the dataframe index.
# NOTE(review): the file name is spelled 'Aircaft' (like the source column),
# unlike 'Aircraft Operator Dataset.csv'; kept as-is in case something
# downstream reads this exact name -- confirm before renaming.
df_Aircaft_Damage_Type.to_csv('Aircaft Damage Type Dataset.csv', index=False)

# Preview the lookup table
df_Aircaft_Damage_Type.head()

Unnamed: 0,Damage Type ID,Aircaft_Damage_Type
12069,1,Damaged beyond repair
11073,2,Destroyed
11200,3,Substantial
10157,4,Minor


In [40]:
# Flight phase lookup table: one row per distinct (Phase ID, Aircraft_Phase)
# pair, with the ID as the first column.
df_Aircraft_Phase = df[['Phase ID', 'Aircraft_Phase']].drop_duplicates()

# Export the lookup table without the dataframe index
df_Aircraft_Phase.to_csv('Aircraft Phase Dataset.csv', index=False)

# Preview the lookup table
df_Aircraft_Phase.head()

Unnamed: 0,Phase ID,Aircraft_Phase
12069,1,Takeoff (TOF)
11571,2,Approach (APR)
11200,3,Landing (LDG)
10157,4,En route (ENR)
9105,5,Standing (STD)


In [41]:
# Flight type lookup table: one row per distinct (Flight Type ID, Flight_Type)
# pair, with the ID as the first column.
df_Flight_Type = df[['Flight Type ID', 'Flight_Type']].drop_duplicates()

# Export the lookup table without the dataframe index
df_Flight_Type.to_csv('Flight Type Dataset.csv', index=False)

# Preview the lookup table
df_Flight_Type.head()

Unnamed: 0,Flight Type ID,Flight_Type
12069,1,Domestic
10157,2,International


In [42]:
# Incident type lookup table: one row per distinct
# (Incident Type ID, Incident_Type) pair, with the ID as the first column.
df_Incident_Type = df[['Incident Type ID', 'Incident_Type']].drop_duplicates()

# Export the lookup table without the dataframe index
df_Incident_Type.to_csv('Incident Type Dataset.csv', index=False)

# Preview the lookup table
df_Incident_Type.head()

Unnamed: 0,Incident Type ID,Incident_Type
12069,1,Accident
10157,2,Hijacking
9478,3,"Criminal occurrence (sabotage, shoot down)"
1356,4,Incident


In [43]:
# Incident damage lookup table: one row per distinct
# (Incident Damage ID, Incident_Damage) pair, with the ID as the first column.
df_Incident_Damage = df[['Incident Damage ID', 'Incident_Damage']].drop_duplicates()

# Export the lookup table without the dataframe index
df_Incident_Damage.to_csv('Incident Damage Dataset.csv', index=False)

# Preview the lookup table
df_Incident_Damage.head()

Unnamed: 0,Incident Damage ID,Incident_Damage
12069,1,hull-loss
11200,2,repairable-damage


In [44]:
# Date lookup table: one row per distinct combination of Date_ID and its
# year / month / day parts.
df_Date = df.loc[
    :, ['Date_ID', 'Incident_Year', 'Incident_Month', 'Incident_Day']
].drop_duplicates()

# Export the lookup table without the dataframe index
df_Date.to_csv('Date Dataset.csv', index=False)

# Preview the lookup table
df_Date.head()

Unnamed: 0,Date_ID,Incident_Year,Incident_Month,Incident_Day
12069,1,1970,7,19
11571,2,1972,12,8
11073,3,1973,5,31
11200,4,1973,10,28
10485,5,1975,2,18


In [45]:
# Number the rows 1..len(df) in their current order and use that same sequence
# as both the 'Incident Aircraft ID' and the 'Incident ID' of each row.
df['Incident Aircraft ID'] = df['Incident ID'] = range(1, len(df) + 1)

# Preview the dataframe to check the two new ID columns
df.head()

Unnamed: 0,Aircaft_Model,Aircaft_Operator,Incident_Location,Aircaft_Damage_Type,Fatalities,Aircraft_Phase,Departure_Country,Destination_Country,Flight_Type,Occupants,...,Location_ID,Damage Type ID,Phase ID,Flight Type ID,Incident Type ID,Incident Damage ID,Departure_Country_ID,Destination_Country_ID,Incident Aircraft ID,Incident ID
12069,Boeing 737,United Airlines,Philadelphia...,Damaged beyond repair,0,Takeoff (TOF),United States of America,United States of America,Domestic,61.0,...,1,1,1,1,1,1,1,1,1,1
11571,Boeing 737,United Airlines,near Chicago-Midw...,Damaged beyond repair,45,Approach (APR),United States of America,United States of America,Domestic,61.0,...,2,1,2,1,1,1,1,1,2,2
11073,Boeing 737,Indian Airlines,near Delhi-Indira...,Destroyed,48,Approach (APR),India,India,Domestic,65.0,...,3,2,2,1,1,1,2,2,3,3
11200,Boeing 737,Piedmont Airlines,Greensboro/H...,Substantial,0,Landing (LDG),United States of America,United States of America,Domestic,96.0,...,4,3,3,1,1,2,1,1,4,4
10485,Boeing 737,Indian Airlines,Bangalore-Hi...,Substantial,0,Landing (LDG),India,India,Domestic,81.0,...,5,3,3,1,1,2,2,2,5,5


In [46]:
# Incident Aircraft table: the aircraft/flight attributes of each row, keyed
# by 'Incident Aircraft ID'.
df_Incident_Aircraft = df[[
    'Incident Aircraft ID',
    'Aircraft_Code',
    'Operator_ID',
    'Departure_Country_ID',
    'Destination_Country_ID',
    'Flight Type ID',
    'Occupants',
]]

# Export the table without the dataframe index
df_Incident_Aircraft.to_csv('Incident Aircraft.csv', index=False)

# Incident table: the incident attributes of each row, keyed by 'Incident ID'
# and carrying 'Incident Aircraft ID' alongside the other ID columns.
df_Incident = df[[
    'Incident ID',
    'Location_ID',
    'Damage Type ID',
    'Phase ID',
    'Incident Type ID',
    'Incident Damage ID',
    'Incident Aircraft ID',
    'Date_ID',
    'Fatalities',
]]

# Export the table without the dataframe index
df_Incident.to_csv('Incident.csv', index=False)

# Preview the Incident table. A notebook cell renders only its last
# expression, so the former bare `df_Incident_Aircraft.head()` placed before
# this line produced no output and has been removed; df_Incident_Aircraft is
# previewed in the following cell.
df_Incident.head()

Unnamed: 0,Incident ID,Location_ID,Damage Type ID,Phase ID,Incident Type ID,Incident Damage ID,Incident Aircraft ID,Date_ID,Fatalities
12069,1,1,1,1,1,1,1,1,0
11571,2,2,1,2,1,1,2,2,45
11073,3,3,2,2,1,1,3,3,48
11200,4,4,3,3,1,2,4,4,0
10485,5,5,3,3,1,2,5,5,0


In [47]:
# Preview the first five rows of the Incident Aircraft table
df_Incident_Aircraft.head(n=5)

Unnamed: 0,Incident Aircraft ID,Aircraft_Code,Operator_ID,Departure_Country_ID,Destination_Country_ID,Flight Type ID,Occupants
12069,1,733,1,1,1,1,61.0
11571,2,733,1,1,1,1,61.0
11073,3,733,2,2,2,1,65.0
11200,4,733,3,1,1,1,96.0
10485,5,733,2,2,2,1,81.0


In [48]:
# The country lookup table was not written to CSV when it was built, so
# export it here (without the dataframe index).
df_countries.to_csv(path_or_buf='Country.csv', index=False)

# Preview the exported table
df_countries.head(n=5)

Unnamed: 0,ID,Country
0,1,United States of America
1,2,India
2,3,Spain
3,4,Malaysia
4,5,Canada
