In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Enhancing default style of Matplotlib and Seaborn
plt.style.use('ggplot')
sns.set(style="whitegrid")

In [12]:
# Import the dataset
data = pd.read_csv("Airplane_Crashes_and_Fatalities_Since_1908_t0_2023.csv",encoding="latin-1")

# Initial exploration of data
initial_affichage = data.head()


In [13]:
initial_affichage

Unnamed: 0,Date,Time,Location,Operator,Flight #,Route,AC Type,Registration,cn/ln,Aboard,Aboard Passangers,Aboard Crew,Fatalities,Fatalities Passangers,Fatalities Crew,Ground,Summary
0,9/17/1908,17:18,"Fort Myer, Virginia",Military - U.S. Army,,Demonstration,Wright Flyer III,,1.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,"During a demonstration flight, a U.S. Army fly..."
1,9/7/1909,,"Juvisy-sur-Orge, France",,,Air show,Wright Byplane,SC1,,1.0,0.0,1.0,1.0,0.0,0.0,0.0,Eugene Lefebvre was the first pilot to ever be...
2,7/12/1912,6:30,"Atlantic City, New Jersey",Military - U.S. Navy,,Test flight,Dirigible,,,5.0,0.0,5.0,5.0,0.0,5.0,0.0,First U.S. dirigible Akron exploded just offsh...
3,8/6/1913,,"Victoria, British Columbia, Canada",Private,,,Curtiss seaplane,,,1.0,0.0,1.0,1.0,0.0,1.0,0.0,The first fatal airplane accident in Canada oc...
4,9/9/1913,18:30,Over the North Sea,Military - German Navy,,,Zeppelin L-1 (airship),,,20.0,,,14.0,,,0.0,The airship flew into a thunderstorm and encou...


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4998 entries, 0 to 4997
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Date                   4998 non-null   object 
 1   Time                   3486 non-null   object 
 2   Location               4994 non-null   object 
 3   Operator               4988 non-null   object 
 4   Flight #               1329 non-null   object 
 5   Route                  4221 non-null   object 
 6   AC Type                4983 non-null   object 
 7   Registration           4724 non-null   object 
 8   cn/ln                  4330 non-null   object 
 9   Aboard                 4980 non-null   float64
 10  Aboard Passangers      4769 non-null   float64
 11  Aboard Crew            4772 non-null   float64
 12  Fatalities             4990 non-null   float64
 13  Fatalities Passangers  4756 non-null   float64
 14  Fatalities Crew        4757 non-null   float64
 15  Grou

In [15]:
# Imputing missing values based on data distribution, data type, and relevance
# For numerical columns
for col in data.select_dtypes(include=['float64', 'int64']).columns:
    data[col].fillna(data[col].median(), inplace=True)
# For categorical columns
for col in data.select_dtypes(include=['object']).columns:
    data[col].fillna(data[col].mode()[0], inplace=True)

# Converting relevant columns to appropriate data types
# Example: data['column_name'] = pd.to_numeric(data['column_name'], errors='coerce')

In [16]:
data.head()

Unnamed: 0,Date,Time,Location,Operator,Flight #,Route,AC Type,Registration,cn/ln,Aboard,Aboard Passangers,Aboard Crew,Fatalities,Fatalities Passangers,Fatalities Crew,Ground,Summary
0,9/17/1908,17:18,"Fort Myer, Virginia",Military - U.S. Army,-,Demonstration,Wright Flyer III,19,1,2.0,1.0,1.0,1.0,1.0,0.0,0.0,"During a demonstration flight, a U.S. Army fly..."
1,9/7/1909,15:00,"Juvisy-sur-Orge, France",Aeroflot,-,Air show,Wright Byplane,SC1,1,1.0,0.0,1.0,1.0,0.0,0.0,0.0,Eugene Lefebvre was the first pilot to ever be...
2,7/12/1912,6:30,"Atlantic City, New Jersey",Military - U.S. Navy,-,Test flight,Dirigible,19,1,5.0,0.0,5.0,5.0,0.0,5.0,0.0,First U.S. dirigible Akron exploded just offsh...
3,8/6/1913,15:00,"Victoria, British Columbia, Canada",Private,-,Training,Curtiss seaplane,19,1,1.0,0.0,1.0,1.0,0.0,1.0,0.0,The first fatal airplane accident in Canada oc...
4,9/9/1913,18:30,Over the North Sea,Military - German Navy,-,Training,Zeppelin L-1 (airship),19,1,20.0,12.0,4.0,14.0,8.0,3.0,0.0,The airship flew into a thunderstorm and encou...


In [17]:
detailed = data.describe(include='all')
detailed

Unnamed: 0,Date,Time,Location,Operator,Flight #,Route,AC Type,Registration,cn/ln,Aboard,Aboard Passangers,Aboard Crew,Fatalities,Fatalities Passangers,Fatalities Crew,Ground,Summary
count,4998,4998,4998,4998,4998,4998,4998,4998.0,4998.0,4998.0,4998.0,4998.0,4998.0,4998.0,4998.0,4998.0,4998
unique,4570,1060,4122,2264,880,3825,2463,4689.0,3818.0,,,,,,,,4839
top,8/31/1988,15:00,"Moscow, Russia",Aeroflot,-,Training,Douglas DC-3,19.0,1.0,,,,,,,,Crashed under unknown circumstances.
freq,4,1550,21,265,3705,870,348,277.0,682.0,,,,,,,,73
mean,,,,,,,,,,31.141257,26.321329,4.457583,22.355342,18.518607,3.55062,1.704682,
std,,,,,,,,,,45.461818,43.203147,3.405622,35.036614,33.393452,3.093069,55.306594,
min,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
25%,,,,,,,,,,7.0,3.0,2.0,4.0,1.0,2.0,0.0,
50%,,,,,,,,,,16.0,12.0,4.0,11.0,8.0,3.0,0.0,
75%,,,,,,,,,,35.0,29.0,5.0,25.0,20.0,5.0,0.0,


In [None]:
# Conversion automatique des dates (détecte les colonnes contenant "date")
for col in data.columns:
    if 'date' in col:
        data[col] = pd.to_datetime(data[col], errors='coerce')

# Conversion des nombres stockés sous forme de texte en float
for col in data.select_dtypes(include=['object']).columns:
    try:
        data[col] = pd.to_numeric(data[col], errors='coerce')
    except:
        pass  # Ignore les colonnes non convertibles

for col in category_cols:
data[col] = data[col].astype('category')

print(data.dtypes)

Date                     float64
Time                     float64
Location                 float64
Operator                 float64
Flight #                 float64
Route                    float64
AC Type                  float64
Registration             float64
cn/ln                    float64
Aboard                   float64
Aboard Passangers        float64
Aboard Crew              float64
Fatalities               float64
Fatalities Passangers    float64
Fatalities Crew          float64
Ground                   float64
Summary                  float64
dtype: object
