In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
%matplotlib inline

In [2]:
df = pd.read_csv("data/AviationData.csv", encoding = "latin-1", low_memory = False)

df["Aircraft.Category"].value_counts()

Aircraft.Category
Airplane             27617
Helicopter            3440
Glider                 508
Balloon                231
Gyrocraft              173
Weight-Shift           161
Powered Parachute       91
Ultralight              30
Unknown                 14
WSFT                     9
Powered-Lift             5
Blimp                    4
UNK                      2
Rocket                   1
ULTR                     1
Name: count, dtype: int64

In [3]:
df.shape

(88889, 31)

In [4]:
airplane_df = df[df["Aircraft.Category"] == "Airplane"]

In [5]:
# Confirming Aircraft that are not Airplanes are removed
airplane_df.shape

(27617, 31)

In [6]:
list(airplane_df.columns)

['Event.Id',
 'Investigation.Type',
 'Accident.Number',
 'Event.Date',
 'Location',
 'Country',
 'Latitude',
 'Longitude',
 'Airport.Code',
 'Airport.Name',
 'Injury.Severity',
 'Aircraft.damage',
 'Aircraft.Category',
 'Registration.Number',
 'Make',
 'Model',
 'Amateur.Built',
 'Number.of.Engines',
 'Engine.Type',
 'FAR.Description',
 'Schedule',
 'Purpose.of.flight',
 'Air.carrier',
 'Total.Fatal.Injuries',
 'Total.Serious.Injuries',
 'Total.Minor.Injuries',
 'Total.Uninjured',
 'Weather.Condition',
 'Broad.phase.of.flight',
 'Report.Status',
 'Publication.Date']

In [7]:
airplane_df["Make"].value_counts()

Make
CESSNA             4867
Cessna             3608
PIPER              2805
Piper              1910
BOEING             1037
                   ... 
GLINES                1
RAMMEL THOMAS W       1
HEMMER                1
W.H. Hunnicutt        1
ORLICAN S R O         1
Name: count, Length: 3874, dtype: int64

In [8]:
airplane_df["Make"] = airplane_df["Make"].str.upper()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airplane_df["Make"] = airplane_df["Make"].str.upper()


In [9]:
airplane_df["Make"].value_counts()

Make
CESSNA             8475
PIPER              4715
BEECH              1692
BOEING             1324
MOONEY              419
                   ... 
RAMMEL THOMAS W       1
HEMMER                1
W.H. HUNNICUTT        1
CARR BRYAN            1
ORLICAN S R O         1
Name: count, Length: 3537, dtype: int64

In [10]:
# Cessna
airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["CESSNA AIRCRAFT CO", "CESSNA AIRCRAFT COMPANY", "CESSNA AIRCRAFT", "CESSNA ECTOR", "CESSNA SKYHAWK II", "CESSNA AIRCRAFT CO"], value="CESSNA")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["CESSNA AIRCRAFT CO", "CESSNA AIRCRAFT COMPANY", "CESSNA AIRCRAFT", "CESSNA ECTOR", "CESSNA SKYHAWK II", "CESSNA AIRCRAFT CO"], value="CESSNA")


In [11]:
# Piper
airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["PIPER AIRCRAFT INC", "PIPER AIRCRAFT CORPORATION", "PIPER AIRCRAFT", "NEW PIPER AIRCRAFT INC", "'PIPER AIRCRAFT, INC.'", "NEW PIPER", "PIPER AEROSTAR", "PIPER-AEROSTAR", "PIPER PAWNEE", "JETPROP DLX PIPER"], value="PIPER")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["PIPER AIRCRAFT INC", "PIPER AIRCRAFT CORPORATION", "PIPER AIRCRAFT", "NEW PIPER AIRCRAFT INC", "'PIPER AIRCRAFT, INC.'", "NEW PIPER", "PIPER AEROSTAR", "PIPER-AEROSTAR", "PIPER PAWNEE", "JETPROP DLX PIPER"], value="PIPER")


In [12]:
# The Beechcraft and Hawker companies merged, it is now called Beechcraft
airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["BEECHCRAFT", "HAWKER BEECHCRAFT", "HAWKER BEECHCRAFT CORP", "HAWKER BEECHCRAFT CORPORATION", "HAWKER BEECHCRAFT CORP.", "BEECH AIRCRAFT CORPORATION", "BEECH AIRCRAFT", "HAWKER BEECH", "HAWKER-BEECHCRAFT", "BEECH AIRCRAFT CO.", "BEECH AIRCRAFT CORP", "BEECHCRAFT CORPORATION", "HAWKER-BEECHCRAFT CORPORATION", "HAWKER", "HAWKER SIDDELEY", "HAWKER AIRCRAFT LTD"], value="BEECH")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["BEECHCRAFT", "HAWKER BEECHCRAFT", "HAWKER BEECHCRAFT CORP", "HAWKER BEECHCRAFT CORPORATION", "HAWKER BEECHCRAFT CORP.", "BEECH AIRCRAFT CORPORATION", "BEECH AIRCRAFT", "HAWKER BEECH", "HAWKER-BEECHCRAFT", "BEECH AIRCRAFT CO.", "BEECH AIRCRAFT CORP", "BEECHCRAFT CORPORATION", "HAWKER-BEECHCRAFT CORPORATION", "HAWKER", "HAWKER SIDDELEY", "HAWKER AIRCRAFT LTD"], value="BEECH")


In [13]:
# Stearman merged into Boeing
airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["THE BOEING COMPANY", "BOEING COMPANY", "BOEING STEARMAN", "'BOEING COMPANY, LONG BEACH DIV'", "BOEING-STEARMAN", "BOEING 777-306ER", "BOEING (STEARMAN)", "BOEING COMMERCIAL AIRPLANE GRO", "BOEING OF CANADA/DEHAV DIV", "STEARMAN", "STEARMAN AIRCRAFT"], value="BOEING")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["THE BOEING COMPANY", "BOEING COMPANY", "BOEING STEARMAN", "'BOEING COMPANY, LONG BEACH DIV'", "BOEING-STEARMAN", "BOEING 777-306ER", "BOEING (STEARMAN)", "BOEING COMMERCIAL AIRPLANE GRO", "BOEING OF CANADA/DEHAV DIV", "STEARMAN", "STEARMAN AIRCRAFT"], value="BOEING")


In [14]:
# Mooney
airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["MOONEY AIRCRAFT CORP.", "MOONEY AIRPLANE CO INC", "MOONEY INTERNATIONAL CORP", "MOONEY AIRCRAFT CORPORATION", "MOONEY AIRCRAFT CORP", "'MOONEY AIRPLANE COMPANY, INC.'"], value="MOONEY")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["MOONEY AIRCRAFT CORP.", "MOONEY AIRPLANE CO INC", "MOONEY INTERNATIONAL CORP", "MOONEY AIRCRAFT CORPORATION", "MOONEY AIRCRAFT CORP", "'MOONEY AIRPLANE COMPANY, INC.'"], value="MOONEY")


In [15]:
# Grumann and Northrop merged
airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["GRUMMAN ACFT ENG COR-SCHWEIZER", "GRUMMAN AMERICAN AVN. CORP", "GRUMMAN AMERICAN", "GRUMMAN-SCHWEIZER", "GRUMMAN AIRCRAFT ENG CORP", "GRUMMAN ACFT ENG", "GRUMMAN AMERICAN AVIATION", "GRUMMAN SCHWEIZER", "GRUMMAN AIRCRAFT COR-SCHWEIZER", "GRUMMAN AMERICAN AVN. CORP", "GRUMMAN AMERICAN CORPORATION", "GRUMMAN AIRCRAFT", "GRUMMAN ACFT ENG COR", "GRUMMAN AMERICAN AVIATION CORP", "NORTHROP"], value="GRUMMAN")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["GRUMMAN ACFT ENG COR-SCHWEIZER", "GRUMMAN AMERICAN AVN. CORP", "GRUMMAN AMERICAN", "GRUMMAN-SCHWEIZER", "GRUMMAN AIRCRAFT ENG CORP", "GRUMMAN ACFT ENG", "GRUMMAN AMERICAN AVIATION", "GRUMMAN SCHWEIZER", "GRUMMAN AIRCRAFT COR-SCHWEIZER", "GRUMMAN AMERICAN AVN. CORP", "GRUMMAN AMERICAN CORPORATION", "GRUMMAN AIRCRAFT", "GRUMMAN ACFT ENG COR", "GRUMMAN AMERICAN AVIATION CORP", "NORTHROP"], value="GRUMMAN")


In [16]:
# Airbus
airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["AIRBUS INDUSTRIE"], value="AIRBUS")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["AIRBUS INDUSTRIE"], value="AIRBUS")


In [17]:
# Maule
airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["MAULE AIRCRAFT CORP"], value="MAULE")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["MAULE AIRCRAFT CORP"], value="MAULE")


In [18]:
# Aeronca
airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["AERONCA AIRCRAFT CORPORATION", "AERONCA CHAMPION", "AERONCA CHAMP"], value="AERONCA")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airplane_df["Make"] = airplane_df["Make"].replace(to_replace=["AERONCA AIRCRAFT CORPORATION", "AERONCA CHAMPION", "AERONCA CHAMP"], value="AERONCA")


In [19]:
# Top 10
airplane_df["Make"].value_counts().head(10)

Make
CESSNA      8525
PIPER       4772
BEECH       1785
BOEING      1371
MOONEY       466
GRUMMAN      381
AIRBUS       288
BELLANCA     282
MAULE        233
AERONCA      232
Name: count, dtype: int64

In [20]:
# Remove any Make that have a value count under 232
airplane_df.groupby("Make").filter(lambda x : len(x)>=232)

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
7,20020909X01562,Accident,SEA82DA022,1982-01-01,"PULLMAN, WA",United States,,,,BLACKBURN AG STRIP,...,Personal,,0.0,0.0,0.0,2.0,VMC,Takeoff,Probable Cause,01-01-1982
8,20020909X01561,Accident,NYC82DA015,1982-01-01,"EAST HANOVER, NJ",United States,,,N58,HANOVER,...,Business,,0.0,0.0,0.0,2.0,IMC,Landing,Probable Cause,01-01-1982
12,20020917X02148,Accident,FTW82FRJ07,1982-01-02,"HOMER, LA",United States,,,,,...,Personal,,0.0,0.0,1.0,0.0,IMC,Cruise,Probable Cause,02-01-1983
13,20020917X02134,Accident,FTW82FRA14,1982-01-02,"HEARNE, TX",United States,,,T72,HEARNE MUNICIPAL,...,Personal,,1.0,0.0,0.0,0.0,IMC,Takeoff,Probable Cause,02-01-1983
15,20020917X02117,Accident,FTW82FPG08,1982-01-02,"LITTLE ROCK, AR",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,IMC,Cruise,Probable Cause,02-01-1983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88858,20221211106438,Accident,ERA23LA083,2022-12-09,"Hawkinsville, GA",United States,321814N,0832534W,51A,HAWKINSVILLE-PULASKI COUNTY,...,Personal,,0.0,1.0,0.0,0.0,VMC,,,15-12-2022
88861,20221215106460,Accident,ERA23LA088,2022-12-10,"Alabaster, AL",United States,331040N,0086470W,EET,,...,Personal,,0.0,0.0,0.0,2.0,,,,19-12-2022
88865,20221212106444,Accident,ERA23LA085,2022-12-12,"Knoxville, TN",United States,355745N,0835218W,DKX,KNOXVILLE DOWNTOWN ISLAND,...,Instructional,Knoxville Flight Training Academy,0.0,0.0,0.0,1.0,VMC,,,15-12-2022
88869,20221213106455,Accident,WPR23LA065,2022-12-13,"Lewistown, MT",United States,047257N,0109280W,KLWT,Lewiston Municipal Airport,...,,,0.0,0.0,0.0,1.0,,,,14-12-2022


In [None]:
# airplane_df.groupby("Make").count().describe()

In [None]:
# Export Make value counts into a CSV file
# airplane_df["Make"].value_counts().reset_index().to_csv("Make.csv")

In [None]:
# Trying fuzzywuzzy
# process.extract("cessna", airplane_df["Make"], limit = 10)

In [None]:
# Filtering out dates before January 01, 1998
df["Event.Date"] = pd.to_datetime(df["Event.Date"])
df[df["Event.Date"] > "1998-01-01"]