In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

In [2]:
pre_df = pd.read_csv('dane/AviationData.csv', encoding='ISO-8859-1', dtype={6: str, 7: str, 28: str})
df = pre_df[pre_df['Country'] == 'United States']


In [3]:
print(df.head())


         Event.Id Investigation.Type Accident.Number  Event.Date  \
0  20001218X45444           Accident      SEA87LA080  1948-10-24   
1  20001218X45447           Accident      LAX94LA336  1962-07-19   
2  20061025X01555           Accident      NYC07LA005  1974-08-30   
3  20001218X45448           Accident      LAX96LA321  1977-06-19   
4  20041105X01764           Accident      CHI79FA064  1979-08-02   

          Location        Country   Latitude   Longitude Airport.Code  \
0  MOOSE CREEK, ID  United States        NaN         NaN          NaN   
1   BRIDGEPORT, CA  United States        NaN         NaN          NaN   
2    Saltville, VA  United States  36.922223  -81.878056          NaN   
3       EUREKA, CA  United States        NaN         NaN          NaN   
4       Canton, OH  United States        NaN         NaN          NaN   

  Airport.Name  ... Purpose.of.flight Air.carrier Total.Fatal.Injuries  \
0          NaN  ...          Personal         NaN                  2.0   
1   

In [4]:
# sprawdzanie wartości w kolumnach
for column in df.head() :
    print(f"{column}: {df[column].unique()}")

Event.Id: ['20001218X45444' '20001218X45447' '20061025X01555' ... '20221227106497'
 '20221227106498' '20221230106513']
Investigation.Type: ['Accident' 'Incident']
Accident.Number: ['SEA87LA080' 'LAX94LA336' 'NYC07LA005' ... 'WPR23LA075' 'WPR23LA076'
 'ERA23LA097']
Event.Date: ['1948-10-24' '1962-07-19' '1974-08-30' ... '2022-12-21' '2022-12-26'
 '2022-12-29']
Location: ['MOOSE CREEK, ID' 'BRIDGEPORT, CA' 'Saltville, VA' ... 'Kellyton, AL'
 'San Manual, AZ' 'Auburn Hills, MI']
Country: ['United States']
Latitude: [nan '36.922223' '42.445277' ... '321814N' '039101N' '373829N']
Longitude: [nan '-81.878056' '-70.758333' ... '1114536W' '0835218W' '0121410W']
Airport.Code: [nan 'N58' 'JAX' ... 'KTOP' '8F6' 'KSRC']
Airport.Name: [nan 'BLACKBURN AG STRIP' 'HANOVER' ... 'HAWKINSVILLE-PULASKI COUNTY'
 'Lewiston Municipal Airport' 'WICHITA DWIGHT D EISENHOWER NT']
Injury.Severity: ['Fatal(2)' 'Fatal(4)' 'Fatal(3)' 'Fatal(1)' 'Non-Fatal' 'Incident'
 'Fatal(8)' 'Fatal(78)' 'Fatal(7)' 'Fatal(6)' 'Fa

In [6]:
df.columns

Index(['Event.Id', 'Investigation.Type', 'Accident.Number', 'Event.Date',
       'Location', 'Country', 'Latitude', 'Longitude', 'Airport.Code',
       'Airport.Name', 'Injury.Severity', 'Aircraft.damage',
       'Aircraft.Category', 'Registration.Number', 'Make', 'Model',
       'Amateur.Built', 'Number.of.Engines', 'Engine.Type', 'FAR.Description',
       'Schedule', 'Purpose.of.flight', 'Air.carrier', 'Total.Fatal.Injuries',
       'Total.Serious.Injuries', 'Total.Minor.Injuries', 'Total.Uninjured',
       'Weather.Condition', 'Broad.phase.of.flight', 'Report.Status',
       'Publication.Date'],
      dtype='object')

In [5]:
for k in df.keys():
    tmpNum = df.columns.get_loc(k)
    print(tmpNum,k)

0 Event.Id
1 Investigation.Type
2 Accident.Number
3 Event.Date
4 Location
5 Country
6 Latitude
7 Longitude
8 Airport.Code
9 Airport.Name
10 Injury.Severity
11 Aircraft.damage
12 Aircraft.Category
13 Registration.Number
14 Make
15 Model
16 Amateur.Built
17 Number.of.Engines
18 Engine.Type
19 FAR.Description
20 Schedule
21 Purpose.of.flight
22 Air.carrier
23 Total.Fatal.Injuries
24 Total.Serious.Injuries
25 Total.Minor.Injuries
26 Total.Uninjured
27 Weather.Condition
28 Broad.phase.of.flight
29 Report.Status
30 Publication.Date


In [6]:
print("How many nulls: ", df.isnull().sum().sum())
df.isnull().sum()

How many nulls:  496081


Event.Id                      0
Investigation.Type            0
Accident.Number               0
Event.Date                    0
Location                     11
Country                       0
Latitude                  49983
Longitude                 49993
Airport.Code              33176
Airport.Name              30678
Injury.Severity             108
Aircraft.damage            1979
Aircraft.Category         54094
Registration.Number         116
Make                         21
Model                        38
Amateur.Built                21
Number.of.Engines          1875
Engine.Type                3042
FAR.Description           54023
Schedule                  71951
Purpose.of.flight          2429
Air.carrier               67753
Total.Fatal.Injuries      10654
Total.Serious.Injuries    11375
Total.Minor.Injuries      10729
Total.Uninjured            5005
Weather.Condition           645
Broad.phase.of.flight     21090
Report.Status              2611
Publication.Date          12681
dtype: i

In [7]:
df.drop(columns=["Latitude", "Registration.Number", "Make","Longitude", "Schedule", "FAR.Description", "Air.carrier", "Airport.Code", "Airport.Name", "Aircraft.Category", "Event.Id", "Accident.Number"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=["Latitude", "Registration.Number", "Make","Longitude", "Schedule", "FAR.Description", "Air.carrier", "Airport.Code", "Airport.Name", "Aircraft.Category", "Event.Id", "Accident.Number"], inplace=True)


In [8]:
for k in df.keys():
    tmpNum = df.columns.get_loc(k)
    print(tmpNum,k)

0 Investigation.Type
1 Event.Date
2 Location
3 Country
4 Injury.Severity
5 Aircraft.damage
6 Model
7 Amateur.Built
8 Number.of.Engines
9 Engine.Type
10 Purpose.of.flight
11 Total.Fatal.Injuries
12 Total.Serious.Injuries
13 Total.Minor.Injuries
14 Total.Uninjured
15 Weather.Condition
16 Broad.phase.of.flight
17 Report.Status
18 Publication.Date


In [9]:
# sprawdzanie wartości w kolumnach
for column in df.head() :
    print(f"{column}: {df[column].unique()}")

Investigation.Type: ['Accident' 'Incident']
Event.Date: ['1948-10-24' '1962-07-19' '1974-08-30' ... '2022-12-21' '2022-12-26'
 '2022-12-29']
Location: ['MOOSE CREEK, ID' 'BRIDGEPORT, CA' 'Saltville, VA' ... 'Kellyton, AL'
 'San Manual, AZ' 'Auburn Hills, MI']
Country: ['United States']
Injury.Severity: ['Fatal(2)' 'Fatal(4)' 'Fatal(3)' 'Fatal(1)' 'Non-Fatal' 'Incident'
 'Fatal(8)' 'Fatal(78)' 'Fatal(7)' 'Fatal(6)' 'Fatal(5)' 'Fatal(153)'
 'Fatal(12)' 'Fatal(14)' 'Fatal(23)' 'Fatal(10)' 'Fatal(11)' 'Fatal(17)'
 'Fatal(13)' 'Fatal(70)' 'Fatal(9)' 'Unavailable' 'Fatal(135)' 'Fatal(31)'
 'Fatal(25)' 'Fatal(82)' 'Fatal(156)' 'Fatal(28)' 'Fatal(18)' 'Fatal(43)'
 'Fatal(111)' 'Fatal(20)' 'Fatal(73)' 'Fatal(34)' 'Fatal(27)' 'Fatal(16)'
 'Fatal(37)' 'Fatal(132)' 'Fatal(68)' 'Fatal(15)' 'Fatal(110)'
 'Fatal(230)' 'Fatal(29)' 'Fatal(228)' 'Fatal(88)' 'Fatal(19)' 'Fatal(44)'
 'Fatal(64)' 'Fatal(65)' 'Fatal(92)' 'Fatal(265)' 'Fatal(21)' 'Fatal(49)'
 'Fatal' nan 'Minor' 'Serious']
Aircraft.damage: [

In [None]:
dataset = df

numerical_features = ["Total.Fatal.Injuries", "Total.Serious.Injuries", "Total.Minor.Injuries", "Total.Uninjured"]
binary_features = ["Investigation.Type"]
categorical_features = ["Aircraft.damage", "Model", "Amateur.Built", "Number.of.Engines", "Engine.Type", "Purpose.of.flight", "Broad.phase.of.flight", "Weather.Condition"]


numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

binary_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent'))
])

full_processor = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, numerical_features),
    ('category', categorical_pipeline, categorical_features),
    ('binary', binary_pipeline, binary_features)
],
remainder = 'passthrough') # 'drop'

full_processor.fit(dataset)

data_set_mod = pd.DataFrame(full_processor.transform(dataset), columns=full_processor.get_feature_names_out())
data_set_mod