## Importação dos pacotes

In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# definir parâmetros extras
pd.set_option('precision', 4)
pd.set_option('display.max_columns', 100)

## Carga dos dados

In [3]:
# carregar arquivo de dados de treino
data = pd.read_csv('airlines-train.csv', index_col='FlightRecord')

# mostrar alguns exemplos de registros
data.head()

Unnamed: 0_level_0,FlightDate,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed,IsDepDelayed
FlightRecord,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
40108,2007-01-01,2021.0,2000,2212.0,2155,WN,149,N623SW,51.0,55.0,41.0,17.0,21.0,STL,SDF,254.0,3.0,7.0,0,,0,2.0,0.0,0.0,0.0,15.0,YES,YES
43494,2008-01-03,910.0,915,923.0,935,WN,453,N232WN,73.0,80.0,58.0,-12.0,-5.0,PHX,SNA,338.0,4.0,11.0,0,,0,,,,,,NO,NO
21603,1997-01-22,1422.0,1415,1614.0,1602,HP,2028,N311AW,112.0,107.0,93.0,12.0,7.0,DEN,PHX,602.0,7.0,12.0,0,,0,,,,,,YES,YES
18691,1996-01-27,1125.0,1100,1337.0,1315,WN,1488,N359,72.0,75.0,61.0,22.0,25.0,BUR,PHX,369.0,5.0,6.0,0,,0,,,,,,YES,YES
8022,1991-01-27,1703.0,1705,1931.0,1944,US,112,,148.0,159.0,,-13.0,-2.0,TPA,SYR,1104.0,,,0,,0,,,,,,NO,NO


In [4]:
# quantas linhas e colunas existem?
data.shape

(35182, 28)

## Análise dos dados

In [15]:
data.columns.values

array(['FlightDate', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime',
       'UniqueCarrier', 'FlightNum', 'TailNum', 'ActualElapsedTime',
       'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Origin',
       'Dest', 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled',
       'CancellationCode', 'Diverted', 'CarrierDelay', 'WeatherDelay',
       'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'IsArrDelayed',
       'IsDepDelayed'], dtype=object)

In [5]:
# quais são as colunas e respectivos tipos de dados?
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35182 entries, 40108 to 2011
Data columns (total 28 columns):
FlightDate           35182 non-null object
DepTime              34298 non-null float64
CRSDepTime           35182 non-null int64
ArrTime              34212 non-null float64
CRSArrTime           35182 non-null int64
UniqueCarrier        35182 non-null object
FlightNum            35182 non-null int64
TailNum              22346 non-null object
ActualElapsedTime    34212 non-null float64
CRSElapsedTime       35172 non-null float64
AirTime              21841 non-null float64
ArrDelay             34212 non-null float64
DepDelay             34298 non-null float64
Origin               35182 non-null object
Dest                 35182 non-null object
Distance             35155 non-null float64
TaxiIn               22344 non-null float64
TaxiOut              22346 non-null float64
Cancelled            35182 non-null int64
CancellationCode     189 non-null object
Diverted             351

In [6]:
# existem colunas com dados nulos?
data[data.columns[data.isnull().any()]].isnull().sum()

DepTime                884
ArrTime                970
TailNum              12836
ActualElapsedTime      970
CRSElapsedTime          10
AirTime              13341
ArrDelay               970
DepDelay               884
Distance                27
TaxiIn               12838
TaxiOut              12836
CancellationCode     34993
CarrierDelay         28046
WeatherDelay         28046
NASDelay             28046
SecurityDelay        28046
LateAircraftDelay    28046
dtype: int64

In [7]:
# sumário estatístico das características numéricas
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DepTime,34298.0,1345.5412,465.3347,1.0,929.0,1330.0,1735.0,2400.0
CRSDepTime,35182.0,1313.1056,476.215,0.0,910.0,1320.0,1720.0,2359.0
ArrTime,34212.0,1504.5727,483.9483,1.0,1118.0,1526.0,1917.0,2400.0
CRSArrTime,35182.0,1485.2642,492.3544,0.0,1109.0,1516.0,1902.0,2359.0
FlightNum,35182.0,819.5181,777.8129,1.0,204.0,557.0,1242.75,3948.0
ActualElapsedTime,34212.0,124.8756,73.9543,16.0,71.0,101.0,151.0,475.0
CRSElapsedTime,35172.0,125.009,73.2944,17.0,71.0,102.0,151.0,437.0
AirTime,21841.0,114.3403,69.5007,14.0,61.0,91.0,140.0,402.0
ArrDelay,34212.0,9.2457,29.374,-63.0,-6.0,2.0,15.0,475.0
DepDelay,34298.0,9.8819,25.9745,-16.0,-2.0,1.0,10.0,473.0


In [8]:
# sumário das características textuais
data.describe(include=['O']).T

Unnamed: 0,count,unique,top,freq
FlightDate,35182,594,2008-01-03,1606
UniqueCarrier,35182,10,US,14967
TailNum,22346,3383,UNKNOW,144
Origin,35182,131,DEN,2845
Dest,35182,134,PHX,7451
CancellationCode,189,3,B,80
IsArrDelayed,35182,2,YES,19598
IsDepDelayed,35182,2,YES,18507


In [14]:
# quais as correlações entre as características numéricas?
corr = pd.get_dummies(data, columns=['survived', 'sex', 'embarked']).corr()
corr

Unnamed: 0,pclass,age,sibsp,parch,fare,survived_no,survived_yes,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
pclass,1.0,-0.4396,0.0695,0.0156,-0.5721,0.2944,-0.2944,-0.0897,0.0897,-0.2335,0.226,0.0677
age,-0.4396,1.0,-0.2487,-0.1694,0.1838,0.0483,-0.0483,-0.0871,0.0871,0.1214,-0.0035,-0.119
sibsp,0.0695,-0.2487,1.0,0.3849,0.1865,0.0259,-0.0259,0.1015,-0.1015,-0.0467,-0.0342,0.0651
parch,0.0156,-0.1694,0.3849,1.0,0.2559,-0.0784,0.0784,0.2209,-0.2209,-0.0169,-0.0926,0.0762
fare,-0.5721,0.1838,0.1865,0.2559,1.0,-0.2405,0.2405,0.1734,-0.1734,0.2415,-0.1323,-0.132
survived_no,0.2944,0.0483,0.0259,-0.0784,-0.2405,1.0,-1.0,-0.5134,0.5134,-0.1403,-0.0007,0.1297
survived_yes,-0.2944,-0.0483,-0.0259,0.0784,0.2405,-1.0,1.0,0.5134,-0.5134,0.1403,0.0007,-0.1297
sex_female,-0.0897,-0.0871,0.1015,0.2209,0.1734,-0.5134,0.5134,1.0,-1.0,0.0437,0.0859,-0.0999
sex_male,0.0897,0.0871,-0.1015,-0.2209,-0.1734,0.5134,-0.5134,-1.0,1.0,-0.0437,-0.0859,0.0999
embarked_C,-0.2335,0.1214,-0.0467,-0.0169,0.2415,-0.1403,0.1403,0.0437,-0.0437,1.0,-0.1586,-0.7711


In [20]:
# quais as correlações mais expressivas entre as variáveis?
corr[corr != 1][abs(corr) > 0.05].dropna(how='all', axis=1).dropna(how='all', axis=0)

Unnamed: 0,pclass,age,sibsp,parch,fare,survived_no,survived_yes,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
pclass,,-0.4396,0.0695,,-0.5721,0.2944,-0.2944,-0.0897,0.0897,-0.2335,0.226,0.0677
age,-0.4396,,-0.2487,-0.1694,0.1838,,,-0.0871,0.0871,0.1214,,-0.119
sibsp,0.0695,-0.2487,,0.3849,0.1865,,,0.1015,-0.1015,,,0.0651
parch,,-0.1694,0.3849,,0.2559,-0.0784,0.0784,0.2209,-0.2209,,-0.0926,0.0762
fare,-0.5721,0.1838,0.1865,0.2559,,-0.2405,0.2405,0.1734,-0.1734,0.2415,-0.1323,-0.132
survived_no,0.2944,,,-0.0784,-0.2405,,-1.0,-0.5134,0.5134,-0.1403,,0.1297
survived_yes,-0.2944,,,0.0784,0.2405,-1.0,,0.5134,-0.5134,0.1403,,-0.1297
sex_female,-0.0897,-0.0871,0.1015,0.2209,0.1734,-0.5134,0.5134,,-1.0,,0.0859,-0.0999
sex_male,0.0897,0.0871,-0.1015,-0.2209,-0.1734,0.5134,-0.5134,-1.0,,,-0.0859,0.0999
embarked_C,-0.2335,0.1214,,,0.2415,-0.1403,0.1403,,,,-0.1586,-0.7711


In [12]:
data.groupby('IsDepDelayed').mean().T

IsDepDelayed,NO,YES
DepTime,1274.7797,1412.4962
CRSDepTime,1268.5779,1353.2255
ArrTime,1445.8084,1560.2985
CRSArrTime,1435.958,1529.6896
FlightNum,837.1076,803.6697
ActualElapsedTime,119.8412,129.6497
CRSElapsedTime,120.6237,128.9623
AirTime,109.8696,119.2413
ArrDelay,-3.024,20.8809
DepDelay,-2.2748,21.3847


In [29]:
data2 = data[['pclass', 'survived', 'sex', 'sibsp', 'parch']]
data2.survived = data2.survived.map({'yes': 1, 'no': 0})
data2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Unnamed: 0_level_0,pclass,survived,sex,sibsp,parch
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
416,2,0,male,0,0
194,1,0,male,0,0
600,3,0,male,0,0
1112,3,0,female,1,1
878,3,0,female,1,0


In [30]:
# existe correlação entre sobrevivência e classe social?
data2[['pclass', 'survived']].groupby(['pclass'], as_index=False).mean().\
  sort_values(by='survived', ascending=False)

Unnamed: 0,pclass,survived
0,1,0.5946
1,2,0.3934
2,3,0.2527


In [13]:
data['UniqueCarrier'].value_counts()

US    14967
UA     7546
WN     4951
HP     2770
PS     2572
DL      740
PI      641
AA      574
TW      254
CO      167
Name: UniqueCarrier, dtype: int64

In [14]:
# existe correlação da sobrevivência com o deque?
corr = pd.get_dummies(data, columns=['IsDepDelayed', 'UniqueCarrier']).corr()
corr

Unnamed: 0,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsDepDelayed_NO,IsDepDelayed_YES,UniqueCarrier_AA,UniqueCarrier_CO,UniqueCarrier_DL,UniqueCarrier_HP,UniqueCarrier_PI,UniqueCarrier_PS,UniqueCarrier_TW,UniqueCarrier_UA,UniqueCarrier_US,UniqueCarrier_WN
DepTime,1.0,0.9171,0.7945,0.7983,0.0133,-0.0469,-0.0427,-0.0448,0.1821,0.2183,-0.0404,-0.0101,0.0078,,-0.0189,0.1,0.0221,0.0695,-0.0136,0.2859,-0.1479,0.1479,0.0503,-0.0264,0.0489,-0.0236,0.0454,-0.0143,0.0242,0.0231,-0.0466,0.0115
CRSDepTime,0.9171,1.0,0.739,0.8832,0.0092,-0.0655,-0.0623,-0.056,0.1,0.1218,-0.0518,0.0067,-0.0353,-0.0022,-0.0223,0.0435,0.0063,0.0352,-0.0155,0.2055,-0.0888,0.0888,-0.2663,-0.0227,0.0488,-0.0095,0.0508,0.0012,0.0257,0.0444,-0.0057,0.0177
ArrTime,0.7945,0.739,1.0,0.8395,-0.0049,0.0727,0.073,0.0479,0.0915,0.1038,0.0788,0.0335,0.0126,,,0.0065,-0.0027,0.1045,-0.0053,0.0599,-0.1182,0.1182,0.0408,-0.0144,0.0533,0.0043,0.0265,-0.0591,0.0026,0.0359,-0.0262,-0.0087
CRSArrTime,0.7983,0.8832,0.8395,1.0,-0.0033,0.0548,0.0599,0.0318,0.102,0.1248,0.0733,0.032,-0.0167,-0.0071,-0.0216,0.0413,0.01,0.054,-0.0104,0.1929,-0.0951,0.0951,-0.2872,-0.0113,0.0633,0.011,0.0273,-0.0534,0.0135,0.0724,-0.0035,0.018
FlightNum,0.0133,0.0092,-0.0049,-0.0033,1.0,-0.0552,-0.0485,-0.0495,0.0078,0.018,-0.0563,-0.0321,-0.1076,-0.0064,-0.0034,0.0596,-0.0049,-0.0521,0.0265,0.0803,0.0215,-0.0215,0.016,0.0566,-0.0155,0.3951,0.0232,0.2384,-0.0486,-0.0455,-0.4509,0.2026
ActualElapsedTime,-0.0469,-0.0655,0.0727,0.0548,-0.0552,1.0,0.9842,0.989,0.1286,0.0823,0.9725,0.143,0.2705,,,0.0248,0.0357,0.1931,-0.001,-0.0384,-0.0663,0.0663,0.0774,0.1381,0.238,0.1595,-0.1149,-0.2047,0.0049,0.149,-0.1193,-0.0884
CRSElapsedTime,-0.0427,-0.0623,0.073,0.0599,-0.0485,0.9842,1.0,0.988,0.0441,0.0758,0.9851,0.0969,0.1536,-0.0409,0.0119,0.0204,0.0138,0.0514,-0.0028,-0.0333,-0.0568,0.0568,0.0801,0.1397,0.237,0.1555,-0.1218,-0.2175,0.0043,0.1507,-0.1258,-0.0651
AirTime,-0.0448,-0.056,0.0479,0.0318,-0.0495,0.989,0.988,1.0,0.0828,0.0601,0.9785,0.0827,0.1371,-0.0232,,0.0228,0.0173,0.0896,0.0018,-0.039,-0.0674,0.0674,0.0734,0.1644,0.284,0.1685,,,-0.0176,0.0134,-0.1577,-0.1522
ArrDelay,0.1821,0.1,0.0915,0.102,0.0078,0.1286,0.0441,0.0828,1.0,0.8947,0.046,0.1107,0.3507,,,0.5094,0.1881,0.5264,0.0124,0.6824,-0.4068,0.4068,0.0131,-0.0075,0.04,0.0032,0.0044,0.0051,0.0375,0.0124,-0.0516,0.0216
DepDelay,0.2183,0.1218,0.1038,0.1248,0.018,0.0823,0.0758,0.0601,0.8947,1.0,0.0687,0.006,0.1027,,0.0509,0.546,0.1647,0.3148,0.0099,0.7514,-0.4553,0.4553,0.0215,-0.0048,0.0384,-0.0224,-0.0027,-0.0364,0.0387,0.0097,-0.0603,0.0874
