# Data Manipulation in Pandas

In this assignment, you will be working on the same dataframe of flights departing New York City in 2013. 

In [1]:
import pandas as pd

In [2]:
# Install the package 'nycflights13' before you can run this
from nycflights13 import flights
flights.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01T10:00:00Z
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01T10:00:00Z
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01T10:00:00Z
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01T10:00:00Z
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01T11:00:00Z


In [3]:
flights.shape

(336776, 19)

## Data frame with columns

- year,month,day
        Date of departure    
- dep_time,arr_time
        Actual departure and arrival times (format HHMM or HMM), local tz.
- sched_dep_time,sched_arr_time
        Scheduled departure and arrival times (format HHMM or HMM), local tz.    
- dep_delay,arr_delay
        Departure and arrival delays, in minutes. Negative times represent early departures/arrivals.
- hour,minute
        Time of scheduled departure broken into hour and minutes.
- carrier
        Two letter carrier abbreviation. See airlines() to get name
- tailnum
        Plane tail number
- flight
        Flight number
- origin,dest
        Origin and destination. See airports() for additional metadata.
- air_time
        Amount of time spent in the air, in minutes
- distance
        Distance between airports, in miles
- time_hour
        Scheduled date and hour of the flight as a date. Along with origin, can be used to join flights data to weather data.

In [4]:
# use describe() to summarize all columns
flights.describe(include='all')

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
count,336776.0,336776.0,336776.0,328521.0,336776.0,328521.0,328063.0,336776.0,327346.0,336776,336776.0,334264,336776,336776,327346.0,336776.0,336776.0,336776.0,336776
unique,,,,,,,,,,16,,4043,3,105,,,,,6936
top,,,,,,,,,,UA,,N725MQ,EWR,ORD,,,,,2013-09-13T12:00:00Z
freq,,,,,,,,,,58665,,575,120835,17283,,,,,94
mean,2013.0,6.54851,15.710787,1349.109947,1344.25484,12.63907,1502.054999,1536.38022,6.895377,,1971.92362,,,,150.68646,1039.912604,13.180247,26.2301,
std,0.0,3.414457,8.768607,488.281791,467.335756,40.210061,533.264132,497.457142,44.633292,,1632.471938,,,,93.688305,733.233033,4.661316,19.300846,
min,2013.0,1.0,1.0,1.0,106.0,-43.0,1.0,1.0,-86.0,,1.0,,,,20.0,17.0,1.0,0.0,
25%,2013.0,4.0,8.0,907.0,906.0,-5.0,1104.0,1124.0,-17.0,,553.0,,,,82.0,502.0,9.0,8.0,
50%,2013.0,7.0,16.0,1401.0,1359.0,-2.0,1535.0,1556.0,-5.0,,1496.0,,,,129.0,872.0,13.0,29.0,
75%,2013.0,10.0,23.0,1744.0,1729.0,11.0,1940.0,1945.0,14.0,,3465.0,,,,192.0,1389.0,17.0,44.0,


## Question 1. Selecting rows

From the 'flights' dataframe, find all flights that satisfy the following certain conditions:

In [5]:
# Had an arrival delay of two or more hours
flights[flights['arr_delay']>=120]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
119,2013,1,1,811.0,630,101.0,1047.0,830,137.0,MQ,4576,N531MQ,LGA,CLT,118.0,544,6,30,2013-01-01T11:00:00Z
151,2013,1,1,848.0,1835,853.0,1001.0,1950,851.0,MQ,3944,N942MQ,JFK,BWI,41.0,184,18,35,2013-01-01T23:00:00Z
218,2013,1,1,957.0,733,144.0,1056.0,853,123.0,UA,856,N534UA,EWR,BOS,37.0,200,7,33,2013-01-01T12:00:00Z
268,2013,1,1,1114.0,900,134.0,1447.0,1222,145.0,UA,1086,N76502,LGA,IAH,248.0,1416,9,0,2013-01-01T14:00:00Z
447,2013,1,1,1505.0,1310,115.0,1638.0,1431,127.0,EV,4497,N17984,EWR,RIC,63.0,277,13,10,2013-01-01T18:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336579,2013,9,30,1823.0,1545,158.0,1934.0,1733,121.0,9E,3459,N916XJ,JFK,BNA,95.0,765,15,45,2013-09-30T19:00:00Z
336668,2013,9,30,1951.0,1649,182.0,2157.0,1903,174.0,EV,4294,N13988,EWR,SAV,95.0,708,16,49,2013-09-30T20:00:00Z
336724,2013,9,30,2053.0,1815,158.0,2310.0,2054,136.0,EV,5292,N600QX,EWR,ATL,91.0,746,18,15,2013-09-30T22:00:00Z
336757,2013,9,30,2159.0,1845,194.0,2344.0,2030,194.0,9E,3320,N906XJ,JFK,BUF,50.0,301,18,45,2013-09-30T22:00:00Z


In [6]:
# Flew to Houston (IAH or HOU)
flights[(flights.dest=='IAH') | (flights.dest=='HOU')]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01T10:00:00Z
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01T10:00:00Z
32,2013,1,1,623.0,627,-4.0,933.0,932,1.0,UA,496,N459UA,LGA,IAH,229.0,1416,6,27,2013-01-01T11:00:00Z
81,2013,1,1,728.0,732,-4.0,1041.0,1038,3.0,UA,473,N488UA,LGA,IAH,238.0,1416,7,32,2013-01-01T12:00:00Z
89,2013,1,1,739.0,739,0.0,1104.0,1038,26.0,UA,1479,N37408,EWR,IAH,249.0,1400,7,39,2013-01-01T12:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336524,2013,9,30,1729.0,1720,9.0,2001.0,2010,-9.0,UA,652,N455UA,EWR,IAH,173.0,1400,17,20,2013-09-30T21:00:00Z
336527,2013,9,30,1735.0,1715,20.0,2010.0,2005,5.0,WN,2067,N296WN,EWR,HOU,188.0,1411,17,15,2013-09-30T21:00:00Z
336618,2013,9,30,1859.0,1859,0.0,2134.0,2159,-25.0,UA,1128,N14731,LGA,IAH,180.0,1416,18,59,2013-09-30T22:00:00Z
336694,2013,9,30,2015.0,2015,0.0,2244.0,2307,-23.0,UA,1545,N17730,EWR,IAH,174.0,1400,20,15,2013-10-01T00:00:00Z


In [7]:
# Were operated by United (UA), American (AA), or Delta (DL)
flights[(flights.carrier=='UA') | (flights.carrier=='AA') | (flights.carrier=='DL')]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01T10:00:00Z
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01T10:00:00Z
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01T10:00:00Z
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01T11:00:00Z
5,2013,1,1,554.0,558,-4.0,740.0,728,12.0,UA,1696,N39463,EWR,ORD,150.0,719,5,58,2013-01-01T10:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336737,2013,9,30,2105.0,2106,-1.0,2329.0,2354,-25.0,UA,475,N477UA,EWR,IAH,175.0,1400,21,6,2013-10-01T01:00:00Z
336744,2013,9,30,2121.0,2100,21.0,2349.0,14,-25.0,DL,2363,N193DN,JFK,LAX,296.0,2475,21,0,2013-10-01T01:00:00Z
336751,2013,9,30,2140.0,2140,0.0,10.0,40,-30.0,AA,185,N335AA,JFK,LAX,298.0,2475,21,40,2013-10-01T01:00:00Z
336755,2013,9,30,2149.0,2156,-7.0,2245.0,2308,-23.0,UA,523,N813UA,EWR,BOS,37.0,200,21,56,2013-10-01T01:00:00Z


In [8]:
# Departed in July, August, and September
flights[(flights.month.isin([7,8,9]))]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
250450,2013,7,1,1.0,2029,212.0,236.0,2359,157.0,B6,915,N653JB,JFK,SFO,315.0,2586,20,29,2013-07-02T00:00:00Z
250451,2013,7,1,2.0,2359,3.0,344.0,344,0.0,B6,1503,N805JB,JFK,SJU,200.0,1598,23,59,2013-07-02T03:00:00Z
250452,2013,7,1,29.0,2245,104.0,151.0,1,110.0,B6,234,N348JB,JFK,BTV,66.0,266,22,45,2013-07-02T02:00:00Z
250453,2013,7,1,43.0,2130,193.0,322.0,14,188.0,B6,1371,N794JB,LGA,FLL,143.0,1076,21,30,2013-07-02T01:00:00Z
250454,2013,7,1,44.0,2150,174.0,300.0,100,120.0,AA,185,N324AA,JFK,LAX,297.0,2475,21,50,2013-07-02T01:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336771,2013,9,30,,1455,,,1634,,9E,3393,,JFK,DCA,,213,14,55,2013-09-30T18:00:00Z
336772,2013,9,30,,2200,,,2312,,9E,3525,,LGA,SYR,,198,22,0,2013-10-01T02:00:00Z
336773,2013,9,30,,1210,,,1330,,MQ,3461,N535MQ,LGA,BNA,,764,12,10,2013-09-30T16:00:00Z
336774,2013,9,30,,1159,,,1344,,MQ,3572,N511MQ,LGA,CLE,,419,11,59,2013-09-30T15:00:00Z


In [9]:
# Arrived more than two hours late, but didn’t leave late
flights[(flights.arr_delay>120) & (flights.dep_delay==0)]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
33011,2013,10,7,1350.0,1350,0.0,1736.0,1526,130.0,EV,5181,N611QX,LGA,MSN,117.0,812,13,50,2013-10-07T17:00:00Z
214512,2013,5,23,1810.0,1810,0.0,2208.0,2000,128.0,MQ,4626,N525MQ,LGA,CMH,82.0,479,18,10,2013-05-23T22:00:00Z
250689,2013,7,1,905.0,905,0.0,1443.0,1223,140.0,DL,1057,N337NB,LGA,MIA,183.0,1096,9,5,2013-07-01T13:00:00Z


In [10]:
# Were delayed by at least an hour, but made up over 30 minutes in flight
flights[(flights.dep_delay>=60) & (flights.air_time>30)]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
119,2013,1,1,811.0,630,101.0,1047.0,830,137.0,MQ,4576,N531MQ,LGA,CLT,118.0,544,6,30,2013-01-01T11:00:00Z
135,2013,1,1,826.0,715,71.0,1136.0,1045,51.0,AA,443,N3GVAA,JFK,MIA,160.0,1089,7,15,2013-01-01T12:00:00Z
151,2013,1,1,848.0,1835,853.0,1001.0,1950,851.0,MQ,3944,N942MQ,JFK,BWI,41.0,184,18,35,2013-01-01T23:00:00Z
218,2013,1,1,957.0,733,144.0,1056.0,853,123.0,UA,856,N534UA,EWR,BOS,37.0,200,7,33,2013-01-01T12:00:00Z
268,2013,1,1,1114.0,900,134.0,1447.0,1222,145.0,UA,1086,N76502,LGA,IAH,248.0,1416,9,0,2013-01-01T14:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336742,2013,9,30,2119.0,2005,74.0,2310.0,2212,58.0,EV,4321,N17984,EWR,MCI,147.0,1092,20,5,2013-10-01T00:00:00Z
336757,2013,9,30,2159.0,1845,194.0,2344.0,2030,194.0,9E,3320,N906XJ,JFK,BUF,50.0,301,18,45,2013-09-30T22:00:00Z
336760,2013,9,30,2211.0,2059,72.0,2339.0,2242,57.0,EV,4672,N12145,EWR,STL,120.0,872,20,59,2013-10-01T00:00:00Z
336762,2013,9,30,2233.0,2113,80.0,112.0,30,42.0,UA,471,N578UA,EWR,SFO,318.0,2565,21,13,2013-10-01T01:00:00Z


In [11]:
# Departed between midnight and 6am (inclusive)
flights.query('dep_time<=600')

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01T10:00:00Z
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01T10:00:00Z
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01T10:00:00Z
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01T10:00:00Z
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01T11:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335802,2013,9,30,557.0,600,-3.0,852.0,923,-31.0,UA,303,N510UA,JFK,SFO,326.0,2586,6,0,2013-09-30T10:00:00Z
335803,2013,9,30,558.0,600,-2.0,815.0,829,-14.0,EV,4137,N16981,EWR,ATL,107.0,746,6,0,2013-09-30T10:00:00Z
335804,2013,9,30,558.0,600,-2.0,742.0,749,-7.0,DL,731,N337NB,LGA,DTW,83.0,502,6,0,2013-09-30T10:00:00Z
335805,2013,9,30,559.0,600,-1.0,,715,,WN,464,N411WN,EWR,MDW,,711,6,0,2013-09-30T10:00:00Z


In [12]:
# How many flights have a missing dep_time? 
flights[flights.dep_time.isnull()]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
838,2013,1,1,,1630,,,1815,,EV,4308,N18120,EWR,RDU,,416,16,30,2013-01-01T21:00:00Z
839,2013,1,1,,1935,,,2240,,AA,791,N3EHAA,LGA,DFW,,1389,19,35,2013-01-02T00:00:00Z
840,2013,1,1,,1500,,,1825,,AA,1925,N3EVAA,LGA,MIA,,1096,15,0,2013-01-01T20:00:00Z
841,2013,1,1,,600,,,901,,B6,125,N618JB,JFK,FLL,,1069,6,0,2013-01-01T11:00:00Z
1777,2013,1,2,,1540,,,1747,,EV,4352,N10575,EWR,CVG,,569,15,40,2013-01-02T20:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336771,2013,9,30,,1455,,,1634,,9E,3393,,JFK,DCA,,213,14,55,2013-09-30T18:00:00Z
336772,2013,9,30,,2200,,,2312,,9E,3525,,LGA,SYR,,198,22,0,2013-10-01T02:00:00Z
336773,2013,9,30,,1210,,,1330,,MQ,3461,N535MQ,LGA,BNA,,764,12,10,2013-09-30T16:00:00Z
336774,2013,9,30,,1159,,,1344,,MQ,3572,N511MQ,LGA,CLE,,419,11,59,2013-09-30T15:00:00Z


## Question 2. Sorting

In [13]:
# Sort flights to find the least delayed flights. Find the flights that left earliest.
flights.sort_values(['dep_delay', 'dep_time'], ascending=[True,False])

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
89673,2013,12,7,2040.0,2123,-43.0,40.0,2352,48.0,B6,97,N592JB,JFK,DEN,265.0,1626,21,23,2013-12-08T02:00:00Z
113633,2013,2,3,2022.0,2055,-33.0,2240.0,2338,-58.0,DL,1715,N612DL,LGA,MSY,162.0,1183,20,55,2013-02-04T01:00:00Z
64501,2013,11,10,1408.0,1440,-32.0,1549.0,1559,-10.0,EV,5713,N825AS,LGA,IAD,52.0,229,14,40,2013-11-10T19:00:00Z
9619,2013,1,11,1900.0,1930,-30.0,2233.0,2243,-10.0,DL,1435,N934DL,LGA,TPA,139.0,1010,19,30,2013-01-12T00:00:00Z
24915,2013,1,29,1703.0,1730,-27.0,1947.0,1957,-10.0,F9,837,N208FR,LGA,DEN,250.0,1620,17,30,2013-01-29T22:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336771,2013,9,30,,1455,,,1634,,9E,3393,,JFK,DCA,,213,14,55,2013-09-30T18:00:00Z
336772,2013,9,30,,2200,,,2312,,9E,3525,,LGA,SYR,,198,22,0,2013-10-01T02:00:00Z
336773,2013,9,30,,1210,,,1330,,MQ,3461,N535MQ,LGA,BNA,,764,12,10,2013-09-30T16:00:00Z
336774,2013,9,30,,1159,,,1344,,MQ,3572,N511MQ,LGA,CLE,,419,11,59,2013-09-30T15:00:00Z


In [14]:
# Which flights travelled the farthest? Which travelled the shortest?
flights.sort_values('distance')

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
275945,2013,7,27,,106,,,245,,US,1632,,EWR,LGA,,17,1,6,2013-07-27T05:00:00Z
3083,2013,1,4,1240.0,1200,40.0,1333.0,1306,27.0,EV,4193,N14972,EWR,PHL,30.0,80,12,0,2013-01-04T17:00:00Z
16328,2013,1,19,1617.0,1617,0.0,1722.0,1722,0.0,EV,4616,N12540,EWR,PHL,34.0,80,16,17,2013-01-19T21:00:00Z
112178,2013,2,1,2128.0,2129,-1.0,2216.0,2224,-8.0,EV,4619,N13969,EWR,PHL,24.0,80,21,29,2013-02-02T02:00:00Z
19983,2013,1,23,2128.0,2129,-1.0,2221.0,2224,-3.0,EV,4619,N12135,EWR,PHL,23.0,80,21,29,2013-01-24T02:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99112,2013,12,18,928.0,930,-2.0,1543.0,1535,8.0,HA,51,N395HA,JFK,HNL,641.0,4983,9,30,2013-12-18T14:00:00Z
223207,2013,6,2,956.0,1000,-4.0,1442.0,1435,7.0,HA,51,N383HA,JFK,HNL,617.0,4983,10,0,2013-06-02T14:00:00Z
151311,2013,3,17,1006.0,1000,6.0,1607.0,1530,37.0,HA,51,N380HA,JFK,HNL,686.0,4983,10,0,2013-03-17T14:00:00Z
218562,2013,5,28,953.0,1000,-7.0,1447.0,1500,-13.0,HA,51,N385HA,JFK,HNL,631.0,4983,10,0,2013-05-28T14:00:00Z


## Question 3. Selecting Columns

Use at least three ways to select dep_time, dep_delay, arr_time, and arr_delay from flights.

In [15]:
# Method 1
flights[['dep_time', 'dep_delay','arr_time','arr_delay']]

Unnamed: 0,dep_time,dep_delay,arr_time,arr_delay
0,517.0,2.0,830.0,11.0
1,533.0,4.0,850.0,20.0
2,542.0,2.0,923.0,33.0
3,544.0,-1.0,1004.0,-18.0
4,554.0,-6.0,812.0,-25.0
...,...,...,...,...
336771,,,,
336772,,,,
336773,,,,
336774,,,,


In [16]:
# Method 2
flights.filter(['dep_time', 'dep_delay','arr_time','arr_delay'])

Unnamed: 0,dep_time,dep_delay,arr_time,arr_delay
0,517.0,2.0,830.0,11.0
1,533.0,4.0,850.0,20.0
2,542.0,2.0,923.0,33.0
3,544.0,-1.0,1004.0,-18.0
4,554.0,-6.0,812.0,-25.0
...,...,...,...,...
336771,,,,
336772,,,,
336773,,,,
336774,,,,


In [17]:
# Method 3
flights.iloc[:,[3,5,6,8]]

Unnamed: 0,dep_time,dep_delay,arr_time,arr_delay
0,517.0,2.0,830.0,11.0
1,533.0,4.0,850.0,20.0
2,542.0,2.0,923.0,33.0
3,544.0,-1.0,1004.0,-18.0
4,554.0,-6.0,812.0,-25.0
...,...,...,...,...
336771,,,,
336772,,,,
336773,,,,
336774,,,,


In [18]:
# Pattern match


## Question 4. Adding new columns

Currently dep_time and sched_dep_time are convenient to look at, but hard to compute with because they’re not really continuous numbers. 

For example, 759 means 7:59 and 801 means 8:01. Their difference is not 42 but 2 minutes. 

In [19]:
flights_sml = flights.filter(['dep_time','sched_dep_time','arr_time' ])
flights_sml

Unnamed: 0,dep_time,sched_dep_time,arr_time
0,517.0,515,830.0
1,533.0,529,850.0
2,542.0,540,923.0
3,544.0,545,1004.0
4,554.0,600,812.0
...,...,...,...
336771,,1455,
336772,,2200,
336773,,1210,
336774,,1159,


In [20]:
# Convert them to a more convenient representation of number of minutes since midnight (0).
flights_sml['dep_time'] = flights_sml.dep_time //100 * 60 + flights_sml.dep_time % 100
flights_sml['sched_dep_time'] = flights_sml.sched_dep_time //100 * 60 + flights_sml.sched_dep_time % 100
flights_sml['arr_time'] = flights_sml.arr_time //100 * 60 + flights_sml.arr_time % 100
flights_sml

Unnamed: 0,dep_time,sched_dep_time,arr_time
0,317.0,315,510.0
1,333.0,329,530.0
2,342.0,340,563.0
3,344.0,345,604.0
4,354.0,360,492.0
...,...,...,...
336771,,895,
336772,,1320,
336773,,730,
336774,,719,


In [21]:
# Create a new column of arr_time - dep_time. 
flights_sml['time'] = flights_sml['arr_time']- flights_sml['dep_time']

# Compare this column with air_time. 
flights_sml['air_time'] = flights['air_time']
flights_sml

Unnamed: 0,dep_time,sched_dep_time,arr_time,time,air_time
0,317.0,315,510.0,193.0,227.0
1,333.0,329,530.0,197.0,227.0
2,342.0,340,563.0,221.0,160.0
3,344.0,345,604.0,260.0,183.0
4,354.0,360,492.0,138.0,116.0
...,...,...,...,...,...
336771,,895,,,
336772,,1320,,,
336773,,730,,,
336774,,719,,,


In [22]:
# Compare dep_time, sched_dep_time, and dep_delay. How would you expect those three numbers to be related?
# Try creating a column to calculate dep_delay from dep_time and sched_dep_time (and/or other columns if necessary). 
flights_sml['dep_delay'] = flights['dep_delay']
flights_sml['new_dep_dely'] =   flights_sml['dep_time'] - flights_sml['sched_dep_time']

# Test your results. 
flights_sml

Unnamed: 0,dep_time,sched_dep_time,arr_time,time,air_time,dep_delay,new_dep_dely
0,317.0,315,510.0,193.0,227.0,2.0,2.0
1,333.0,329,530.0,197.0,227.0,4.0,4.0
2,342.0,340,563.0,221.0,160.0,2.0,2.0
3,344.0,345,604.0,260.0,183.0,-1.0,-1.0
4,354.0,360,492.0,138.0,116.0,-6.0,-6.0
...,...,...,...,...,...,...,...
336771,,895,,,,,
336772,,1320,,,,,
336773,,730,,,,,
336774,,719,,,,,


## Question 5. Mixing things together

The following questions may require multiple operations above. 

In [23]:
#flights.sort_values

In [24]:
# Find the 20 most delayed flights. 
# Display the following: year,month,day,carrier,flight,dep_delay,arr_delay,carrier
# How do you want to handle ties? 
flights['del_rank'] = flights['dep_delay'].rank(method='min',ascending=False)
flights = flights.sort_values(['del_rank', 'dep_delay'])

flights_med = flights.head(20)

flights_med[['year','month','day','carrier','flight','dep_delay','arr_delay','del_rank']]

Unnamed: 0,year,month,day,carrier,flight,dep_delay,arr_delay,del_rank
7072,2013,1,9,HA,51,1301.0,1272.0,1.0
235778,2013,6,15,MQ,3535,1137.0,1127.0,2.0
8239,2013,1,10,MQ,3695,1126.0,1109.0,3.0
327043,2013,9,20,AA,177,1014.0,1007.0,4.0
270376,2013,7,22,MQ,3075,1005.0,989.0,5.0
173992,2013,4,10,DL,2391,960.0,931.0,6.0
151974,2013,3,17,DL,2119,911.0,915.0,7.0
247040,2013,6,27,DL,2007,899.0,850.0,8.0
270987,2013,7,22,DL,2047,898.0,895.0,9.0
87238,2013,12,5,AA,172,896.0,878.0,10.0


In [31]:
# Sort all AA flights to find the top 10 fastest (highest speed) flights.
# Display the following: year,month,day,carrier,flight,orig,dest,distance,air_time,speed (miles per hour)
#flights['speed'] = flights['distance'] / flights['hour']

#flights = flights[flights.carrier=='AA'].filter(['year','month','day','carrier','origin','dest','air_time','distance','speed'])

#flights['speed_rank'] = flights['speed'].rank(method='min',ascending=False)

#flights = flights.sort_values('speed_rank', ascending=True)

#flights.iloc[:10]


Unnamed: 0,year,month,day,carrier,origin,dest,air_time,distance,speed,speed_rank
40096,2013,10,15,AA,JFK,LAS,319.0,2248,374.666667,1.0
70641,2013,11,17,AA,JFK,LAS,326.0,2248,374.666667,1.0
59530,2013,11,5,AA,JFK,LAS,329.0,2248,374.666667,1.0
316361,2013,9,9,AA,JFK,LAS,277.0,2248,374.666667,1.0
75508,2013,11,22,AA,JFK,LAS,356.0,2248,374.666667,1.0
91617,2013,12,10,AA,JFK,LAS,339.0,2248,374.666667,1.0
332285,2013,9,26,AA,JFK,LAS,297.0,2248,374.666667,1.0
58549,2013,11,4,AA,JFK,LAS,324.0,2248,374.666667,1.0
72565,2013,11,19,AA,JFK,LAS,295.0,2248,374.666667,1.0
88117,2013,12,6,AA,JFK,LAS,330.0,2248,374.666667,1.0


In [36]:
# Sort all AA flights to find the top 10 fastest (highest speed) flights.
# Display the following: year,month,day,carrier,flight,orig,dest,distance,air_time,speed (miles per hour)
flights['speed'] = flights['distance'] / flights['hour']
    
flights_med = (flights[(flights.carrier=='AA')]
               .filter(['year','month','day','carrier','origin','dest','air_time','distance','speed'])
               .sort_values('speed', ascending=False)
               .head(10))
    
flights_med

Unnamed: 0,year,month,day,carrier,origin,dest,air_time,distance,speed
79092,2013,11,26,AA,JFK,LAS,311.0,2248,374.666667
70641,2013,11,17,AA,JFK,LAS,326.0,2248,374.666667
106975,2013,12,27,AA,JFK,LAS,297.0,2248,374.666667
59530,2013,11,5,AA,JFK,LAS,329.0,2248,374.666667
52125,2013,10,28,AA,JFK,LAS,329.0,2248,374.666667
48548,2013,10,24,AA,JFK,LAS,315.0,2248,374.666667
81110,2013,11,28,AA,JFK,LAS,314.0,2248,374.666667
46611,2013,10,22,AA,JFK,LAS,300.0,2248,374.666667
31659,2013,10,6,AA,JFK,LAS,293.0,2248,374.666667
64121,2013,11,10,AA,JFK,LAS,295.0,2248,374.666667


In [60]:
# Find all flights that satisfy the following:
# - From John F. Kennedy Airpot (JFK) or Newark Aiport (EWR) to Seattle-Tacoma Airport (SEA) 
# - Carrier is UA, AA, or DL. 
# - Dates from 4/1/2013 (inclusive) to 4/3/2013 (inclusive)
# - Scheduled arrival time is before noon. 
# - Display the following: year,month,day,carrier,flight,origin,dest,sched_dep_time,sched_arr_time
# - Sort by year, month, day, sched_arr_time


flights_large = (flights[(flights.dest=='SEA')& ((flights.origin=='JFK') | (flights.origin == 'EWR'))
                        & ((flights.carrier=='UA')| (flights.carrier=='AA')| (flights.carrier=='DL'))
                        & ((flights.month==4)& flights.day.isin([1,2,3]))
                        & ((flights.sched_arr_time<1200))]
                 .filter(['year','month','day','carrier','flight','origin','dest','sched_dep_time','sched_arr_time'])
                 .sort_values(['year','month','day','sched_arr_time'])
                )
flights_large

Unnamed: 0,year,month,day,carrier,flight,origin,dest,sched_dep_time,sched_arr_time
165210,2013,4,1,DL,183,JFK,SEA,745,1100
166180,2013,4,2,DL,183,JFK,SEA,745,1100
167168,2013,4,3,DL,183,JFK,SEA,745,1100
