## Data Wrangling Python

### Data Tidying

In [1]:
import pandas as pd

In [2]:
storms = pd.read_csv("https://raw.githubusercontent.com/rstudio/EDAWR/master/data-raw/storms.csv")
storms

Unnamed: 0,storm,wind,pressure,date
0,Alberto,110,1007,2000-08-03
1,Alex,45,1009,1998-07-27
2,Allison,65,1005,1995-06-03
3,Ana,40,1013,1997-06-30
4,Arlene,50,1010,1999-06-11
5,Arthur,45,1010,1996-06-17


In [3]:
cases = pd.read_csv("https://raw.githubusercontent.com/rstudio/EDAWR/master/data-raw/cases.csv")
cases

Unnamed: 0,country,2011,2012,2013
0,FR,7000,6900,7000
1,DE,5800,6000,6200
2,US,15000,14000,13000


In [4]:
pollution = pd.read_csv("https://raw.githubusercontent.com/rstudio/EDAWR/master/data-raw/pollution.csv")
pollution

Unnamed: 0,city,size,amount
0,New York,large,23
1,New York,small,14
2,London,large,22
3,London,small,16
4,Beijing,large,121
5,Beijing,small,56


In [5]:
pd.melt(
    cases,
    id_vars = "country",
    value_vars = ["2011", "2012", "2013"]
)

Unnamed: 0,country,variable,value
0,FR,2011,7000
1,DE,2011,5800
2,US,2011,15000
3,FR,2012,6900
4,DE,2012,6000
5,US,2012,14000
6,FR,2013,7000
7,DE,2013,6200
8,US,2013,13000


In [6]:
tidy_pollution = pd.pivot(data = pollution, index = "city", columns = "size")
tidy_pollution.columns = tidy_pollution.columns.get_level_values(1).to_numpy()
tidy_pollution.reset_index(inplace=True)
tidy_pollution

Unnamed: 0,city,large,small
0,Beijing,121,56
1,London,22,16
2,New York,23,14


#### Separate dan Unite

##### Separate

In [7]:
storms_split_date = storms.copy()
storms_split_date[["year", "month", "day"]] = storms_split_date["date"].str.split("-", expand=True)
storms_split_date.drop(columns=["date"], inplace=True)
storms_split_date

Unnamed: 0,storm,wind,pressure,year,month,day
0,Alberto,110,1007,2000,8,3
1,Alex,45,1009,1998,7,27
2,Allison,65,1005,1995,6,3
3,Ana,40,1013,1997,6,30
4,Arlene,50,1010,1999,6,11
5,Arthur,45,1010,1996,6,17


In [8]:
storms_join_date = storms_split_date.copy()
storms_join_date["date"] = storms_join_date["year"] + "-" + storms_join_date["month"] + "-" + storms_join_date["day"]
storms_join_date.drop(columns=["year", "month", "day"], inplace=True)
storms_join_date

Unnamed: 0,storm,wind,pressure,date
0,Alberto,110,1007,2000-08-03
1,Alex,45,1009,1998-07-27
2,Allison,65,1005,1995-06-03
3,Ana,40,1013,1997-06-30
4,Arlene,50,1010,1999-06-11
5,Arthur,45,1010,1996-06-17


In [9]:
storms_join_date2 = storms_split_date.copy()
storms_join_date2["date"] = storms_join_date2[["year", "month", "day"]].T.agg("-".join)
storms_join_date2.drop(columns=["year", "month", "day"], inplace=True)
storms_join_date2

Unnamed: 0,storm,wind,pressure,date
0,Alberto,110,1007,2000-08-03
1,Alex,45,1009,1998-07-27
2,Allison,65,1005,1995-06-03
3,Ana,40,1013,1997-06-30
4,Arlene,50,1010,1999-06-11
5,Arthur,45,1010,1996-06-17


### Data Transformation

In [10]:
flights = pd.read_csv("data/flights.csv")

#### Filter

- Flight to SFO atau OAK
- Flight in January
- Flight delayed more than one hour
- Flight departed between midnight and 5am (inclusive)

In [11]:
flights.loc[(flights["dest"] == "SFO") | (flights["dest"] == "OAK")]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
13,2013,1,1,558.0,600,-2.0,923.0,937,-14.0,UA,1124,N53441,EWR,SFO,361.0,2565,6,0,2013-01-01 06:00:00
26,2013,1,1,611.0,600,11.0,945.0,931,14.0,UA,303,N532UA,JFK,SFO,366.0,2586,6,0,2013-01-01 06:00:00
55,2013,1,1,655.0,700,-5.0,1037.0,1045,-8.0,DL,1865,N705TW,JFK,SFO,362.0,2586,7,0,2013-01-01 07:00:00
82,2013,1,1,729.0,730,-1.0,1049.0,1115,-26.0,VX,11,N635VA,JFK,SFO,356.0,2586,7,30,2013-01-01 07:00:00
87,2013,1,1,734.0,737,-3.0,1047.0,1113,-26.0,B6,643,N625JB,JFK,SFO,350.0,2586,7,37,2013-01-01 07:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336658,2013,9,30,1938.0,1850,48.0,2248.0,2220,28.0,VX,29,N626VA,JFK,SFO,323.0,2586,18,50,2013-09-30 18:00:00
336662,2013,9,30,1940.0,1900,40.0,2228.0,2232,-4.0,DL,1465,N723TW,JFK,SFO,323.0,2586,19,0,2013-09-30 19:00:00
336679,2013,9,30,2001.0,2000,1.0,2302.0,2328,-26.0,UA,1651,N35260,EWR,SFO,323.0,2565,20,0,2013-09-30 20:00:00
336701,2013,9,30,2023.0,2025,-2.0,2315.0,2350,-35.0,B6,915,N712JB,JFK,SFO,327.0,2586,20,25,2013-09-30 20:00:00


In [12]:
flights.loc[flights["dest"].isin(["SFO", "OAK"])]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
13,2013,1,1,558.0,600,-2.0,923.0,937,-14.0,UA,1124,N53441,EWR,SFO,361.0,2565,6,0,2013-01-01 06:00:00
26,2013,1,1,611.0,600,11.0,945.0,931,14.0,UA,303,N532UA,JFK,SFO,366.0,2586,6,0,2013-01-01 06:00:00
55,2013,1,1,655.0,700,-5.0,1037.0,1045,-8.0,DL,1865,N705TW,JFK,SFO,362.0,2586,7,0,2013-01-01 07:00:00
82,2013,1,1,729.0,730,-1.0,1049.0,1115,-26.0,VX,11,N635VA,JFK,SFO,356.0,2586,7,30,2013-01-01 07:00:00
87,2013,1,1,734.0,737,-3.0,1047.0,1113,-26.0,B6,643,N625JB,JFK,SFO,350.0,2586,7,37,2013-01-01 07:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336658,2013,9,30,1938.0,1850,48.0,2248.0,2220,28.0,VX,29,N626VA,JFK,SFO,323.0,2586,18,50,2013-09-30 18:00:00
336662,2013,9,30,1940.0,1900,40.0,2228.0,2232,-4.0,DL,1465,N723TW,JFK,SFO,323.0,2586,19,0,2013-09-30 19:00:00
336679,2013,9,30,2001.0,2000,1.0,2302.0,2328,-26.0,UA,1651,N35260,EWR,SFO,323.0,2565,20,0,2013-09-30 20:00:00
336701,2013,9,30,2023.0,2025,-2.0,2315.0,2350,-35.0,B6,915,N712JB,JFK,SFO,327.0,2586,20,25,2013-09-30 20:00:00


In [13]:
flights.query("dest in ['SFO', 'OAK']")

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
13,2013,1,1,558.0,600,-2.0,923.0,937,-14.0,UA,1124,N53441,EWR,SFO,361.0,2565,6,0,2013-01-01 06:00:00
26,2013,1,1,611.0,600,11.0,945.0,931,14.0,UA,303,N532UA,JFK,SFO,366.0,2586,6,0,2013-01-01 06:00:00
55,2013,1,1,655.0,700,-5.0,1037.0,1045,-8.0,DL,1865,N705TW,JFK,SFO,362.0,2586,7,0,2013-01-01 07:00:00
82,2013,1,1,729.0,730,-1.0,1049.0,1115,-26.0,VX,11,N635VA,JFK,SFO,356.0,2586,7,30,2013-01-01 07:00:00
87,2013,1,1,734.0,737,-3.0,1047.0,1113,-26.0,B6,643,N625JB,JFK,SFO,350.0,2586,7,37,2013-01-01 07:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336658,2013,9,30,1938.0,1850,48.0,2248.0,2220,28.0,VX,29,N626VA,JFK,SFO,323.0,2586,18,50,2013-09-30 18:00:00
336662,2013,9,30,1940.0,1900,40.0,2228.0,2232,-4.0,DL,1465,N723TW,JFK,SFO,323.0,2586,19,0,2013-09-30 19:00:00
336679,2013,9,30,2001.0,2000,1.0,2302.0,2328,-26.0,UA,1651,N35260,EWR,SFO,323.0,2565,20,0,2013-09-30 20:00:00
336701,2013,9,30,2023.0,2025,-2.0,2315.0,2350,-35.0,B6,915,N712JB,JFK,SFO,327.0,2586,20,25,2013-09-30 20:00:00


In [14]:
flights.loc[flights["month"] == 1]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26999,2013,1,31,,1325,,,1505,,MQ,4475,N730MQ,LGA,RDU,,431,13,25,2013-01-31 13:00:00
27000,2013,1,31,,1200,,,1430,,MQ,4658,N505MQ,LGA,ATL,,762,12,0,2013-01-31 12:00:00
27001,2013,1,31,,1410,,,1555,,MQ,4491,N734MQ,LGA,CLE,,419,14,10,2013-01-31 14:00:00
27002,2013,1,31,,1446,,,1757,,UA,337,,LGA,IAH,,1416,14,46,2013-01-31 14:00:00


In [15]:
flights.query("month == 1")

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26999,2013,1,31,,1325,,,1505,,MQ,4475,N730MQ,LGA,RDU,,431,13,25,2013-01-31 13:00:00
27000,2013,1,31,,1200,,,1430,,MQ,4658,N505MQ,LGA,ATL,,762,12,0,2013-01-31 12:00:00
27001,2013,1,31,,1410,,,1555,,MQ,4491,N734MQ,LGA,CLE,,419,14,10,2013-01-31 14:00:00
27002,2013,1,31,,1446,,,1757,,UA,337,,LGA,IAH,,1416,14,46,2013-01-31 14:00:00


In [16]:
flights.loc[flights["dep_delay"] > 60]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
119,2013,1,1,811.0,630,101.0,1047.0,830,137.0,MQ,4576,N531MQ,LGA,CLT,118.0,544,6,30,2013-01-01 06:00:00
135,2013,1,1,826.0,715,71.0,1136.0,1045,51.0,AA,443,N3GVAA,JFK,MIA,160.0,1089,7,15,2013-01-01 07:00:00
151,2013,1,1,848.0,1835,853.0,1001.0,1950,851.0,MQ,3944,N942MQ,JFK,BWI,41.0,184,18,35,2013-01-01 18:00:00
218,2013,1,1,957.0,733,144.0,1056.0,853,123.0,UA,856,N534UA,EWR,BOS,37.0,200,7,33,2013-01-01 07:00:00
268,2013,1,1,1114.0,900,134.0,1447.0,1222,145.0,UA,1086,N76502,LGA,IAH,248.0,1416,9,0,2013-01-01 09:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336742,2013,9,30,2119.0,2005,74.0,2310.0,2212,58.0,EV,4321,N17984,EWR,MCI,147.0,1092,20,5,2013-09-30 20:00:00
336757,2013,9,30,2159.0,1845,194.0,2344.0,2030,194.0,9E,3320,N906XJ,JFK,BUF,50.0,301,18,45,2013-09-30 18:00:00
336760,2013,9,30,2211.0,2059,72.0,2339.0,2242,57.0,EV,4672,N12145,EWR,STL,120.0,872,20,59,2013-09-30 20:00:00
336762,2013,9,30,2233.0,2113,80.0,112.0,30,42.0,UA,471,N578UA,EWR,SFO,318.0,2565,21,13,2013-09-30 21:00:00


In [17]:
flights.query("dep_delay > 60")

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
119,2013,1,1,811.0,630,101.0,1047.0,830,137.0,MQ,4576,N531MQ,LGA,CLT,118.0,544,6,30,2013-01-01 06:00:00
135,2013,1,1,826.0,715,71.0,1136.0,1045,51.0,AA,443,N3GVAA,JFK,MIA,160.0,1089,7,15,2013-01-01 07:00:00
151,2013,1,1,848.0,1835,853.0,1001.0,1950,851.0,MQ,3944,N942MQ,JFK,BWI,41.0,184,18,35,2013-01-01 18:00:00
218,2013,1,1,957.0,733,144.0,1056.0,853,123.0,UA,856,N534UA,EWR,BOS,37.0,200,7,33,2013-01-01 07:00:00
268,2013,1,1,1114.0,900,134.0,1447.0,1222,145.0,UA,1086,N76502,LGA,IAH,248.0,1416,9,0,2013-01-01 09:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336742,2013,9,30,2119.0,2005,74.0,2310.0,2212,58.0,EV,4321,N17984,EWR,MCI,147.0,1092,20,5,2013-09-30 20:00:00
336757,2013,9,30,2159.0,1845,194.0,2344.0,2030,194.0,9E,3320,N906XJ,JFK,BUF,50.0,301,18,45,2013-09-30 18:00:00
336760,2013,9,30,2211.0,2059,72.0,2339.0,2242,57.0,EV,4672,N12145,EWR,STL,120.0,872,20,59,2013-09-30 20:00:00
336762,2013,9,30,2233.0,2113,80.0,112.0,30,42.0,UA,471,N578UA,EWR,SFO,318.0,2565,21,13,2013-09-30 21:00:00


In [18]:
flights.loc[(flights["hour"] >= 0) & (flights["hour"] <= 5)]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
5,2013,1,1,554.0,558,-4.0,740.0,728,12.0,UA,1696,N39463,EWR,ORD,150.0,719,5,58,2013-01-01 05:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335784,2013,9,30,516.0,517,-1.0,734.0,757,-23.0,UA,252,N571UA,EWR,IAH,181.0,1400,5,17,2013-09-30 05:00:00
335785,2013,9,30,540.0,545,-5.0,821.0,855,-34.0,AA,2243,N622AA,JFK,MIA,143.0,1089,5,45,2013-09-30 05:00:00
335786,2013,9,30,540.0,545,-5.0,918.0,933,-15.0,B6,1403,N510JB,JFK,SJU,194.0,1598,5,45,2013-09-30 05:00:00
335787,2013,9,30,549.0,550,-1.0,911.0,932,-21.0,B6,939,N552JB,JFK,BQN,185.0,1576,5,50,2013-09-30 05:00:00


In [19]:
flights.query("hour >= 0 and hour <= 5")

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
5,2013,1,1,554.0,558,-4.0,740.0,728,12.0,UA,1696,N39463,EWR,ORD,150.0,719,5,58,2013-01-01 05:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335784,2013,9,30,516.0,517,-1.0,734.0,757,-23.0,UA,252,N571UA,EWR,IAH,181.0,1400,5,17,2013-09-30 05:00:00
335785,2013,9,30,540.0,545,-5.0,821.0,855,-34.0,AA,2243,N622AA,JFK,MIA,143.0,1089,5,45,2013-09-30 05:00:00
335786,2013,9,30,540.0,545,-5.0,918.0,933,-15.0,B6,1403,N510JB,JFK,SJU,194.0,1598,5,45,2013-09-30 05:00:00
335787,2013,9,30,549.0,550,-1.0,911.0,932,-21.0,B6,939,N552JB,JFK,BQN,185.0,1576,5,50,2013-09-30 05:00:00


In [20]:
flights[["dep_delay", "arr_delay", "dest"]]

Unnamed: 0,dep_delay,arr_delay,dest
0,2.0,11.0,IAH
1,4.0,20.0,IAH
2,2.0,33.0,MIA
3,-1.0,-18.0,BQN
4,-6.0,-25.0,ATL
...,...,...,...
336771,,,DCA
336772,,,SYR
336773,,,BNA
336774,,,CLE


In [21]:
flights.filter(like = "time")

Unnamed: 0,dep_time,sched_dep_time,arr_time,sched_arr_time,air_time,time_hour
0,517.0,515,830.0,819,227.0,2013-01-01 05:00:00
1,533.0,529,850.0,830,227.0,2013-01-01 05:00:00
2,542.0,540,923.0,850,160.0,2013-01-01 05:00:00
3,544.0,545,1004.0,1022,183.0,2013-01-01 05:00:00
4,554.0,600,812.0,837,116.0,2013-01-01 06:00:00
...,...,...,...,...,...,...
336771,,1455,,1634,,2013-09-30 14:00:00
336772,,2200,,2312,,2013-09-30 22:00:00
336773,,1210,,1330,,2013-09-30 12:00:00
336774,,1159,,1344,,2013-09-30 11:00:00


In [22]:
flights[flights.columns[:5]]

Unnamed: 0,year,month,day,dep_time,sched_dep_time
0,2013,1,1,517.0,515
1,2013,1,1,533.0,529
2,2013,1,1,542.0,540
3,2013,1,1,544.0,545
4,2013,1,1,554.0,600
...,...,...,...,...,...
336771,2013,9,30,,1455
336772,2013,9,30,,2200
336773,2013,9,30,,1210
336774,2013,9,30,,1159


In [23]:
flights.iloc[:, flights.columns.get_loc("origin"):flights.columns.get_loc("time_hour")+1]

Unnamed: 0,origin,dest,air_time,distance,hour,minute,time_hour
0,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
3,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
4,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00
...,...,...,...,...,...,...,...
336771,JFK,DCA,,213,14,55,2013-09-30 14:00:00
336772,LGA,SYR,,198,22,0,2013-09-30 22:00:00
336773,LGA,BNA,,764,12,10,2013-09-30 12:00:00
336774,LGA,CLE,,419,11,59,2013-09-30 11:00:00


#### Arrange atau Sort

In [24]:
flights.sort_values(by = ["time_hour", "dest"])

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
15,2013,1,1,559.0,559,0.0,702.0,706,-4.0,B6,1806,N708JB,JFK,BOS,44.0,187,5,59,2013-01-01 05:00:00
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110520,2013,12,31,13.0,2359,14.0,439.0,437,2.0,B6,839,N566JB,JFK,BQN,189.0,1576,23,59,2013-12-31 23:00:00
111279,2013,12,31,2356.0,2359,-3.0,436.0,445,-9.0,B6,745,N665JB,JFK,PSE,200.0,1617,23,59,2013-12-31 23:00:00
110521,2013,12,31,18.0,2359,19.0,449.0,444,5.0,DL,412,N713TW,JFK,SJU,192.0,1598,23,59,2013-12-31 23:00:00
111276,2013,12,31,2328.0,2330,-2.0,412.0,409,3.0,B6,1389,N651JB,EWR,SJU,198.0,1608,23,30,2013-12-31 23:00:00


In [25]:
flights.sort_values(by = "dep_delay", ascending = False)

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
7072,2013,1,9,641.0,900,1301.0,1242.0,1530,1272.0,HA,51,N384HA,JFK,HNL,640.0,4983,9,0,2013-01-09 09:00:00
235778,2013,6,15,1432.0,1935,1137.0,1607.0,2120,1127.0,MQ,3535,N504MQ,JFK,CMH,74.0,483,19,35,2013-06-15 19:00:00
8239,2013,1,10,1121.0,1635,1126.0,1239.0,1810,1109.0,MQ,3695,N517MQ,EWR,ORD,111.0,719,16,35,2013-01-10 16:00:00
327043,2013,9,20,1139.0,1845,1014.0,1457.0,2210,1007.0,AA,177,N338AA,JFK,SFO,354.0,2586,18,45,2013-09-20 18:00:00
270376,2013,7,22,845.0,1600,1005.0,1044.0,1815,989.0,MQ,3075,N665MQ,JFK,CVG,96.0,589,16,0,2013-07-22 16:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336771,2013,9,30,,1455,,,1634,,9E,3393,,JFK,DCA,,213,14,55,2013-09-30 14:00:00
336772,2013,9,30,,2200,,,2312,,9E,3525,,LGA,SYR,,198,22,0,2013-09-30 22:00:00
336773,2013,9,30,,1210,,,1330,,MQ,3461,N535MQ,LGA,BNA,,764,12,10,2013-09-30 12:00:00
336774,2013,9,30,,1159,,,1344,,MQ,3572,N511MQ,LGA,CLE,,419,11,59,2013-09-30 11:00:00


#### Add New Column atau Mutate

In [26]:
flights.assign(dep_delay_hour = flights["dep_delay"] / 60)

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour,dep_delay_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00,0.033333
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00,0.066667
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00,0.033333
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00,-0.016667
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00,-0.100000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336771,2013,9,30,,1455,,,1634,,9E,3393,,JFK,DCA,,213,14,55,2013-09-30 14:00:00,
336772,2013,9,30,,2200,,,2312,,9E,3525,,LGA,SYR,,198,22,0,2013-09-30 22:00:00,
336773,2013,9,30,,1210,,,1330,,MQ,3461,N535MQ,LGA,BNA,,764,12,10,2013-09-30 12:00:00,
336774,2013,9,30,,1159,,,1344,,MQ,3572,N511MQ,LGA,CLE,,419,11,59,2013-09-30 11:00:00,


#### Group

In [27]:
flights.groupby("month")[["year"]].count().reset_index().rename(columns={"year": "count"})

Unnamed: 0,month,count
0,1,27004
1,2,24951
2,3,28834
3,4,28330
4,5,28796
5,6,28243
6,7,29425
7,8,29327
8,9,27574
9,10,28889


In [28]:
flights.groupby("month").count().reset_index()[["month", "year"]].rename(columns={"year": "count"})

Unnamed: 0,month,count
0,1,27004
1,2,24951
2,3,28834
3,4,28330
4,5,28796
5,6,28243
6,7,29425
7,8,29327
8,9,27574
9,10,28889


In [29]:
flights.groupby("dest")["year"].count().reset_index().rename(columns={"dest": "count"})

Unnamed: 0,count,year
0,ABQ,254
1,ACK,265
2,ALB,439
3,ANC,8
4,ATL,17215
...,...,...
100,TPA,7466
101,TUL,315
102,TVC,101
103,TYS,631


In [30]:
flights.groupby("dest").count().reset_index()[["dest", "year"]].rename(columns={"year": "count"})

Unnamed: 0,dest,count
0,ABQ,254
1,ACK,265
2,ALB,439
3,ANC,8
4,ATL,17215
...,...,...
100,TPA,7466
101,TUL,315
102,TVC,101
103,TYS,631


In [31]:
flights.groupby("month")["dep_delay"].sum().reset_index().rename(columns={"dep_delay": "total_dep_delay"})

Unnamed: 0,month,total_dep_delay
0,1,265801.0
1,2,256251.0
2,3,370001.0
3,4,385554.0
4,5,366658.0
5,6,567729.0
6,7,618916.0
7,8,363715.0
8,9,182327.0
9,10,178909.0


In [32]:
flights.groupby("month").sum("dep_delay").reset_index()[["month", "dep_delay"]].rename(columns={"dep_delay": "total_dep_delay"})

Unnamed: 0,month,total_dep_delay
0,1,265801.0
1,2,256251.0
2,3,370001.0
3,4,385554.0
4,5,366658.0
5,6,567729.0
6,7,618916.0
7,8,363715.0
8,9,182327.0
9,10,178909.0


In [33]:
flights.groupby("month").agg({"year": "count", "dep_delay": "sum"}).reset_index()\
    .rename(columns={"year":"count","dep_delay": "total_dep_delay"})

Unnamed: 0,month,count,total_dep_delay
0,1,27004,265801.0
1,2,24951,256251.0
2,3,28834,370001.0
3,4,28330,385554.0
4,5,28796,366658.0
5,6,28243,567729.0
6,7,29425,618916.0
7,8,29327,363715.0
8,9,27574,182327.0
9,10,28889,178909.0


In [34]:
most_5_del = flights.copy()
most_5_del = most_5_del.groupby(["month", "carrier"]).agg({"dep_delay": "sum"}).reset_index()
most_5_del["overall_rank"] = most_5_del["dep_delay"].rank(method="dense", ascending=False)
most_5_del["group_rank"] = most_5_del.groupby("month")["dep_delay"].rank(method="dense", ascending=False)
most_5_del.loc[most_5_del["group_rank"] <= 5].sort_values(by=["month", "group_rank"])

Unnamed: 0,month,carrier,dep_delay,overall_rank,group_rank
5,1,EV,96649.0,9.0,1.0
3,1,B6,41942.0,33.0,2.0
11,1,UA,38342.0,38.0,3.0
0,1,9E,25290.0,59.0,4.0
1,1,AA,18960.0,70.0,5.0
21,2,EV,76580.0,17.0,1.0
19,2,B6,54403.0,26.0,2.0
26,2,UA,32125.0,44.0,3.0
16,2,9E,22306.0,63.0,4.0
17,2,AA,19906.0,68.0,5.0
