In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets

In [12]:
# Read the flights table from a CSV path relative to the working directory.
flights_csv = 'datasets/flights.csv'
df_flights = pd.read_csv(flights_csv)

In [13]:
# Preview the first five rows (five is also the default for head()).
df_flights.head(5)

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
0,2014,1,1,1.0,96.0,235.0,70.0,AS,N508AS,145,PDX,ANC,194.0,1542,0.0,1.0
1,2014,1,1,4.0,-6.0,738.0,-23.0,US,N195UW,1830,SEA,CLT,252.0,2279,0.0,4.0
2,2014,1,1,8.0,13.0,548.0,-4.0,UA,N37422,1609,PDX,IAH,201.0,1825,0.0,8.0
3,2014,1,1,28.0,-2.0,800.0,-23.0,US,N547UW,466,PDX,CLT,251.0,2282,0.0,28.0
4,2014,1,1,34.0,44.0,325.0,43.0,AS,N762AS,121,SEA,ANC,201.0,1448,0.0,34.0


In [30]:
# Dimensions of the loaded table as a (rows, columns) tuple.
df_flights.shape

(162049, 16)

In [23]:
# Distinct carrier codes present in the table.
df_flights.loc[:, 'carrier'].unique()

array(['AS', 'US', 'UA', 'DL', 'AA', 'F9', 'VX', 'OO', 'WN', 'B6', 'HA'],
      dtype=object)

In [24]:
# Distinct origin airports present in the table.
df_flights.loc[:, 'origin'].unique()

array(['PDX', 'SEA'], dtype=object)

### Bitwise operators

In [14]:
# `&` applied to two Python bools acts as a logical AND.
True & False

False

In [15]:
# `|` applied to two Python bools acts as a logical OR.
True | False

True

In [22]:
# On integers, `&` and `|` work bit by bit: show both operands, then the
# AND and the OR of them, as binary strings.
for value in (10, 15, 10 & 15, 10 | 15):
    print(np.binary_repr(value))

1010
1111
1010
1111


#### Element-wise bitwise operations using NumPy

In [16]:
# Element-wise AND of two boolean arrays; np.bitwise_and is the ufunc
# behind the `&` operator on ndarrays.
np.bitwise_and(
    np.array([True, True, False]),
    np.array([True, False, False]),
)

array([ True, False, False])

In [17]:
# Element-wise OR of two boolean arrays; np.bitwise_or is the ufunc
# behind the `|` operator on ndarrays.
np.bitwise_or(
    np.array([True, True, False]),
    np.array([True, False, False]),
)

array([ True,  True, False])

## Performing selection by multiple conditions

In [32]:
# Build the two row-level conditions once so every variant below combines
# exactly the same inputs.
carrier_is_aa = df_flights['carrier'] == "AA"
origin_is_sea = df_flights['origin'] == "SEA"

# Three equivalent ways to AND the conditions element-wise. Each gets its own
# name so no result is silently overwritten before it can be inspected.
# 1. The `&` operator. When the comparisons are written inline they must be
#    parenthesised, because `&` binds tighter than `==`.
mask_operator = carrier_is_aa & origin_is_sea
# 2. The NumPy ufunc applied to the two Series.
mask_logical_and = np.logical_and(carrier_is_aa, origin_is_sea)
# 3. Stack the conditions and reduce down axis 0 (across conditions, per row);
#    this variant yields a plain ndarray rather than a Series.
mask_all = np.all([carrier_is_aa, origin_is_sea], axis=0)

# Confirm that the three forms select exactly the same rows.
assert np.array_equal(mask_operator, mask_logical_and)
assert np.array_equal(mask_operator, mask_all)

# Use the index-aligned Series form for the cells that follow.
selection_criteria = mask_operator

In [33]:
# Inspect the boolean mask: one True/False entry per row of df_flights.
selection_criteria

0         False
1         False
2         False
3         False
4         False
          ...  
162044    False
162045    False
162046    False
162047    False
162048    False
Length: 162049, dtype: bool

In [34]:
# Keep only the rows where the mask is True (boolean-mask indexing).
df_flights[selection_criteria]

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
13,2014,1,1,557.0,-3.0,1134.0,-16.0,AA,N3JLAA,1094,SEA,DFW,184.0,1660,5.0,57.0
17,2014,1,1,600.0,0.0,1151.0,-19.0,AA,N3JFAA,2240,SEA,ORD,206.0,1721,6.0,0.0
58,2014,1,1,704.0,-6.0,1245.0,-20.0,AA,N3KNAA,1308,SEA,DFW,188.0,1660,7.0,4.0
62,2014,1,1,708.0,-7.0,1510.0,-19.0,AA,N3DNAA,236,SEA,JFK,281.0,2422,7.0,8.0
86,2014,1,1,810.0,-10.0,1419.0,-11.0,AA,N3DSAA,1236,SEA,ORD,202.0,1721,8.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161892,2014,9,30,1422.0,12.0,2002.0,2.0,AA,N3LCAA,2318,SEA,DFW,193.0,1660,14.0,22.0
161919,2014,9,30,1534.0,69.0,2151.0,91.0,AA,N3JXAA,1054,SEA,ORD,205.0,1721,15.0,34.0
161926,2014,9,30,1542.0,-3.0,2120.0,-10.0,AA,N3CGAA,2361,SEA,DFW,194.0,1660,15.0,42.0
161939,2014,9,30,1629.0,14.0,2244.0,44.0,AA,N3KUAA,1209,SEA,ORD,203.0,1721,16.0,29.0


#### Wrong way: combining Series with `and` instead of `&`

In [36]:
# Python's `and` asks its left operand for a single truth value, which a
# multi-element Series cannot give, so this expression raises ValueError.
# The error is the point of the demonstration: catch and print it so the
# cell does not abort a Restart & Run All before the later cells execute.
try:
    (df_flights['carrier'] == "AA") and (df_flights['origin'] == "SEA")
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

#### Using `numpy.all` / `numpy.any` and the effect of the `axis` argument

In [46]:
# Stacked, the two conditions form a (conditions x rows) array. axis=1
# reduces along the rows, giving one value per condition: True only if that
# condition holds for every flight.
conditions = [df_flights['carrier'] == "AA", df_flights['origin'] == "SEA"]
np.all(conditions, axis=1)

array([False, False])

In [47]:
# With no axis the reduction covers every element of every condition and
# collapses to a single boolean.
conditions = [df_flights['carrier'] == "AA", df_flights['origin'] == "SEA"]
np.all(conditions)

False