In [2]:
import pandas as pd
import seaborn as sns

In [5]:
# List the names of the example datasets seaborn can load by name.
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [3]:
# Load seaborn's 'flights' example dataset; every later cell works from this frame.
flights = sns.load_dataset('flights')

In [6]:
# Confirm what load_dataset returned (a pandas DataFrame).
type(flights)

pandas.core.frame.DataFrame

### Inspect Step

.columns

.head()

.tail()

.info() # Data types of the columns, shows any missing values

.describe() # Summary statistics

In [8]:
# Column labels of the frame.
flights.columns

Index(['year', 'month', 'passengers'], dtype='object')

In [10]:
# Preview the first five rows.
flights.head()

Unnamed: 0,year,month,passengers
0,1949,Jan,112
1,1949,Feb,118
2,1949,Mar,132
3,1949,Apr,129
4,1949,May,121


In [11]:
# Preview the last five rows.
flights.tail()

Unnamed: 0,year,month,passengers
139,1960,Aug,606
140,1960,Sep,508
141,1960,Oct,461
142,1960,Nov,390
143,1960,Dec,432


In [12]:
# Per-column dtype and non-null count; a non-null count below the row count would reveal missing values.
flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   year        144 non-null    int64   
 1   month       144 non-null    category
 2   passengers  144 non-null    int64   
dtypes: category(1), int64(2)
memory usage: 2.9 KB


In [14]:
# Summary statistics for the numeric columns (the categorical 'month' column is not included).
flights.describe()

Unnamed: 0,year,passengers
count,144.0,144.0
mean,1954.5,280.298611
std,3.464102,119.966317
min,1949.0,104.0
25%,1951.75,180.0
50%,1954.5,265.5
75%,1957.25,360.5
max,1960.0,622.0


### Select and Analyze Specific Columns

In [16]:
# Pull the single 'passengers' column out as a Series.
passengers = flights.loc[:, 'passengers']

In [17]:
# Arithmetic mean of the passengers column.
passengers.mean()

np.float64(280.2986111111111)

In [18]:
# Median (50th percentile) of the passengers column.
passengers.median()

np.float64(265.5)

In [19]:
# Smallest value in the passengers column.
passengers.min()

np.int64(104)

In [20]:
# Largest value in the passengers column.
passengers.max()

np.int64(622)

In [21]:
# Total of the passengers column over all rows.
passengers.sum()

np.int64(40363)

In [22]:
# Number of non-null entries in the passengers column.
passengers.count()

np.int64(144)

In [23]:
# Standard deviation of the passengers column (pandas uses the sample form, ddof=1, by default).
passengers.std()

np.float64(119.9663169429432)

In [26]:
# A list of labels selects several columns at once and returns a DataFrame.
year_passengers = flights.loc[:, ['year', 'passengers']]

In [27]:
# Check that only the two selected columns are present.
year_passengers.head()

Unnamed: 0,year,passengers
0,1949,112
1,1949,118
2,1949,132
3,1949,129
4,1949,121


### Basic Filtering

In [29]:
# Keep only the rows with more than 400 passengers.
high_traffic = flights.loc[flights['passengers'] > 400]

In [36]:
# All-True boolean mask, one entry per row of flights.
# Built from flights.index so it stays aligned with the frame for any row
# count or index, instead of assuming exactly 144 rows labelled 0..143.
mask = pd.Series(True, index=flights.index)  # creating the mask

In [None]:
# Comparing a column with a scalar gives a boolean Series, one True/False per row.
# This is the same expression that is used as the row filter when building high_traffic.
flights['passengers'] > 400 # Boolean Mask

In [37]:
# Boolean mask: True where the passenger count exceeds 500.
passengers_mask = flights['passengers'].gt(500)

In [32]:
# Row(s) of high_traffic whose passenger count equals the column maximum.
high_traffic.loc[
    high_traffic['passengers'].eq(high_traffic['passengers'].max())
]

Unnamed: 0,year,month,passengers
138,1960,Jul,622


In [34]:
# Number of high-traffic rows per month. value_counts() orders by count;
# sort_index() re-orders the result by month label (Jan..Dec here) instead.
high_traffic['month'].value_counts().sort_index() # to sort by index

month
Jan    1
Feb    0
Mar    2
Apr    1
May    2
Jun    4
Jul    5
Aug    5
Sep    4
Oct    2
Nov    0
Dec    2
Name: count, dtype: int64

In [42]:
# Total passengers per month, summed over all years.
# 'month' is a categorical column, so `observed` is passed explicitly: this
# keeps the current result (every category is listed) and avoids the
# FutureWarning pandas emits when a categorical grouper relies on the default.
flights.groupby('month', observed=False)['passengers'].sum()

  flights.groupby('month')['passengers'].sum()


month
Jan    2901
Feb    2820
Mar    3242
Apr    3205
May    3262
Jun    3740
Jul    4216
Aug    4213
Sep    3629
Oct    3199
Nov    2794
Dec    3142
Name: passengers, dtype: int64

In [48]:
# Total passengers per (year, month) pair; the result carries a two-level index.
# 'month' is a categorical column, so `observed` is passed explicitly: this
# keeps the current result and avoids the FutureWarning pandas emits when a
# categorical grouper relies on the default.
flights.groupby(['year', 'month'], observed=False)['passengers'].sum()

  flights.groupby(['year','month'])['passengers'].sum()


year  month
1949  Jan      112
      Feb      118
      Mar      132
      Apr      129
      May      121
              ... 
1960  Aug      606
      Sep      508
      Oct      461
      Nov      390
      Dec      432
Name: passengers, Length: 144, dtype: int64

### Advanced Filtering

In [49]:
# Boolean Series: True for rows whose year is 1955 or later.
flights['year'] >= 1955

0      False
1      False
2      False
3      False
4      False
       ...  
139     True
140     True
141     True
142     True
143     True
Name: year, Length: 144, dtype: bool

In [50]:
# Boolean Series: True for rows whose month is 'Jul'.
flights['month'] == 'Jul'

0      False
1      False
2      False
3      False
4      False
       ...  
139    False
140    False
141    False
142    False
143    False
Name: month, Length: 144, dtype: bool

In [54]:
# Combine two conditions with & (each wrapped in parentheses) into ONE boolean mask.
# This must stay a mask rather than flights[...]: the following cell indexes
# flights with it, which fails if the variable holds an already-filtered DataFrame.
fifties_july_mask = (flights['year'] >= 1955) & (flights['month'] == 'Jul')  # we can combine

In [53]:
# Apply the mask: July rows from 1955 onward. Requires fifties_july_mask to be a boolean Series.
flights[fifties_july_mask] 

Unnamed: 0,year,month,passengers
78,1955,Jul,364
90,1956,Jul,413
102,1957,Jul,465
114,1958,Jul,491
126,1959,Jul,548
138,1960,Jul,622


In [56]:
# True where the passenger count is above 500.
high_traffic_mask = flights['passengers'].gt(500)

In [57]:
# Summer = June, July or August. | is the element-wise "or" for boolean Series.
summer_mask = (
    (flights['month'] == 'Jun')
    | (flights['month'] == 'Jul')
    | (flights['month'] == 'Aug')
)

In [58]:
# More than 500 passengers AND a summer month (Jun/Jul/Aug).
# The inner parentheses make the three "or" terms bind before the "and".
high_traffic_summer_mask = (
    (flights['passengers'] > 500)
    & (
        (flights['month'] == 'Jun')
        | (flights['month'] == 'Jul')
        | (flights['month'] == 'Aug')
    )
)

In [60]:
# Rows that satisfy the combined high-traffic / summer mask.
flights.loc[high_traffic_summer_mask]

Unnamed: 0,year,month,passengers
115,1958,Aug,505
126,1959,Jul,548
127,1959,Aug,559
137,1960,Jun,535
138,1960,Jul,622
139,1960,Aug,606


#### Better way: using `.isin()` for multiple values

In [71]:
# Month labels treated as summer; passed to .isin() in place of a chain of == comparisons.
summer_month = ['Jun', 'Jul', 'Aug']

In [74]:
# Summer month (membership test via .isin) AND more than 500 passengers.
is_summer_month = flights['month'].isin(summer_month)
summer_month_mask = is_summer_month & flights['passengers'].gt(500)

In [75]:
# Same rows as the chained-== version, selected with the .isin()-based mask.
flights.loc[summer_month_mask]

Unnamed: 0,year,month,passengers
115,1958,Aug,505
126,1959,Jul,548
127,1959,Aug,559
137,1960,Jun,535
138,1960,Jul,622
139,1960,Aug,606


#### Using .between() for Ranges

In [76]:
# Equivalent to (flights['passengers'] >= 300) & (flights['passengers'] <= 400):
# Series.between() includes BOTH endpoints by default (pass inclusive='neither' for strict bounds).
moderate_traffic_mask = flights['passengers'].between(300,400)

In [77]:
# Rows whose passenger count lies in the 300-400 range.
flights[moderate_traffic_mask]

Unnamed: 0,year,month,passengers
66,1954,Jul,302
77,1955,Jun,315
78,1955,Jul,364
79,1955,Aug,347
80,1955,Sep,312
86,1956,Mar,317
87,1956,Apr,313
88,1956,May,318
89,1956,Jun,374
92,1956,Sep,355


In [79]:
# Moderate traffic (300-400 passengers, endpoints included) restricted to the listed years.
years = [1955, 1956, 1957]
fifties_moderate_year = (
    flights['year'].isin(years)
    & flights['passengers'].between(300, 400)
)

In [80]:
# Rows matching both the passenger range and the year list.
flights[fifties_moderate_year]

Unnamed: 0,year,month,passengers
77,1955,Jun,315
78,1955,Jul,364
79,1955,Aug,347
80,1955,Sep,312
86,1956,Mar,317
87,1956,Apr,313
88,1956,May,318
89,1956,Jun,374
92,1956,Sep,355
93,1956,Oct,306


### Inverting Conditions With `~`

In [82]:
summer_month = ['Jun', 'Jul', 'Aug']
# ~ flips every True/False in the mask, so this marks the months that are NOT in the list.
non_summer_mask = ~(flights['month'].isin(summer_month))

In [83]:
# All rows outside June, July and August.
flights[non_summer_mask]

Unnamed: 0,year,month,passengers
0,1949,Jan,112
1,1949,Feb,118
2,1949,Mar,132
3,1949,Apr,129
4,1949,May,121
...,...,...,...
136,1960,May,472
140,1960,Sep,508
141,1960,Oct,461
142,1960,Nov,390


In [87]:
# Range masks; inclusive='both' spells out .between()'s default of keeping both endpoints.
# Note: passengers_mask is rebound here (an earlier cell set it to "passengers > 500").
passengers_mask = flights['passengers'].between(100, 200, inclusive='both')
year_mask = flights['year'].between(1950, 1960, inclusive='both')

In [88]:
# Both conditions must hold; plain names need no parentheses around them.
passengers_year_mask = passengers_mask & year_mask

In [89]:
# Number of rows that satisfy the combined mask.
flights.loc[passengers_year_mask].shape[0]

36

In [93]:
# Two independent masks: December/January rows, and rows strictly after 1955.
month_list = ['Dec', 'Jan']

jan_dec_mask = flights['month'].isin(month_list)
after_1955_mask = flights['year'].gt(1955)

In [94]:
# December and January rows from 1956 onward.
flights.loc[after_1955_mask & jan_dec_mask]

Unnamed: 0,year,month,passengers
84,1956,Jan,284
95,1956,Dec,306
96,1957,Jan,315
107,1957,Dec,336
108,1958,Jan,340
119,1958,Dec,337
120,1959,Jan,360
131,1959,Dec,405
132,1960,Jan,417
143,1960,Dec,432


In [98]:
# True where the passenger count is above 600.
# NOTE: this rebinds high_traffic_mask, which an earlier cell defined with a 500 threshold;
# re-running earlier cells afterwards will change what the name refers to.
high_traffic_mask = flights['passengers'] > 600

In [99]:
# Distinct years containing at least one row that passes high_traffic_mask
# (row filter and column pick done in a single .loc call).
flights.loc[high_traffic_mask, 'year'].unique()

array([1960])

### Sorting
#### Top/bottom performers identification
##### Trends over time
###### Identifying Outliers

In [100]:
# Order rows from the fewest to the most passengers; sort_values returns a new frame.
flights_sorted = flights.sort_values(by='passengers', ascending=True)

In [101]:
# Show the sorted frame; the original row labels are kept, so the index is no longer in order.
flights_sorted

Unnamed: 0,year,month,passengers
10,1949,Nov,104
0,1949,Jan,112
22,1950,Nov,114
12,1950,Jan,115
11,1949,Dec,118
...,...,...,...
137,1960,Jun,535
126,1959,Jul,548
127,1959,Aug,559
139,1960,Aug,606


In [105]:
# Multi-key sort: newest year first, and within each year fewest passengers first.
# The ascending list lines up position-by-position with the `by` list.
flights.sort_values(
    by=['year', 'passengers'],
    ascending=[False, True],
)

Unnamed: 0,year,month,passengers
142,1960,Nov,390
133,1960,Feb,391
132,1960,Jan,417
134,1960,Mar,419
143,1960,Dec,432
...,...,...,...
2,1949,Mar,132
5,1949,Jun,135
8,1949,Sep,136
6,1949,Jul,148


### nlargest, nsmallest

In [108]:
# The ten rows with the highest passenger counts, largest first.
flights.nlargest(n=10, columns='passengers')

Unnamed: 0,year,month,passengers
138,1960,Jul,622
139,1960,Aug,606
127,1959,Aug,559
126,1959,Jul,548
137,1960,Jun,535
140,1960,Sep,508
115,1958,Aug,505
114,1958,Jul,491
125,1959,Jun,472
136,1960,May,472


In [109]:
# The ten rows with the lowest passenger counts, smallest first.
flights.nsmallest(n=10, columns='passengers')

Unnamed: 0,year,month,passengers
10,1949,Nov,104
0,1949,Jan,112
22,1950,Nov,114
12,1950,Jan,115
1,1949,Feb,118
11,1949,Dec,118
9,1949,Oct,119
4,1949,May,121
16,1950,May,125
13,1950,Feb,126
