In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Load seaborn's 'flights' example dataset (columns: year, month, passengers)
flights = sns.load_dataset('flights')

### loc and iloc - Precise Selection

In [3]:
# Boolean-mask filter: keep only the rows with more than 500 passengers
flights.loc[flights['passengers'] > 500]

Unnamed: 0,year,month,passengers
115,1958,Aug,505
126,1959,Jul,548
127,1959,Aug,559
137,1960,Jun,535
138,1960,Jul,622
139,1960,Aug,606
140,1960,Sep,508


- select row 5?
- select rows 10-207?
- specific rows and columns at the same time?
- change a specific value?

In [4]:
# iloc - position based (by number); example: df.iloc[0, 2] --- returns the value in the first row, third column
# loc - label based (by name); example: df.loc[0, 'name'] --- the first row of the column 'name'

In [None]:
# ILOC

In [5]:
flights.iloc[0] # the first row (position 0), returned as a Series

year          1949
month          Jan
passengers     112
Name: 0, dtype: object

In [6]:
# single value at row position 0, column position 2 (the 'passengers' column)
flights.iloc[0,2]

np.int64(112)

In [7]:
# first five rows (positions 0-4); the end of an iloc slice is exclusive
flights.iloc[:5]

Unnamed: 0,year,month,passengers
0,1949,Jan,112
1,1949,Feb,118
2,1949,Mar,132
3,1949,Apr,129
4,1949,May,121


In [10]:
flights.iloc[-5:] # the last five rows (negative positions count from the end)

Unnamed: 0,year,month,passengers
139,1960,Aug,606
140,1960,Sep,508
141,1960,Oct,461
142,1960,Nov,390
143,1960,Dec,432


In [11]:
# first five rows, first two columns (year and month)
flights.iloc[:5, :2]

Unnamed: 0,year,month
0,1949,Jan
1,1949,Feb
2,1949,Mar
3,1949,Apr
4,1949,May


In [13]:
# slice with a step of 5: rows at positions 0, 5 and 10
flights.iloc[:11:5]

Unnamed: 0,year,month,passengers
0,1949,Jan,112
5,1949,Jun,135
10,1949,Nov,104


In [15]:
# pick specific, non-contiguous rows by position: 2, 11 and 101
wanted_rows = [2, 11, 101]
flights.iloc[wanted_rows]

Unnamed: 0,year,month,passengers
2,1949,Mar,132
11,1949,Dec,118
101,1957,Jun,422


In [16]:
# select rows AND columns by position in one call:
# rows 2, 11, 101 and columns 1, 2 (month and passengers)
wanted_rows = [2, 11, 101]
wanted_columns = [1, 2]

flights.iloc[wanted_rows, wanted_columns]

Unnamed: 0,month,passengers
2,Mar,132
11,Dec,118
101,Jun,422


In [17]:
# every row of the column at position 2 ('passengers'), returned as a Series
flights.iloc[:, 2]

0      112
1      118
2      132
3      129
4      121
      ... 
139    606
140    508
141    461
142    390
143    432
Name: passengers, Length: 144, dtype: int64

In [18]:
# LOC

In [19]:
# syntax: df.loc[rows, columns]
# uses labels (index and column names)

In [20]:
# the row whose index label is 0, returned as a Series
flights.loc[0]

year          1949
month          Jan
passengers     112
Name: 0, dtype: object

In [21]:
# single value: index label 0, column 'passengers'
flights.loc[0, 'passengers']

np.int64(112)

In [22]:
# get the rows labelled 0 through 5
# for the passengers and year columns
# NOTE: unlike iloc, a loc slice INCLUDES its end label, so `:5` returns
# six rows (0-5); with this default index, `:4` would give exactly the first five

wanted_columns = ['passengers', 'year']

flights.loc[:5, wanted_columns]

Unnamed: 0,passengers,year
0,112,1949
1,118,1949
2,132,1949
3,129,1949
4,121,1949
5,135,1949


In [23]:
# all rows, only the 'passengers' and 'year' columns (in the order listed)
flights.loc[:, ['passengers', 'year']]

Unnamed: 0,passengers,year
0,112,1949
1,118,1949
2,132,1949
3,129,1949
4,121,1949
...,...,...
139,606,1960
140,508,1960
141,461,1960
142,390,1960


#### Combining Boolean masks with loc

loc supports using Boolean Mask for filtering

**NOTE** `iloc` does NOT accept a Boolean Series as a mask (it raises an error); convert the mask to a plain Boolean array first, e.g. `df.iloc[mask.to_numpy()]`

In [25]:
# Boolean Series: True for the rows with more than 500 passengers
high_traffic_mask = flights['passengers'].gt(500)

In [27]:
# get the years with the high traffic while using loc:
# the mask picks the rows, 'year' picks the column, unique() removes repeats

flights.loc[high_traffic_mask, 'year'].unique()

array([1958, 1959, 1960])

In [131]:
# Combine two Boolean masks with & (element-wise AND):
# more than 400 passengers AND a year later than 1955
medium_high_traffic_mask = flights['passengers'].gt(400)
after_1955_flights_mask = flights['year'].gt(1955)
wanted_columns = ['year', 'month']

flights.loc[medium_high_traffic_mask & after_1955_flights_mask, wanted_columns]

Unnamed: 0,year,month
90,1956,Jul
91,1956,Aug
101,1957,Jun
102,1957,Jul
103,1957,Aug
104,1957,Sep
113,1958,Jun
114,1958,Jul
115,1958,Aug
116,1958,Sep


In [34]:
# Summer rows only ('Jun', 'Jul', 'Aug'), restricted to the month and
# passengers columns; display just the first 10 matches
wanted_columns = ['month', 'passengers']
summer_months = ['Jun', 'Jul', 'Aug']
summer_mask = flights['month'].isin(summer_months)

flights.loc[summer_mask, wanted_columns].iloc[:10]

Unnamed: 0,month,passengers
5,Jun,135
6,Jul,148
7,Aug,148
17,Jun,149
18,Jul,170
19,Aug,170
29,Jun,178
30,Jul,199
31,Aug,199
41,Jun,218


#### Setting Values with iloc and loc

In [61]:
flights_copy = flights.copy() # creates a copy, so the edits below are made on flights_copy rather than on flights

In [68]:
# overwrite a single cell: index label 0, column 'passengers'
flights_copy.loc[0, 'passengers'] = 999

In [69]:
# preview the first five rows to confirm the edited value
# NOTE(review): the saved output already shows a 'traffic' column that is only
# created in a later cell - the cells were run out of order; re-run top to bottom
flights_copy.head()

Unnamed: 0,year,month,passengers,traffic
0,1949,Jan,999,Low
1,1949,Feb,118,Low
2,1949,Mar,132,Low
3,1949,Apr,129,Low
4,1949,May,121,Low


In [73]:
# Category 'traffic' by monthly passengers:
#   Low: below 200, Medium: 200-400 (both ends included), High: above 400
low_traffic_mask = flights_copy['passengers'].lt(200)
medium_traffic_mask = flights_copy['passengers'].between(200, 400)
high_traffic_mask = flights_copy['passengers'].gt(400)

In [74]:
# Label each row through .loc with a Boolean mask; the first assignment
# creates the 'traffic' column if it does not exist yet
flights_copy.loc[low_traffic_mask, 'traffic'] = 'Low'
flights_copy.loc[medium_traffic_mask, 'traffic'] = 'Medium'
flights_copy.loc[high_traffic_mask, 'traffic'] = 'High'

In [75]:
# display the frame with the new 'traffic' column
flights_copy

Unnamed: 0,year,month,passengers,traffic
0,1949,Jan,999,High
1,1949,Feb,118,Low
2,1949,Mar,132,Low
3,1949,Apr,129,Low
4,1949,May,121,Low
...,...,...,...,...
139,1960,Aug,606,High
140,1960,Sep,508,High
141,1960,Oct,461,High
142,1960,Nov,390,Medium


In [77]:
# get rows 50 - 59 with iloc (the slice end, 60, is excluded)

flights.iloc[50:60]

Unnamed: 0,year,month,passengers
50,1953,Mar,236
51,1953,Apr,235
52,1953,May,229
53,1953,Jun,243
54,1953,Jul,264
55,1953,Aug,272
56,1953,Sep,237
57,1953,Oct,211
58,1953,Nov,180
59,1953,Dec,201


In [78]:
# get the last 10 rows with iloc (a negative start counts from the end)

flights.iloc[-10:]

Unnamed: 0,year,month,passengers
134,1960,Mar,419
135,1960,Apr,461
136,1960,May,472
137,1960,Jun,535
138,1960,Jul,622
139,1960,Aug,606
140,1960,Sep,508
141,1960,Oct,461
142,1960,Nov,390
143,1960,Dec,432


In [79]:
# Load seaborn's 'titanic' example dataset (used below for the missing-data section)
titanic = sns.load_dataset('titanic')

In [81]:
# preview the first five rows
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### Handling Missing Data

In [82]:
# count the missing (NaN) values in each column
titanic.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [83]:
# 177 missing in 'age' (20% of data)
# embarked and embark_town missing 2 each
# deck missing 688 (~77% of data)

In [84]:
# Strategy 1 - Drop missing values (drop incomplete rows)
## valid when only around 5% of the rows are incomplete (95% complete) - not the case for Titanic

`.dropna()` - drops every row that contains at least one NaN/missing value

In [129]:
# Most frequent value (mode) of each embarkation column.
# .mode() returns a Series, so take its first entry.
most_frequent_embarked_name = titanic['embarked'].mode().iloc[0]
most_frequent_embarked_town_name = titanic['embark_town'].mode().iloc[0]

In [125]:
# most common 'embarked' code
most_frequent_embarked_name

'S'

In [126]:
# most common 'embark_town' value
most_frequent_embarked_town_name

'Southampton'

In [127]:
# use .fillna to populate the missing 'embarked' and 'embark_town' values
# with the most frequent value of each column

titanic['embarked'] = titanic['embarked'].fillna(most_frequent_embarked_name)
titanic['embark_town'] = titanic['embark_town'].fillna(most_frequent_embarked_town_name)

In [87]:
# drop every row that has a missing value in ANY column
titanic_clean = titanic.dropna()

In [128]:
# number of rows in the full dataset
len(titanic)

891

In [90]:
# number of rows left after dropna() - only 182 of the 891 rows are fully complete
len(titanic_clean)

182

In [91]:
# Not valid for the titanic since we have much more than 5% of missing values

In [92]:
# Drop only the rows that are missing a value in one of the listed columns;
# missing values in other columns do not cause a row to be dropped
problematic_columns = ['embarked', 'embark_town']

titanic_new = titanic.dropna(subset = problematic_columns)

In [93]:
# display the result; rows with a missing 'age' or 'deck' are still present
titanic_new

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [94]:
# Strategy 2 - Fill missing values

# when: can't lose data, can make educated guesses

In [100]:
# median of the known ages, used below as the fill value for the missing ones
median_age = titanic['age'].median()

`.fillna()` - fills the NaN values with the specified value (the median age in this case)

In [102]:
# new Series with the missing ages replaced by the median
age_complete = titanic['age'].fillna(median_age)

In [103]:
# sanity check: no missing ages should remain
age_complete.isna().sum()

np.int64(0)

In [104]:
# write the filled ages back into the DataFrame
titanic['age'] = age_complete

In [105]:
# re-count the missing values per column after filling 'age'
# NOTE(review): the saved output still lists 2 missing 'embarked'/'embark_town'
# values although an earlier cell fills them - the cells were run out of order
titanic.isna().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [113]:
# Strategy 3 - Drop columns with too many missing values

In [107]:
# how many passengers have each known deck value (missing values are not counted)
titanic['deck'].value_counts()

deck
C    59
B    47
D    33
E    32
A    15
F    13
G     4
Name: count, dtype: int64

In [111]:
# Survival counts for passengers on deck C or G.
# NOTE: despite its name, the mask matches both 'C' and 'G'.
deck_c_mask = titanic['deck'].isin(['C', 'G'])
titanic.loc[deck_c_mask, 'survived'].value_counts()

survived
1    37
0    26
Name: count, dtype: int64

In [112]:
# Survival counts for passengers whose deck is unknown (NaN)
deck_na_mask = titanic['deck'].isna()
titanic.loc[deck_na_mask, 'survived'].value_counts()

survived
0    482
1    206
Name: count, dtype: int64

`.drop(columns=[...])` - removes the listed columns

In [117]:
# Drop the 'deck' column: most of its values are missing (688 of 891 rows).
# errors='ignore' keeps this cell safe to re-run: `titanic` is reassigned here,
# so a second run would otherwise raise KeyError because 'deck' is already gone.
columns_to_drop = ['deck']
titanic = titanic.drop(columns=columns_to_drop, errors='ignore')

In [116]:
# confirm that 'deck' is gone and see which columns still have missing values
titanic.isna().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       2
class          0
who            0
adult_male     0
embark_town    2
alive          0
alone          0
dtype: int64