### **Python pandas library - storing and manipulating data in "dataframes" (tables)**

In [None]:
# Set-up
import pandas as pd
import numpy as np
# Set the pandas 'display.min_rows' option to 20 (per the pandas docs this is
# the number of rows shown when a long dataframe/series display is truncated)
pd.set_option('display.min_rows', 20)

In [None]:
# Upload all data files - must be on local computer
# Cities.csv, Countries.csv, Players.csv, Teams.csv, Titanic.csv
# If running notebook on local computer:
#   This cell does nothing (google.colab is not available there)
#   Make sure data files are in same workspace as notebook
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: skip the upload instead of raising, so that
    # "Restart & Run All" works locally; later cells read the CSV files
    # from the notebook's working directory.
    uploaded = None
else:
    uploaded = files.upload()

Saving Cities.csv to Cities.csv
Saving Countries.csv to Countries.csv
Saving Players.csv to Players.csv
Saving Teams.csv to Teams.csv
Saving Titanic.csv to Titanic.csv


### Reading from CSV file into dataframe

In [None]:
# Read Cities.csv into the 'cities' dataframe.
# The 'with' block closes the file as soon as pandas has finished reading it
# (the original left the handle open for the life of the kernel).
with open('Cities.csv') as f:
    cities = pd.read_csv(f)

In [None]:
# A bare dataframe name as the last line of a cell displays the dataframe
cities

Unnamed: 0,city,country,latitude,longitude,temperature
0,Aalborg,Denmark,57.03,9.92,7.52
1,Aberdeen,United Kingdom,57.17,-2.08,8.10
2,Abisko,Sweden,63.35,18.83,0.20
3,Adana,Turkey,36.99,35.32,18.67
4,Albacete,Spain,39.00,-1.87,12.62
5,Algeciras,Spain,36.13,-5.47,17.38
6,Amiens,France,49.90,2.30,10.17
7,Amsterdam,Netherlands,52.35,4.92,8.93
8,Ancona,Italy,43.60,13.50,13.52
9,Andorra,Andorra,42.50,1.52,9.60


In [None]:
# Number of rows = length of the dataframe's index
len(cities.index)

213

In [None]:
# First few rows (5 is the default for head)
cities.head(n=5)

Unnamed: 0,city,country,latitude,longitude,temperature
0,Aalborg,Denmark,57.03,9.92,7.52
1,Aberdeen,United Kingdom,57.17,-2.08,8.1
2,Abisko,Sweden,63.35,18.83,0.2
3,Adana,Turkey,36.99,35.32,18.67
4,Albacete,Spain,39.0,-1.87,12.62


In [None]:
# Last 15 rows
cities.tail(n=15)

Unnamed: 0,city,country,latitude,longitude,temperature
198,Trikala,Greece,39.56,21.77,16.0
199,Trondheim,Norway,63.42,10.42,4.53
200,Turku,Finland,60.45,22.25,4.72
201,Uppsala,Sweden,59.86,17.64,4.17
202,Valencia,Spain,39.49,-0.4,16.02
203,Vienna,Austria,48.2,16.37,7.86
204,Vigo,Spain,42.22,-8.73,12.85
205,Vilnius,Lithuania,54.68,25.32,5.38
206,Warsaw,Poland,52.25,21.0,7.2
207,Wroclaw,Poland,51.11,17.03,7.17


### Sorting, selecting rows and columns

In [None]:
# Sorting by country (A-Z), and within each country by temperature
# from warmest to coldest
cities.sort_values(
    by=['country', 'temperature'],
    ascending=[True, False],
)

Unnamed: 0,city,country,latitude,longitude,temperature
78,Elbasan,Albania,41.12,20.08,15.18
9,Andorra,Andorra,42.50,1.52,9.60
203,Vienna,Austria,48.20,16.37,7.86
95,Graz,Austria,47.08,15.41,6.91
125,Linz,Austria,48.32,14.29,6.79
175,Salzburg,Austria,47.81,13.04,4.62
105,Innsbruck,Austria,47.28,11.41,4.54
47,Brest,Belarus,52.10,23.70,6.73
161,Pinsk,Belarus,52.13,26.09,6.42
138,Mazyr,Belarus,52.05,29.27,6.25


In [None]:
# Selecting a single column - returns a 'series'
cities['city']
# Also show cities.city, cities['temperature'], cities.temperature

0          Aalborg
1         Aberdeen
2           Abisko
3            Adana
4         Albacete
5        Algeciras
6           Amiens
7        Amsterdam
8           Ancona
9          Andorra
          ...     
203         Vienna
204           Vigo
205        Vilnius
206         Warsaw
207        Wroclaw
208    Yevpatoriya
209       Zaragoza
210       Zhytomyr
211      Zonguldak
212         Zurich
Name: city, Length: 213, dtype: object

In [None]:
# Selecting multiple columns - returns a dataframe
# (.loc with ':' keeps every row and only the listed columns)
cities.loc[:, ['city', 'temperature']]
# Also show cities[['city']]

Unnamed: 0,city,temperature
0,Aalborg,7.52
1,Aberdeen,8.10
2,Abisko,0.20
3,Adana,18.67
4,Albacete,12.62
5,Algeciras,17.38
6,Amiens,10.17
7,Amsterdam,8.93
8,Ancona,13.52
9,Andorra,9.60


In [None]:
# Selecting rows based on condition
# Note: no need to do type conversion - pandas infers types for columns
west_of_greenwich = cities['longitude'] < 0
cities.loc[west_of_greenwich]

Unnamed: 0,city,country,latitude,longitude,temperature
1,Aberdeen,United Kingdom,57.17,-2.08,8.1
4,Albacete,Spain,39.0,-1.87,12.62
5,Algeciras,Spain,36.13,-5.47,17.38
10,Angers,France,47.48,-0.53,10.98
17,Badajoz,Spain,38.88,-6.97,15.61
24,Belfast,United Kingdom,54.6,-5.96,8.48
32,Bilbao,Spain,43.25,-2.93,11.41
33,Birmingham,United Kingdom,52.47,-1.92,8.81
34,Blackpool,United Kingdom,53.83,-3.05,9.15
38,Bordeaux,France,44.85,-0.6,11.87


In [None]:
# Selecting rows by number (positions 15 up to, but not including, 20)
cities.iloc[15:20]
# Show cities[:8] and cities[200:]

Unnamed: 0,city,country,latitude,longitude,temperature
15,Augsburg,Germany,48.35,10.9,4.54
16,Bacau,Romania,46.58,26.92,7.51
17,Badajoz,Spain,38.88,-6.97,15.61
18,Baia Mare,Romania,47.66,23.58,8.87
19,Balti,Moldova,47.76,27.91,8.23


In [None]:
# Putting it together: selecting rows, selecting columns, sorting
# City and longitude of all cities with latitude > 50 and
# temperature > 9, sorted by longitude
is_northern = cities['latitude'] > 50
is_warm = cities['temperature'] > 9
temp1 = cities[is_northern & is_warm]
temp2 = temp1.loc[:, ['city', 'longitude']]
temp3 = temp2.sort_values(by='longitude')
temp3
# Show eliminating temp3, then temp2, then temp1 (use \ for long lines)
# Note similar functionality to SQL

Unnamed: 0,city,longitude
88,Galway,-9.05
67,Cork,-8.5
188,Swansea,-3.95
84,Exeter,-3.53
34,Blackpool,-3.05
40,Bournemouth,-1.9
58,Cambridge,0.12
123,Lille,3.08
49,Brugge,3.23


### <font color="green">**Your Turn**</font>

In [None]:
# Read the Countries.csv file into a dataframe
# The 'with' block closes the file once pandas has read it (the original
# never closed the handle).
with open('Countries.csv') as f:
    countries = pd.read_csv(f)
countries.head()

Unnamed: 0,country,population,EU,coastline
0,Albania,2.9,no,yes
1,Andorra,0.07,no,no
2,Austria,8.57,yes,no
3,Belarus,9.48,no,no
4,Belgium,11.37,yes,yes


In [None]:
# Find all countries that are not in the EU and don't
# have coastline, together with their populations,
# sorted by population (smallest to largest)
non_eu_landlocked = (countries['EU'] == 'no') & (countries['coastline'] == 'no')
(
    countries.loc[non_eu_landlocked, ['country', 'population']]
    .sort_values(by='population')
)

Unnamed: 0,country,population
22,Liechtenstein,0.04
1,Andorra,0.07
20,Kosovo,1.91
25,Macedonia,2.08
26,Moldova,4.06
38,Switzerland,8.38
33,Serbia,8.81
3,Belarus,9.48


### Aggregation

In [None]:
# Minimum and maximum temperature
# Use the Series methods rather than Python's builtin min()/max(): they are
# vectorized and skip missing values (NaN) by default, whereas the builtins
# loop over the values one by one and give order-dependent results with NaN.
print('Minimum temperature:', cities.temperature.min())
print('Maximum temperature:', cities.temperature.max())

Minimum temperature: -2.2
Maximum temperature: 18.67


In [None]:
# Average temperature, computed two ways
numpy_average = np.average(cities.temperature)
pandas_mean = cities.temperature.mean()
print('Using numpy average:', numpy_average)
print('Using built-in mean:', pandas_mean)

Using numpy average: 9.497840375586854
Using built-in mean: 9.497840375586854


In [None]:
# Average temperature of cities in each country
# Select the temperature column BEFORE calling mean(): on pandas >= 2.0,
# groupby(...).mean() over the whole dataframe raises a TypeError because
# it also tries to average string columns such as 'city'.
cities.groupby('country').temperature.mean()
cities.groupby('country')[['temperature']].mean()
# .temperature gives a series, [['temperature']] gives a dataframe
# Also show without column selection (use mean(numeric_only=True))
# Change mean() to min()

Unnamed: 0_level_0,temperature
country,Unnamed: 1_level_1
Albania,15.18
Andorra,9.6
Austria,6.144
Belarus,5.946667
Belgium,9.65
Bosnia and Herzegovina,9.6
Bulgaria,10.44
Croatia,10.865
Czech Republic,7.856667
Denmark,7.625


### <font color="green">**Your Turn**</font>

In [None]:
# Find the average population of countries with coastline
# and countries without coastline
# Hint: You can use groupby!
# Select 'population' before mean(): averaging the whole dataframe raises a
# TypeError on pandas >= 2.0 because of the string columns (country, EU).
countries.groupby('coastline').population.mean()

coastline
no      5.367692
yes    20.947931
Name: population, dtype: float64

In [None]:
# Then modify to group by both coastline and EU
# Select 'population' before mean(): averaging the whole dataframe raises a
# TypeError on pandas >= 2.0 because of the string column 'country'.
countries.groupby(['coastline', 'EU']).population.mean()

coastline  EU 
no         no      4.353750
           yes     6.990000
yes        no     19.595714
           yes    21.378182
Name: population, dtype: float64

### Joining

In [None]:
# Join each city with its country's row, matching on the 'country' column
pd.merge(cities, countries, on='country')

Unnamed: 0,city,country,latitude,longitude,temperature,population,EU,coastline
0,Aalborg,Denmark,57.03,9.92,7.52,5.69,yes,yes
1,Odense,Denmark,55.40,10.38,7.73,5.69,yes,yes
2,Aberdeen,United Kingdom,57.17,-2.08,8.10,65.11,yes,yes
3,Belfast,United Kingdom,54.60,-5.96,8.48,65.11,yes,yes
4,Birmingham,United Kingdom,52.47,-1.92,8.81,65.11,yes,yes
5,Blackpool,United Kingdom,53.83,-3.05,9.15,65.11,yes,yes
6,Bournemouth,United Kingdom,50.73,-1.90,9.97,65.11,yes,yes
7,Bradford,United Kingdom,53.80,-1.75,8.39,65.11,yes,yes
8,Cambridge,United Kingdom,52.20,0.12,9.25,65.11,yes,yes
9,Dundee,United Kingdom,56.47,-3.00,6.40,65.11,yes,yes


In [None]:
# Joining is symmetric: same matches as before, with the countries
# columns listed first this time
pd.merge(countries, cities, on='country')

Unnamed: 0,country,population,EU,coastline,city,latitude,longitude,temperature
0,Albania,2.90,no,yes,Elbasan,41.12,20.08,15.18
1,Andorra,0.07,no,no,Andorra,42.50,1.52,9.60
2,Austria,8.57,yes,no,Graz,47.08,15.41,6.91
3,Austria,8.57,yes,no,Innsbruck,47.28,11.41,4.54
4,Austria,8.57,yes,no,Linz,48.32,14.29,6.79
5,Austria,8.57,yes,no,Salzburg,47.81,13.04,4.62
6,Austria,8.57,yes,no,Vienna,48.20,16.37,7.86
7,Belarus,9.48,no,no,Brest,52.10,23.70,6.73
8,Belarus,9.48,no,no,Hrodna,53.68,23.83,6.07
9,Belarus,9.48,no,no,Mazyr,52.05,29.27,6.25


### Miscellaneous features

In [None]:
# String operations - countries with 'ia' in their name
name_has_ia = countries['country'].str.contains('ia')
countries[name_has_ia]

Unnamed: 0,country,population,EU,coastline
0,Albania,2.9,no,yes
2,Austria,8.57,yes,no
5,Bosnia and Herzegovina,3.8,no,yes
6,Bulgaria,7.1,yes,yes
7,Croatia,4.23,yes,yes
11,Estonia,1.31,yes,yes
21,Latvia,1.96,yes,yes
23,Lithuania,2.85,yes,yes
25,Macedonia,2.08,no,no
32,Romania,19.37,yes,yes


In [None]:
# Add fahrenheit column, computed from the Celsius 'temperature' column
cities['fahrenheit'] = cities['temperature'] * 9 / 5 + 32
cities

Unnamed: 0,city,country,latitude,longitude,temperature,fahrenheit
0,Aalborg,Denmark,57.03,9.92,7.52,45.536
1,Aberdeen,United Kingdom,57.17,-2.08,8.10,46.580
2,Abisko,Sweden,63.35,18.83,0.20,32.360
3,Adana,Turkey,36.99,35.32,18.67,65.606
4,Albacete,Spain,39.00,-1.87,12.62,54.716
5,Algeciras,Spain,36.13,-5.47,17.38,63.284
6,Amiens,France,49.90,2.30,10.17,50.306
7,Amsterdam,Netherlands,52.35,4.92,8.93,48.074
8,Ancona,Italy,43.60,13.50,13.52,56.336
9,Andorra,Andorra,42.50,1.52,9.60,49.280


In [None]:
# Sometimes temporary dataframe is needed
# Cities with latitude > 50 not in the EU (error then fix)
# The EU flag lives in 'countries', so join first and filter the result
citiesext = pd.merge(cities, countries, on='country')
northern_non_eu = (citiesext['latitude'] > 50) & (citiesext['EU'] == 'no')
citiesext[northern_non_eu]

Unnamed: 0,city,country,latitude,longitude,temperature,fahrenheit,population,EU,coastline
135,Bergen,Norway,60.39,5.32,1.75,35.15,5.27,no,yes
136,Bodo,Norway,67.25,14.4,4.5,40.1,5.27,no,yes
137,Oslo,Norway,59.92,10.75,2.32,36.176,5.27,no,yes
138,Stavanger,Norway,58.97,5.68,5.53,41.954,5.27,no,yes
139,Trondheim,Norway,63.42,10.42,4.53,40.154,5.27,no,yes
151,Chernihiv,Ukraine,51.5,31.3,5.92,42.656,44.62,no,yes
155,Kiev,Ukraine,50.43,30.52,6.88,44.384,44.62,no,yes
160,Rivne,Ukraine,50.62,26.25,6.76,44.168,44.62,no,yes
161,Sumy,Ukraine,50.92,34.78,6.28,43.304,44.62,no,yes
163,Zhytomyr,Ukraine,50.25,28.66,6.67,44.006,44.62,no,yes


In [None]:
# Notebook only displays result of last line
# Before last line need to use print
cities.loc[cities['longitude'] > 35]
cities.loc[cities['longitude'] < -5]

Unnamed: 0,city,country,latitude,longitude,temperature,fahrenheit
5,Algeciras,Spain,36.13,-5.47,17.38,63.284
17,Badajoz,Spain,38.88,-6.97,15.61,60.098
24,Belfast,United Kingdom,54.6,-5.96,8.48,47.264
42,Braga,Portugal,41.55,-8.42,13.42,56.156
67,Cork,Ireland,51.9,-8.5,9.41,48.938
74,Dublin,Ireland,53.33,-6.25,8.49,47.282
88,Galway,Ireland,53.27,-9.05,10.0,50.0
103,Huelva,Spain,37.25,-6.93,17.09,62.762
126,Lisbon,Portugal,38.72,-9.14,15.52,59.936
155,Oviedo,Spain,43.35,-5.83,10.85,51.53


### <font color="green">**Your Turn**</font>

In [None]:
# Determine the average temperature for EU cities and the average
# temperature for non-EU cities, before and after "Brexit".
# That is, for one pair of averages use the current EU and non-EU countries,
# and for the other pair pretend the United Kingdom is not in the EU.
# Print the four numbers and make sure to label which is which!
# Hint: You can solve this one in just five lines by creating a
# joined dataframe, then averaging the temperatures for four different
# sets of conditions on the dataframe.
# Note: For the 'or' of two conditions in pandas, use '|' instead of '&'
countriesext = countries.merge(cities, on='country')

# Row masks over the joined dataframe, named once and reused below
in_eu = countriesext.EU == 'yes'
not_in_eu = countriesext.EU == 'no'
is_uk = countriesext.country == 'United Kingdom'
not_uk = countriesext.country != 'United Kingdom'

print('Before Brexit - EU:', countriesext[in_eu].temperature.mean())
print('Before Brexit - non EU:', countriesext[not_in_eu].temperature.mean())
print('After Brexit - EU:', countriesext[in_eu & not_uk].temperature.mean())
print('After Brexit - non EU:', countriesext[not_in_eu | is_uk].temperature.mean())

Before Brexit - EU: 9.694133333333333
Before Brexit - non EU: 9.03047619047619
After Brexit - EU: 9.793211678832117
After Brexit - non EU: 8.965394736842107


### <font color="green">**Your Turn: World Cup Data**</font>

In [None]:
# Read the Players and Teams data into dataframes
# Each file is opened in its own 'with' block so it is closed right after
# pandas reads it (the original rebound 'f' and never closed either file).
with open('Players.csv') as f:
    players = pd.read_csv(f)
with open('Teams.csv') as f:
    teams = pd.read_csv(f)

In [None]:
# What player on a team with “ia” in the team name played less than
# 200 minutes and made more than 100 passes? Print the player surname.
on_ia_team = players['team'].str.contains('ia')
under_200_minutes = players['minutes'] < 200
over_100_passes = players['passes'] > 100
players.loc[on_ia_team & under_200_minutes & over_100_passes, 'surname']

431    Kuzmanovic
Name: surname, dtype: object

In [None]:
# What is the average number of passes made by forwards? By midfielders?
# Make sure the answer specifies which is which, and don't include other
# positions in your result.
wanted_positions = ['forward', 'midfielder']
(
    players[players['position'].isin(wanted_positions)]
    .groupby('position')['passes']
    .mean()
)

position
forward       50.825175
midfielder    95.271930
Name: passes, dtype: float64

In [None]:
# Which team has the highest ratio of goalsFor to goalsAgainst?
# Print the team name only.
# Hint: Add a "ratio" column to the teams dataframe, then sort,
# then use head(1) or tail(1) depending how you sorted
teams['ratio'] = teams['goalsFor'] / teams['goalsAgainst']
# Sorted from highest ratio to lowest, so the first row is the answer
teams.sort_values(by='ratio', ascending=False)['team'].head(1)

2    Portugal
Name: team, dtype: object

In [None]:
# How many players who play on a team with ranking <10 played
# more than 350 minutes?
# Reminder: len() gives number of rows in a dataframe
# 'ranking' is read from the joined teams data, so join players with
# teams first (fixed: 'palyers' was a typo that raised a NameError)
playersext = players.merge(teams, on='team')
amount = len(playersext[(playersext.ranking < 10) & (playersext.minutes > 350)])
print(f'Players amount: {amount}')

### <font color="green">**Your Turn Extra: Titanic Data**</font>

In [None]:
# Read the Titanic data into a dataframe
# The 'with' block closes the file once pandas has read it (the original
# never closed the handle).
with open('Titanic.csv') as f:
    titanic = pd.read_csv(f)

In [None]:
# How many married women over age 50 embarked in Cherbourg?
# Note: 'first' is a method in Pandas, so 'titanic.first' will not select the column;
# use titanic['first'] instead
YOUR CODE HERE

In [None]:
# How many passengers are missing their age, and what is the
# average fare paid by these passengers?
# NOTE: Missing values in Pandas are null, printed as 'NaN', and
# function pd.isnull(v) checks whether a value is null.
YOUR CODE HERE

In [None]:
# What is the average fare paid by passengers in the three classes,
# and the average age of passengers in the three classes?
YOUR CODE HERE

In [None]:
# What was the highest fare paid by a male survivor, female survivor,
# male non-survivor, and female non-survivor? (four numbers)
YOUR CODE HERE

In [None]:
# What is the survival rate of passengers in the three classes, i.e., what fraction
# of passengers in each class survived? What is the survival rate of females
# versus males? Of children (under 18) versus adults (age 18 or over)?
# NOTES: Missing ages automatically fail comparisons like in SQL, and 'class'
# is a reserved word in Python, so 'titanic.class' will generate a syntax error;
# use titanic['class'] instead
YOUR CODE HERE