Welcome to Pandas for Data Science
Today's agenda:

    Data Manipulation
    Concatenating Dataframes
    Dealing with Null Values
    Aggregate Statistics (Groupby)
    Regular Expressions and Dataframes
    Advanced Pandas Functionality
    Intro to Duckdb

DATA MANIPULATION

In [None]:
import pandas as pd
import numpy as np

print ('pandas version', pd.__version__)

# Tokyo Olympics
tokyo_data = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/tokyo_olympics.csv')

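# index_col takes a column label or integer position (or a list of them). None (the default) lets pandas infer the index;
# False forces pandas not to use the first column as the index. True is not accepted.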
#tokyo_data = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/tokyo_olympics.csv', index_col=0)
#tokyo_data = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/tokyo_olympics.csv', index_col=False)
#tokyo_data = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/tokyo_olympics.csv', index_col=None)

tokyo_data.head()

tokyo_data['firstname'] = tokyo_data['person_name'].str.split(' ').str[0]
tokyo_data

# Convert string object to title case. Firstname is the second element of the new list (index 1)
tokyo_data['firstname'] = tokyo_data['person_name'].str.split(' ').str[1].str.title()
tokyo_data

# Convert string object to title case. Firstname is the last element of the new list (index -1)
# It is better to use the name at the end of the list. Some athletes have two last names.
tokyo_data['firstname'] = tokyo_data['person_name'].str.split(' ').str[-1].str.title()
tokyo_data
tokyo_data.query('firstname == "Tamara"')

tokyo_data.head()
tokyo_data.info()


# You can create a new column by performing some mathematical operation on an existing column. For example:
tokyo_data['weight_pounds'] = tokyo_data['weight_kg'] * 2.20462
tokyo_data['weight_pounds'] = round(tokyo_data['weight_pounds'],2)
tokyo_data.head()

# Illustration of the axis:
tokyo_data[['weight_kg','weight_pounds']]
tokyo_data[['weight_kg','weight_pounds']].mean(axis=1)  # Axis 1 will act on all the COLUMNS in each ROW # Axis 0 will act on all the ROWS in each COLUMN
tokyo_data[['weight_kg','weight_pounds']].mean(axis=0)  # Axis 0 will act on all the ROWS in each COLUMN # Axis 1 will act on all the COLUMNS in each ROW
tokyo_data[['weight_kg','weight_pounds']]

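# Rename a column by passing a dictionary as an argument to the .rename method.
# Note: .rename returns a new DataFrame; tokyo_data itself is unchanged unless the result is assigned.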
tokyo_data.rename(columns={'weight_pounds':'weight_lb'})
tokyo_data.head()

# You can create a new data frame like this:
tokyo_data2 = tokyo_data.rename(columns={'weight_pounds':'weight_lb'})
tokyo_data2.head()

# Add the birth year for each athlete
tokyo_data.info()

tokyo_data['born_date'] = pd.to_datetime(tokyo_data['born_date'])
tokyo_data.head()
tokyo_data.info()

# You can specify the format for the date, especially since the U.S. uses month-day-year as opposed to other formats
tokyo_data['born_date'] = pd.to_datetime(tokyo_data['born_date'], format="%Y-%m-%d")

# .dt is an accessor for datetime operations on a column, similar to .str for string operations
tokyo_data['birth_year'] = tokyo_data['born_date'].dt.year
tokyo_data[['firstname', 'birth_year']]

# Using Lambda functions with data frames
tokyo_data['height_category'] = tokyo_data['height_cm'].apply(lambda x: 'Short' if x < 140 else ('Average-height' if x < 160 else 'Tall'))
tokyo_data.head()


def height_cat(athlete):
    """Return a size class label for a single athlete row.

    ``athlete`` is indexed by ``'height_cm'`` and ``'weight_kg'``. The classes
    are tried in order and the first one whose height limit AND weight limit
    are both strictly greater than the athlete's values wins. Anything that
    matches no class (including missing/NaN values, which compare as False)
    is labelled ``'Heavyweight'``.
    """
    # (height limit in cm, weight limit in kg, label), checked top to bottom.
    size_classes = (
        (172, 68, 'Lightweight'),
        (192, 81, 'Middleweight'),
    )
    for max_height, max_weight, label in size_classes:
        if athlete['height_cm'] < max_height and athlete['weight_kg'] < max_weight:
            return label
    return 'Heavyweight'

# Act on all the columns in each row
tokyo_data['height_category'] = tokyo_data.apply(height_cat, axis=1)
tokyo_data.head(29)


# FILTERING DATA
mask = tokyo_data['height_category'] != 'Heavyweight'
mask
tokyo_data[tokyo_data['height_category'] != 'Heavyweight']
tokyo_data[mask]


Unnamed: 0,height_category
0,False
1,False
2,True
3,True
4,True
5,False
6,False
7,True
8,True
9,False


CONCATENATING DATAFRAMES

In [None]:
import pandas as pd
import numpy as np

print ('pandas version', pd.__version__)

# Tokyo Olympics
tokyo_data = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/tokyo_olympics.csv')

# CONCATENATE DATA FRAMES
# We're interested in the athletes from specific countries;
europe = tokyo_data[tokyo_data['country'].isin(['Spain', 'Netherlands', 'Norway', 'Italy', 'France'])]
europe

european_subset = tokyo_data[(tokyo_data['country'] == 'Spain') | (tokyo_data['country'] == 'Norway') | (tokyo_data['country'] == 'Italy') | (tokyo_data['country'] == 'France')]
european_subset

africa = tokyo_data[tokyo_data['country'].isin(['Sudan', 'Egypt'])]
# Create a subset for America
# america
europe
africa

# We can also use this logical statement in order to filter by rows:
african_subset = tokyo_data[(tokyo_data['country'] == 'Sudan') | (tokyo_data['country'] == 'Egypt')]
african_subset
# Use a logical operator to create the European subset
# european_subset = tokyo_data[(tokyo_data['country'] == 'A') | (tokyo_data['country'] == 'B')]

# .isin returns a boolean mask. Invert it with ~ to keep the rows that are NOT in the list
# (this is the idiomatic form of comparing the mask with == False).
not_african_subset = tokyo_data[~tokyo_data['country'].isin(['Sudan', 'Egypt'])]
not_african_subset
# The mask is already boolean, so it can be used directly to keep the matching rows
# (comparing it with == True is redundant).
african_subset2 = tokyo_data[tokyo_data['country'].isin(['Sudan', 'Egypt'])]
african_subset2

concat_df = pd.concat([africa,europe])
concat_df


Unnamed: 0,person_name,country,discipline,born_date,height_cm,weight_kg
7,ABASS Abobakr,Sudan,Swimming,1886-12-12,155.33,75.0
14,ABDALLA Maryam,Egypt,Artistic Swimming,1882-12-01,130.17,76.3
15,ABDALLAH Shahd,Egypt,Artistic Swimming,1894-09-11,155.33,63.8
16,ABDALRASOOL Mohamed,Sudan,Judo,1899-02-12,135.67,75.2
17,ABDEL LATIF Radwa,Egypt,Shooting,1896-03-12,195.62,88.3
18,ABDEL RAZEK Samy,Egypt,Shooting,1881-01-12,130.17,99.2
19,ABDELAZIZ Abdalla,Egypt,Karate,1880-12-12,155.33,100.5
0,AALERUD Katrine,Norway,Cycling Road,1886-07-11,135.67,88.0
1,ABAD Nestor,Spain,Artistic Gymnastics,1882-12-01,195.62,75.0
2,ABAGNALE Giovanni,Italy,Rowing,1894-09-11,130.17,76.3


DEALING WITH NULL VALUES

In [None]:
import pandas as pd
import numpy as np

print ('pandas version', pd.__version__)

# Tokyo Olympics
tokyo_data = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/tokyo_olympics.csv')

# DEALING WITH NULL VALUES
# Assign null values to specific fields
tokyo_data.head()
tokyo_data.loc[[2,3], 'height_cm'] = np.nan
tokyo_data.head()
tokyo_data.info()
# View the number of Na values
tokyo_data.isna().sum()

# You can use .fillna method to fill out the NaN with a different value
tokyo_data.fillna(tokyo_data['height_cm'].mean())

tokyo_data.loc[[2,3], 'height_cm'] = np.nan
tokyo_data['height_cm'] = tokyo_data['height_cm'].interpolate()
tokyo_data.head()

tokyo_data.loc[[2,3], 'height_cm'] = np.nan
tokyo_data.head()
# We can drop the entire row if there is a NaN
tokyo_data.dropna()

# We can drop a row if a subset has NaN
tokyo_data.dropna()
tokyo_data.loc[[2,3], 'height_cm'] = np.nan
tokyo_data.loc[1, 'weight_kg'] = np.nan
tokyo_data.head()
# .dropna does not overwrite the existing data frame
tokyo_data.dropna(subset=['height_cm'])
tokyo_data.head()
# Assign the resulting data frame to a new variable
tokyo_nonan = tokyo_data.dropna(subset=['height_cm'])
tokyo_nonan.head()
# You can also drop the rows in place. With inplace=True, dropna modifies the
# DataFrame itself and returns None, so the result must NOT be assigned back
# (tokyo_data = tokyo_data.dropna(..., inplace=True) would set tokyo_data to None).
# A copy is used here so that rows 2 and 3 remain available in tokyo_data for the steps below.
tokyo_inplace = tokyo_data.copy()
tokyo_inplace.dropna(subset=['height_cm'], inplace=True)
tokyo_inplace.head()

# We can get those rows that have NaN in them:

tokyo_data.loc[[2,3], 'height_cm'] = np.nan
nandf = tokyo_data[tokyo_data['height_cm'].isna()]
nandf
tokyo_data.head()

# We can get those rows that don't have NaN in them:
tokyo_data.loc[[2,3], 'height_cm'] = np.nan
tokyo_data.head()
tokyo_data[tokyo_data['height_cm'].notna()]



Unnamed: 0,person_name,country,discipline,born_date,height_cm,weight_kg
2,ABAGNALE Giovanni,Italy,Rowing,1894-09-11,,76.3
3,ABALDE Alberto,Spain,Basketball,1899-02-12,,63.8


AGGREGATING DATAFRAMES

In [160]:
import pandas as pd
import numpy as np

print ('pandas version', pd.__version__)

# Tokyo Olympics
tokyo_data = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/tokyo_olympics.csv')

# AGGREGATING DATA
tokyo_data['country'].value_counts()

tokyo_data['discipline'].value_counts()

# Get stats generally for athletes from Egypt
tokyo_data[tokyo_data['country'] == 'Egypt'].value_counts()
# Use Logical Operators to combine the conditions:
tokyo_data[(tokyo_data['country'] == 'Egypt') & (tokyo_data['discipline'] == 'Artistic Swimming')].value_counts()
tokyo_data[tokyo_data['country'] == 'Egypt'].value_counts().head(2)
tokyo_data[tokyo_data['country'] == 'Egypt'].value_counts().tail(1)

# Get stats on the discipline for Egyptian athletes

tokyo_data[tokyo_data['country'] == 'Egypt']['discipline'].value_counts()

tokyo_data[tokyo_data['country'] == 'Egypt']['height_cm'].value_counts()

# A useful Pandas function is .groupby
# We can find the total weight_kg for Artistic Swimmers, Karate, Shooting, Basketball, Rowing, Handball
tokyo_data.groupby(['discipline'])['weight_kg'].sum()

# Find the average weight of athletes (>1 athletes: Artistic Swimmers, Karate, Shooting, Basketball, Rowing, Handball)
tokyo_data.groupby(['discipline'])['weight_kg'].mean()

tokyo_data.head()
# agg() lets you apply a function, a list of function names, or a {column: function} mapping
# to each group (disciplines with >1 athlete: Artistic Swimmers, Karate, Shooting, Basketball, Rowing, Handball)
tokyo_data.groupby(['discipline']).agg({'weight_kg':'sum', 'height_cm':'mean'})

# Group by multiple things or columns
tokyo_data.groupby(['discipline', 'country']).agg({'weight_kg':'sum', 'height_cm':'mean'})

# Convert born_date to datetime and count athletes in each birth year
tokyo_data['born_date'] = pd.to_datetime(tokyo_data['born_date'])
tokyo_data.info()
tokyo_data.groupby(tokyo_data['born_date'].dt.year).count()
tokyo_data.groupby(tokyo_data['born_date'].dt.year)['discipline'].count().reset_index()
# Sort values by birth year in ascending order: the earliest birth years (oldest athletes) at the top and the youngest at the bottom
tokyo_data.groupby(tokyo_data['born_date'].dt.year)['discipline'].count().reset_index().sort_values('born_date')

# Sort values by born_date: Youngest athlete will be placed at the top of the dataframe
tokyo_data.groupby(tokyo_data['born_date'].dt.year)['discipline'].count().reset_index().sort_values('born_date', ascending=False)

# Sort values by discipline
tokyo_data.groupby(tokyo_data['born_date'].dt.year)['discipline'].count().reset_index().sort_values('discipline', ascending=False)

# Can we sort by athlete's country? KeyError
#tokyo_data.groupby(tokyo_data['born_date'].dt.year)['discipline'].count().reset_index().sort_values('country', ascending=False)

# Without reset_index(), born_date is the row name.
tokyo_data.groupby(tokyo_data['born_date'].dt.year)['discipline'].count()

# With reset_index(), an integer row name is used.
tokyo_data.groupby(tokyo_data['born_date'].dt.year)['country'].count().reset_index().sort_values('country', ascending=False)

tokyo_data['yearborn'] = tokyo_data['born_date'].dt.year
tokyo_data['monthborn'] = tokyo_data['born_date'].dt.month
tokyo_data['monthborn']
tokyo_data['yearborn']
# Sort by the month that the athlete was born
tokyo_data.groupby([tokyo_data['yearborn'], tokyo_data['monthborn']])['discipline'].count().reset_index().sort_values('monthborn', ascending=True)


Unnamed: 0,yearborn,monthborn,discipline
1,1881,1,3
7,1899,2,3
6,1896,3,3
3,1886,7,1
5,1894,9,3
0,1880,12,3
2,1882,12,3
4,1886,12,1


REGULAR EXPRESSIONS AND CONDITIONAL CHANGES

In [176]:
import pandas as pd

print ('pandas version', pd.__version__)

# Use this data, GDP for countries in Europe
data = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/gapminder_gdp_europe.csv')

# Colab
store = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/cal_housing_small.csv')

# Ctrl + Shift + Enter = Specific Line Run
# Ctrl + Enter = All Lines Run

# REGULAR EXPRESSIONS
# Return rows where the country contains the word 'ria' or 'land' # using regular expressions
data[data["country"].str.contains("ria|land")]

# Return rows where the country name starts with a vowel # using regular expressions
data[data["country"].str.contains(r"^[AEIOUaeiou]", na=False)]

# Return rows where the country name ends with a vowel # using regular expressions
data[data["country"].str.contains(r"[AEIOUaeiou]$", na=False)]

# Return rows where the country name contains exactly two vowels # using regular expressions
data[data["country"].str.contains(r"^[^AEIOUaeiou]*[AEIOUaeiou][^AEIOUaeiou]*[AEIOUaeiou][^AEIOUaeiou]*$", na=False)]

# Return rows where the country name contains repeated letters # using regular expressions
data[data["country"].str.contains(r"(.)\1", na=False)]

# Return rows where the country name begins with the letters Sp # using regular expressions
# (no capture group is needed here; pandas warns when a str.contains pattern has match groups)
data[data["country"].str.contains(r"^Sp", na=False)]

data[data["country"].str.contains(r"^Sp", na=False)]
data[data["country"].str.contains(r"^Ne", na=False)]

# Return rows where the country name contains ny at the end # using regular expressions
data[data["country"].str.contains(r"ny$", na=False)]

# Return rows where the country name contains ria or any at the end # using regular expressions
data[data["country"].str.contains(r"any$|ria$", case=False, na=False)]

# Return rows where the country name is exactly 8 characters long (. matches any character, including spaces) # using regular expressions
data[data["country"].str.contains(r"^.{8}$", na=False)]

# How do you return rows where the country name is exactly 10 characters long? # using regular expressions
data[data["country"].str.contains(r"^.{10}$", na=False)]

# Return rows where the country name contains 3 or more vowels # using regular expressions
data[data["country"].str.contains(r"([AEIOUaeiou].*){3,}", na=False)]

# Return rows where the country name contains 4 or more vowels # using regular expressions
data[data["country"].str.contains(r"([AEIOUaeiou].*){4,}", na=False)]

# Return rows where the country name contains spaces # using regular expressions
data[data["country"].str.contains(r"[ ]", na=False)]

# Return rows where the country name starts and ends with the same letter # using regular expressions
data[data["country"].str.contains(r"^(.).*\1$", na=False)]

# To turn off regular expressions, pass regex=False: the pattern is then matched as literal text, so this returns no rows unless a country name literally contains that string
data[data["country"].str.contains(r"([AEIOUaeiou].*){3,}", na=False, regex=False)]

# To turn on regular expressions. Country name contains 3 or more vowels # using regular expressions
# Keep the filtered rows, write them to a CSV file, and read that file back in.
# Note: to_csv also writes the index by default, so the frame read back has an extra 'Unnamed: 0' column.
threemore = data[data["country"].str.contains(r"([AEIOUaeiou].*){3,}", na=False, regex=True)]
threemore
threemore.to_csv('threemoreresult.csv')
newdata = pd.read_csv('threemoreresult.csv')
newdata

# Countries that start and end with the same letter (the match is case-sensitive).
# This statement must start at column 0: an indented top-level line raises IndentationError.
data[data["country"].str.contains(r"^(.).*\1$", na=False)]


Unnamed: 0,country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007


ADVANCED PANDAS FUNCTIONALITY

In [195]:
import pandas as pd
import numpy as np

print ('pandas version', pd.__version__)

# Tokyo Olympics
tokyo_data = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/tokyo_olympics.csv')

#  .rank() .cumsum()

tokyo_data['height_rank'] = tokyo_data['height_cm'].rank()
tokyo_data
tokyo_data['height_rank'].sort_values(ascending=False)

tokyo_data.head()

#for index, row in tokyo_data.iterrows():
#  print(index)
#  print("Athlete: ", row['person_name'])
#  print(row['country'])
#  print("\n\n")

# Sort by height rank, and take 10 random samples (note: .sample() picks rows at random, so the sorted order is not preserved)
tokyo_data.sort_values(['height_rank'], ascending=False).sample(10)[['person_name', 'discipline','height_rank']]
tokyo_data.sort_values(['height_rank'], ascending=True).sample(10)[['person_name', 'discipline','height_rank']]

# Sort by height rank, and display the first 10 rows
tokyo_data.sort_values(['height_rank'], ascending=False).head(10)[['person_name', 'discipline','height_rank']]
tokyo_data.sort_values(['height_rank'], ascending=True).head(10)[['person_name', 'discipline','height_rank']]



tokyo_data['height_rank'] = tokyo_data['height_cm'].rank()
tokyo_data['height_rank'].sort_values(ascending=False)

# Grab all the datetime columns
tokyo_data.select_dtypes('datetime')

# Grab all the boolean columns
tokyo_data.select_dtypes('bool')

# Grab all the integer columns
tokyo_data.select_dtypes('int64')

# Grab all the floating point columns
tokyo_data.select_dtypes(include='float')

# Grab all the floating point columns
tokyo_data.select_dtypes(include='float', exclude=None)

# At least one of include or exclude must be nonempty
#tokyo_data.select_dtypes(include=None, exclude=None)

# Cumulative sum of height
tokyo_data['cumulative_height'] = tokyo_data['height_cm'].cumsum()
tokyo_data.head()



Unnamed: 0,person_name,country,discipline,born_date,height_cm,weight_kg,cumulative_height
0,AALERUD Katrine,Norway,Cycling Road,1886-07-11,135.67,88.0,135.67
1,ABAD Nestor,Spain,Artistic Gymnastics,1882-12-01,195.62,75.0,331.29
2,ABAGNALE Giovanni,Italy,Rowing,1894-09-11,130.17,76.3,461.46
3,ABALDE Alberto,Spain,Basketball,1899-02-12,155.33,63.8,616.79
4,ABALDE Tamara,Spain,Basketball,1896-03-12,135.67,75.2,752.46


INTRO TO DUCKDB

DuckDB is a relational (table-oriented) Database Management System (DBMS) that supports the Structured Query Language (SQL).
DuckDB is a high-performance analytical database system. It is designed to be fast, reliable, portable, and easy to use. DuckDB provides a rich SQL dialect, with support far beyond basic SQL.

DuckDB is available as a standalone CLI application and has clients for Python, R, Java, etc., with deep integration with packages such as pandas and dplyr.

Pandas DataFrames stored in local variables can be queried as if they are regular tables within DuckDB.

In [219]:
import duckdb
import pandas as pd
import numpy as np

print ('pandas version', pd.__version__)

# Tokyo Olympics
# Create a Pandas Dataframe
tokyo_data = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/tokyo_olympics.csv')

# query the Pandas DataFrame "tokyo_data"
# Note: duckdb.sql connects to the default in-memory database connection
# .df() formats the output as a dataframe
results = duckdb.sql("SELECT * FROM tokyo_data").df()
results = duckdb.sql("SELECT * FROM tokyo_data")
results

# SHOW DATABASES;

# USE memory;

# 'https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/tokyo_olympics.csv'

# CREATE TABLE mammals (id INTEGER, breed VARCHAR, age INTEGER);
# CREATE TABLE mammals AS select 85 AS age, 'Human' AS breed;
# INSERT INTO mammals VALUES ('Cat', 13), ('Human', 96), ('Tiger', 24);
# SHOW tables;
# SELECT * FROM mammals;

#CREATE TABLE scientists (
#    personid integer primary key,
#    firstname varchar(255) not null,
#    lastname varchar(255),
#    age integer
#);

#CREATE TABLE scientists (
#    personid integer primary key,
#    firstname varchar(255) not null,
#    lastname varchar(255),
#    age integer
#);

# CREATE SEQUENCE seq_personid START 1;
# INSERT INTO scientists VALUES (nextval('seq_personid'), 'Francis', 'Crick', 99), (nextval('seq_personid'), 'James', 'Watson', 96);
# INSERT INTO scientists VALUES (nextval('seq_personid'), 'JoAnn', 'Mason', 70);
# INSERT INTO Pests VALUES (nextval('seq_personid'), 'Oliver', 'Lowry', 86);

# UPDATE scientists SET firstname='Albert', lastname='Einstein' WHERE firstname='Oliver';

# DELETE FROM scientists WHERE firstname='Albert' AND lastname='Einstein';

# INSERT INTO scientists VALUES (nextval('seq_personid'), 'Oliver', 'Lowry', 86);
# INSERT INTO scientists VALUES (nextval('seq_personid'), 'Albert', 'Einstein', 89);

#	Syntax: ALTER TABLE table_name ADD column_name datatype
# ALTER TABLE scientists ADD nationality varchar(255)

# When you add a column, you need to update the new field for the existing records
# You can update multiple fields
# UPDATE scientists SET nationality='German', lastname='Einstein2' WHERE firstname='Albert';
# UPDATE scientists SET nationality='American' WHERE personid>=2  AND personid<6;
# UPDATE scientists SET nationality='British' WHERE firstname='Francis';

# Syntax: ALTER TABLE table_name DROP COLUMN column_name
#	ALTER TABLE scientists DROP COLUMN nationality

# Ascending order by default
height1 = duckdb.sql("SELECT * FROM  tokyo_data  ORDER  BY  height_cm").df()
height1

height2 = duckdb.sql("SELECT * FROM  tokyo_data  ORDER  BY  height_cm desc").df()
height2

height3 = duckdb.sql("SELECT * FROM  tokyo_data  ORDER  BY  height_cm desc")
height3

# Introduce an error by using an invalid column name (weight)
#height3 = duckdb.sql("SELECT * FROM  tokyo_data  ORDER  BY  weight asc").df()
#height3

tokyoselect2 = duckdb.sql("SELECT * FROM  read_csv_auto('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/tokyo_olympics.csv') LIMIT 13")
tokyoselect2

tokyoselect3 = duckdb.sql("SELECT * FROM read_csv_auto('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/tokyo_olympics.csv') WHERE discipline='Basketball' LIMIT 3").df()
tokyoselect3

# Example
# Select all athletes that either are from Spain with a height greater than 140, or have a height greater than 160 (from any country):

country_height = duckdb.sql("SELECT * FROM read_csv_auto('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/tokyo_olympics.csv') WHERE country = 'Spain' AND height_cm > 140")
country_height

country_height_df = duckdb.sql("SELECT * FROM read_csv_auto('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/tokyo_olympics.csv') WHERE country = 'Spain' AND height_cm > 140").df()
country_height_df

# Spain and 140 is seen as condition1, 160 is seen as condition2. It returns the union of the two.
country_height2 = duckdb.sql("SELECT * FROM read_csv_auto('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/tokyo_olympics.csv') WHERE country = 'Spain' AND height_cm > 140  OR  height_cm > 160")
country_height2


┌───────────────────┬────────────┬─────────────────────┬────────────┬───────────┬───────────┐
│    person_name    │  country   │     discipline      │ born_date  │ height_cm │ weight_kg │
│      varchar      │  varchar   │       varchar       │    date    │  double   │  double   │
├───────────────────┼────────────┼─────────────────────┼────────────┼───────────┼───────────┤
│ ABAD Nestor       │ Spain      │ Artistic Gymnastics │ 1882-12-01 │    195.62 │      75.0 │
│ ABALO Luc         │ France     │ Handball            │ 1881-01-12 │    195.62 │      88.3 │
│ ABBASOV Islam     │ Azerbaijan │ Wrestling           │ 1894-09-11 │    195.62 │      63.8 │
│ ABDALLA Abubaker  │ Qatar      │ Athletics           │ 1880-12-12 │    195.62 │      75.0 │
│ ABDEL LATIF Radwa │ Egypt      │ Shooting            │ 1896-03-12 │    195.62 │      88.3 │
│ ABALDE Alberto    │ Spain      │ Basketball          │ 1899-02-12 │    155.33 │      63.8 │
└───────────────────┴────────────┴─────────────────────┴────