# 0. Dataset

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('tutorial').getOrCreate()

In [2]:
from pathlib import Path

from seaborn import load_dataset

# to_csv() does not create missing parent directories, so make sure data/
# exists first; otherwise this cell fails on a fresh checkout.
Path('data').mkdir(parents=True, exist_ok=True)

# Build the tutorial dataset: the seaborn penguins sample without the two bill
# measurement columns, and with shorter names for the remaining measurements.
(load_dataset('penguins')
    .drop(columns=['bill_length_mm', 'bill_depth_mm'])
    .rename(columns={'flipper_length_mm': 'flipper', 'body_mass_g': 'mass'})
    .to_csv('data/penguins.csv', index=False))

# 1. Comparisons

### 1.1. Basics

#### 1.1.1. Data Import

In [3]:
# 🐼 pandas 
df_p = pd.read_csv('data/penguins.csv')
df_p.shape

(344, 5)

In [4]:
# 🎇 PySpark
df_s = spark.read.csv('data/penguins.csv', header=True, inferSchema=True)
df_s.count(), len(df_s.columns)

(344, 5)

#### 1.1.2. To check high level information about the data

In [5]:
# 🐼 pandas 
df_p.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   species  344 non-null    object 
 1   island   344 non-null    object 
 2   flipper  342 non-null    float64
 3   mass     342 non-null    float64
 4   sex      333 non-null    object 
dtypes: float64(2), object(3)
memory usage: 13.6+ KB


In [6]:
# 🎇 PySpark
df_s.printSchema()

root
 |-- species: string (nullable = true)
 |-- island: string (nullable = true)
 |-- flipper: double (nullable = true)
 |-- mass: double (nullable = true)
 |-- sex: string (nullable = true)



#### 1.1.3. To look at the head of data

In [7]:
# 🐼 pandas 
df_p.head()

Unnamed: 0,species,island,flipper,mass,sex
0,Adelie,Torgersen,181.0,3750.0,Male
1,Adelie,Torgersen,186.0,3800.0,Female
2,Adelie,Torgersen,195.0,3250.0,Female
3,Adelie,Torgersen,,,
4,Adelie,Torgersen,193.0,3450.0,Female


In [8]:
# 🎇 PySpark
df_s.show(5)

+-------+---------+-------+------+------+
|species|   island|flipper|  mass|   sex|
+-------+---------+-------+------+------+
| Adelie|Torgersen|  181.0|3750.0|  Male|
| Adelie|Torgersen|  186.0|3800.0|Female|
| Adelie|Torgersen|  195.0|3250.0|Female|
| Adelie|Torgersen|   null|  null|  null|
| Adelie|Torgersen|  193.0|3450.0|Female|
+-------+---------+-------+------+------+
only showing top 5 rows



#### 1.1.4. How to select columns

In [9]:
# 🐼 pandas 
df_p[['island', 'mass']].head(3)

Unnamed: 0,island,mass
0,Torgersen,3750.0
1,Torgersen,3800.0
2,Torgersen,3250.0


In [10]:
# 🎇 PySpark
df_s[['island', 'mass']].show(3)

+---------+------+
|   island|  mass|
+---------+------+
|Torgersen|3750.0|
|Torgersen|3800.0|
|Torgersen|3250.0|
+---------+------+
only showing top 3 rows



In [11]:
# The following forms, using .select(), are probably more common for selecting columns in PySpark:
df_s.select('island', 'mass').show(3)

+---------+------+
|   island|  mass|
+---------+------+
|Torgersen|3750.0|
|Torgersen|3800.0|
|Torgersen|3250.0|
+---------+------+
only showing top 3 rows



In [12]:
df_s.select(['island', 'mass']).show(3)

+---------+------+
|   island|  mass|
+---------+------+
|Torgersen|3750.0|
|Torgersen|3800.0|
|Torgersen|3250.0|
+---------+------+
only showing top 3 rows



### 1.2. Filtering

#### 1.2.1. How to filter the data based on a condition

In [13]:
# 🐼 pandas 
df_p[df_p['species'] == 'Gentoo'].head()

Unnamed: 0,species,island,flipper,mass,sex
220,Gentoo,Biscoe,211.0,4500.0,Female
221,Gentoo,Biscoe,230.0,5700.0,Male
222,Gentoo,Biscoe,210.0,4450.0,Female
223,Gentoo,Biscoe,218.0,5700.0,Male
224,Gentoo,Biscoe,215.0,5400.0,Male


In [14]:
# 🎇 PySpark
df_s[df_s['species'] == 'Gentoo'].show(5)

+-------+------+-------+------+------+
|species|island|flipper|  mass|   sex|
+-------+------+-------+------+------+
| Gentoo|Biscoe|  211.0|4500.0|Female|
| Gentoo|Biscoe|  230.0|5700.0|  Male|
| Gentoo|Biscoe|  210.0|4450.0|Female|
| Gentoo|Biscoe|  218.0|5700.0|  Male|
| Gentoo|Biscoe|  215.0|5400.0|  Male|
+-------+------+-------+------+------+
only showing top 5 rows



In [15]:
# We can also use .filter(), either with a column expression (here) or with a SQL string (next cell)
df_s.filter(df_s['species'] == 'Gentoo').show(5)

+-------+------+-------+------+------+
|species|island|flipper|  mass|   sex|
+-------+------+-------+------+------+
| Gentoo|Biscoe|  211.0|4500.0|Female|
| Gentoo|Biscoe|  230.0|5700.0|  Male|
| Gentoo|Biscoe|  210.0|4450.0|Female|
| Gentoo|Biscoe|  218.0|5700.0|  Male|
| Gentoo|Biscoe|  215.0|5400.0|  Male|
+-------+------+-------+------+------+
only showing top 5 rows



In [16]:
df_s.filter("species == 'Gentoo'").show(5) 

+-------+------+-------+------+------+
|species|island|flipper|  mass|   sex|
+-------+------+-------+------+------+
| Gentoo|Biscoe|  211.0|4500.0|Female|
| Gentoo|Biscoe|  230.0|5700.0|  Male|
| Gentoo|Biscoe|  210.0|4450.0|Female|
| Gentoo|Biscoe|  218.0|5700.0|  Male|
| Gentoo|Biscoe|  215.0|5400.0|  Male|
+-------+------+-------+------+------+
only showing top 5 rows



#### Below shows a few common filter comparisons:

In [17]:
# Reference snippets only (nothing in this cell is executed).
# `df` stands for df_p in the pandas section and df_s in the PySpark section.
# Suffix "a" is a condition and "b" is its negation; 1a (equality) is shown in the cells above.
#
# 🐼 pandas
# 2a df[df['species'].isin(['Chinstrap', 'Gentoo'])].head()
# 3a df[df['species'].str.match('G.')].head()
# 4a df[df['flipper'].between(225,229)].head()
# 5a df[df['mass'].isnull()].head()
# 1b df.loc[df['species']!='Gentoo'].head()
# 2b df[~df['species'].isin(['Chinstrap', 'Gentoo'])].head()
# 3b df[~df['species'].str.match('G.')].head()
# 4b df[~df['flipper'].between(225,229)].head()
# 5b df[df['mass'].notnull()].head()
# 6 df[(df['mass']<3400) & (df['sex']=='Male')].head()
# 7 df[(df['mass']<3400) | (df['sex']=='Male')].head()
#
# 🎇 PySpark
# (rlike searches anywhere in the string, so the pattern is anchored with ^
#  to mirror pandas' str.match, which only matches at the start)
# 2a df[df['species'].isin(['Chinstrap', 'Gentoo'])].show(5)
# 3a df[df['species'].rlike('^G.')].show(5)
# 4a df[df['flipper'].between(225,229)].show(5)
# 5a df[df['mass'].isNull()].show(5)
# 1b df[df['species']!='Gentoo'].show(5)
# 2b df[~df['species'].isin(['Chinstrap', 'Gentoo'])].show(5)
# 3b df[~df['species'].rlike('^G.')].show(5)
# 4b df[~df['flipper'].between(225,229)].show(5)
# 5b df[df['mass'].isNotNull()].show(5)
# 6 df[(df['mass']<3400) & (df['sex']=='Male')].show(5)
# 7 df[(df['mass']<3400) | (df['sex']=='Male')].show(5)

### 1.3. Sorting

#### 1.3.1. To inspect 5 rows with smallest mass:

In [18]:
# 🐼 pandas 
df_p.nsmallest(5, 'mass')

Unnamed: 0,species,island,flipper,mass,sex
190,Chinstrap,Dream,192.0,2700.0,Female
58,Adelie,Biscoe,181.0,2850.0,Female
64,Adelie,Biscoe,184.0,2850.0,Female
54,Adelie,Biscoe,187.0,2900.0,Female
98,Adelie,Dream,178.0,2900.0,Female


In [19]:
# 🎇 PySpark
# A plain ascending sort does not drop missing values: rows with a null mass come first.
df_s.orderBy('mass').show(5)  # nulls are included

+---------+---------+-------+------+------+
|  species|   island|flipper|  mass|   sex|
+---------+---------+-------+------+------+
|   Gentoo|   Biscoe|   null|  null|  null|
|   Adelie|Torgersen|   null|  null|  null|
|Chinstrap|    Dream|  192.0|2700.0|Female|
|   Adelie|   Biscoe|  181.0|2850.0|Female|
|   Adelie|   Biscoe|  184.0|2850.0|Female|
+---------+---------+-------+------+------+
only showing top 5 rows



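Pandas' `.nsmallest()` and `.nlargest()` methods sensibly exclude missing values. However, PySpark doesn’t have equivalent methods, and an ascending sort places nulls first, so rows with a missing value have to be filtered out explicitly.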

In [20]:
# Correct way: filter out rows with a null mass first, then sort ascending
df_s[df_s['mass'].isNotNull()].orderBy('mass').show(5)

+---------+------+-------+------+------+
|  species|island|flipper|  mass|   sex|
+---------+------+-------+------+------+
|Chinstrap| Dream|  192.0|2700.0|Female|
|   Adelie|Biscoe|  184.0|2850.0|Female|
|   Adelie|Biscoe|  181.0|2850.0|Female|
|   Adelie|Biscoe|  187.0|2900.0|Female|
|   Adelie| Dream|  178.0|2900.0|Female|
+---------+------+-------+------+------+
only showing top 5 rows



#### 1.3.2. Another way of sorting, using `.sort()` instead of `.orderBy()`:

In [21]:
# 🐼 pandas 
df_p.nlargest(5, 'mass')

Unnamed: 0,species,island,flipper,mass,sex
237,Gentoo,Biscoe,221.0,6300.0,Male
253,Gentoo,Biscoe,230.0,6050.0,Male
297,Gentoo,Biscoe,220.0,6000.0,Male
337,Gentoo,Biscoe,222.0,6000.0,Male
299,Gentoo,Biscoe,223.0,5950.0,Male


In [22]:
# 🎇 PySpark
df_s.sort('mass', ascending=False).show(5)

+-------+------+-------+------+----+
|species|island|flipper|  mass| sex|
+-------+------+-------+------+----+
| Gentoo|Biscoe|  221.0|6300.0|Male|
| Gentoo|Biscoe|  230.0|6050.0|Male|
| Gentoo|Biscoe|  220.0|6000.0|Male|
| Gentoo|Biscoe|  222.0|6000.0|Male|
| Gentoo|Biscoe|  229.0|5950.0|Male|
+-------+------+-------+------+----+
only showing top 5 rows



#### The following variations of the syntax also work:

In [23]:
# df_s.sort(df_s['mass'].desc()).show(5)
# df_s.orderBy('mass', ascending=False).show(5)
# df_s.orderBy(df_s['mass'].desc()).show(5)

#### 1.3.3. To sort by multiple columns

In [24]:
# 🐼 pandas 
df_p.sort_values(['mass', 'flipper'], ascending=False).head()

Unnamed: 0,species,island,flipper,mass,sex
237,Gentoo,Biscoe,221.0,6300.0,Male
253,Gentoo,Biscoe,230.0,6050.0,Male
337,Gentoo,Biscoe,222.0,6000.0,Male
297,Gentoo,Biscoe,220.0,6000.0,Male
331,Gentoo,Biscoe,229.0,5950.0,Male


In [25]:
# 🎇 PySpark
df_s.orderBy(['mass', 'flipper'], ascending=False).show(5)

+-------+------+-------+------+----+
|species|island|flipper|  mass| sex|
+-------+------+-------+------+----+
| Gentoo|Biscoe|  221.0|6300.0|Male|
| Gentoo|Biscoe|  230.0|6050.0|Male|
| Gentoo|Biscoe|  222.0|6000.0|Male|
| Gentoo|Biscoe|  220.0|6000.0|Male|
| Gentoo|Biscoe|  229.0|5950.0|Male|
+-------+------+-------+------+----+
only showing top 5 rows



In PySpark, you can get away without the list like this: `df_s.orderBy('mass', 'flipper', ascending=False).show(5)`

#### 1.3.4. To sort by multiple columns but in different directions:

In [26]:
# 🐼 pandas 
df_p.sort_values(['mass', 'flipper'], ascending=[True, False]).head()

Unnamed: 0,species,island,flipper,mass,sex
190,Chinstrap,Dream,192.0,2700.0,Female
64,Adelie,Biscoe,184.0,2850.0,Female
58,Adelie,Biscoe,181.0,2850.0,Female
116,Adelie,Torgersen,188.0,2900.0,Female
54,Adelie,Biscoe,187.0,2900.0,Female


In [27]:
# 🎇 PySpark
df_s[df_s['mass'].isNotNull()]\
  .sort('mass', 'flipper', ascending=[True, False]).show(5)

+---------+---------+-------+------+------+
|  species|   island|flipper|  mass|   sex|
+---------+---------+-------+------+------+
|Chinstrap|    Dream|  192.0|2700.0|Female|
|   Adelie|   Biscoe|  184.0|2850.0|Female|
|   Adelie|   Biscoe|  181.0|2850.0|Female|
|   Adelie|Torgersen|  188.0|2900.0|Female|
|   Adelie|   Biscoe|  187.0|2900.0|Female|
+---------+---------+-------+------+------+
only showing top 5 rows



In [28]:
# an alternative:
df_s[df_s['mass'].isNotNull()]\
  .orderBy(df_s['mass'].asc(), df_s['flipper'].desc()).show(5)

+---------+---------+-------+------+------+
|  species|   island|flipper|  mass|   sex|
+---------+---------+-------+------+------+
|Chinstrap|    Dream|  192.0|2700.0|Female|
|   Adelie|   Biscoe|  184.0|2850.0|Female|
|   Adelie|   Biscoe|  181.0|2850.0|Female|
|   Adelie|Torgersen|  188.0|2900.0|Female|
|   Adelie|   Biscoe|  187.0|2900.0|Female|
+---------+---------+-------+------+------+
only showing top 5 rows



### 1.4. Aggregation

#### 1.4.1. Simple aggregation

In [29]:
# 🐼 pandas 
df_p.agg({'flipper': 'mean'})

flipper    200.915205
dtype: float64

In [30]:
# 🎇 PySpark
df_s.agg({'flipper': 'mean'}).show()

+------------------+
|      avg(flipper)|
+------------------+
|200.91520467836258|
+------------------+



#### 1.4.2. Multiple aggregations

In [31]:
# 🐼 pandas 
df_p.agg({'flipper': ['min', 'max']})

Unnamed: 0,flipper
min,172.0
max,231.0


In [32]:
# 🎇 PySpark
from pyspark.sql import functions as F
df_s.agg(F.min('flipper'), F.max('flipper')).show()

+------------+------------+
|min(flipper)|max(flipper)|
+------------+------------+
|       172.0|       231.0|
+------------+------------+



#### 1.4.3. To get distinct values in a column

In [33]:
# 🐼 pandas 
df_p['species'].unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [34]:
# 🎇 PySpark
df_s.select('species').distinct().show()

+---------+
|  species|
+---------+
|   Gentoo|
|   Adelie|
|Chinstrap|
+---------+



#### 1.4.4. To get the number of distinct values in a column:

In [35]:
# 🐼 pandas 
df_p['species'].nunique()

3

In [36]:
# 🎇 PySpark
# NOTE: distinct() keeps null as a value of its own, whereas pandas' nunique()
# skips NaN by default. The two counts agree here only because `species` has
# no missing values (344 non-null out of 344 rows).
df_s.select('species').distinct().count()

3

### 1.5. Aggregation by group

#### 1.5.1. Simple group by aggregation

In [37]:
# 🐼 pandas 
df_p.groupby('species')['mass'].mean()

species
Adelie       3700.662252
Chinstrap    3733.088235
Gentoo       5076.016260
Name: mass, dtype: float64

In [38]:
# 🎇 PySpark
df_s.groupBy('species').agg({'mass': 'mean'}).show()

+---------+------------------+
|  species|         avg(mass)|
+---------+------------------+
|   Gentoo| 5076.016260162602|
|   Adelie| 3700.662251655629|
|Chinstrap|3733.0882352941176|
+---------+------------------+



#### 1.5.2. Aggregating multiple selected columns

In [39]:
# 🐼 pandas 
df_p.groupby('species').agg({'flipper': 'sum', 'mass': 'mean'})

Unnamed: 0_level_0,flipper,mass
species,Unnamed: 1_level_1,Unnamed: 2_level_1
Adelie,28683.0,3700.662252
Chinstrap,13316.0,3733.088235
Gentoo,26714.0,5076.01626


In [40]:
# 🎇 PySpark
df_s.groupBy('species').agg({'flipper': 'sum', 'mass': 'mean'}).show()

+---------+------------+------------------+
|  species|sum(flipper)|         avg(mass)|
+---------+------------+------------------+
|   Gentoo|     26714.0| 5076.016260162602|
|   Adelie|     28683.0| 3700.662251655629|
|Chinstrap|     13316.0|3733.0882352941176|
+---------+------------+------------------+



#### If we don’t specify a column, it will show stats for all numerical columns:

In [41]:
# 🐼 pandas
# numeric_only=True restricts the mean to the numeric columns (flipper, mass).
# Without it, pandas >= 2.0 raises a TypeError on the string columns (island, sex)
# instead of silently dropping them.
df_p.groupby('species').mean(numeric_only=True)

Unnamed: 0_level_0,flipper,mass
species,Unnamed: 1_level_1,Unnamed: 2_level_1
Adelie,189.953642,3700.662252
Chinstrap,195.823529,3733.088235
Gentoo,217.186992,5076.01626


In [42]:
# 🎇 PySpark
df_s.groupBy('species').mean().show()

+---------+------------------+------------------+
|  species|      avg(flipper)|         avg(mass)|
+---------+------------------+------------------+
|   Gentoo| 217.1869918699187| 5076.016260162602|
|   Adelie|189.95364238410596| 3700.662251655629|
|Chinstrap| 195.8235294117647|3733.0882352941176|
+---------+------------------+------------------+

