In [13]:
# Libraries.
try:
  from pyspark.sql import SparkSession, DataFrame, functions as f
except:
  !pip install pyspark
  from pyspark.sql import SparkSession, DataFrame, functions as f

In [14]:
# Fictional data:
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'],
            'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'],
            'deaths': [523, 52, 25, 616, 43, 234, 523, 62, 62, 73, 37, 35],
            'battles': [5, 42, 2, 2, 4, 7, 8, 3, 4, 7, 8, 9],
            'size': [1045, 957, 1099, 1400, 1592, 1006, 987, 849, 973, 1005, 1099, 1523],
            'veterans': [1, 5, 62, 26, 73, 37, 949, 48, 48, 435, 63, 345],
            'readiness': [1, 2, 3, 3, 2, 1, 2, 3, 2, 1, 2, 3],
            'armored': [1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1],
            'deserters': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
            'origin': ['Arizona', 'California', 'Texas', 'Florida', 'Maine', 'Iowa', 'Alaska', 'Washington', 'Oregon', 'Wyoming', 'Louisana', 'Georgia']}

spark = SparkSession.builder.appName('Army').getOrCreate()

# Create a dataframe based on the fictional data.
data_rows = list(zip(*raw_data.values()))
schema = list(raw_data.keys())

army = spark.createDataFrame(data_rows, schema)

In [15]:
# Print only the column veterans.
army.select('veterans').limit(5).show()

+--------+
|veterans|
+--------+
|       1|
|       5|
|      62|
|      26|
|      73|
+--------+



In [16]:
# Print the columns 'veterans' and 'deaths'.
army.select('veterans', 'deaths').limit(5).show()

+--------+------+
|veterans|deaths|
+--------+------+
|       1|   523|
|       5|    52|
|      62|    25|
|      26|   616|
|      73|    43|
+--------+------+



In [17]:
# Print the name of all the columns.
print(f"Columns: {army.columns[0]}", *army.columns[1:], sep = ", ", end = ".")

Columns: regiment, company, deaths, battles, size, veterans, readiness, armored, deserters, origin.

In [26]:
# Select the 'deaths', 'size' and 'deserters' columns from Maine and Alaska.
army \
  .select("origin", "deaths", "size", "deserters") \
  .filter(army.origin.isin("Maine", "Alaska")) \
  .limit(10) \
  .show()

+------+------+----+---------+
|origin|deaths|size|deserters|
+------+------+----+---------+
| Maine|    43|1592|        3|
|Alaska|   523| 987|       24|
+------+------+----+---------+



In [35]:
# Select the rows 3 to 7 and the columns 3 to 6.
army.select(*army.columns[2:6]).collect()[2:7]

[Row(deaths=25, battles=2, size=1099, veterans=62),
 Row(deaths=616, battles=2, size=1400, veterans=26),
 Row(deaths=43, battles=4, size=1592, veterans=73),
 Row(deaths=234, battles=7, size=1006, veterans=37),
 Row(deaths=523, battles=8, size=987, veterans=949)]

In [44]:
# Select every row after the fourth row and all columns.
army.collect()[4:]

[Row(regiment='Dragoons', company='1st', deaths=43, battles=4, size=1592, veterans=73, readiness=2, armored=0, deserters=3, origin='Maine'),
 Row(regiment='Dragoons', company='1st', deaths=234, battles=7, size=1006, veterans=37, readiness=1, armored=1, deserters=4, origin='Iowa'),
 Row(regiment='Dragoons', company='2nd', deaths=523, battles=8, size=987, veterans=949, readiness=2, armored=0, deserters=24, origin='Alaska'),
 Row(regiment='Dragoons', company='2nd', deaths=62, battles=3, size=849, veterans=48, readiness=3, armored=1, deserters=31, origin='Washington'),
 Row(regiment='Scouts', company='1st', deaths=62, battles=4, size=973, veterans=48, readiness=2, armored=0, deserters=2, origin='Oregon'),
 Row(regiment='Scouts', company='1st', deaths=73, battles=7, size=1005, veterans=435, readiness=1, armored=0, deserters=3, origin='Wyoming'),
 Row(regiment='Scouts', company='2nd', deaths=37, battles=8, size=1099, veterans=63, readiness=2, armored=1, deserters=2, origin='Louisana'),
 Row(

In [45]:
# Select every row up to the fourth row and all columns.
army.collect()[:4]

[Row(regiment='Nighthawks', company='1st', deaths=523, battles=5, size=1045, veterans=1, readiness=1, armored=1, deserters=4, origin='Arizona'),
 Row(regiment='Nighthawks', company='1st', deaths=52, battles=42, size=957, veterans=5, readiness=2, armored=0, deserters=24, origin='California'),
 Row(regiment='Nighthawks', company='2nd', deaths=25, battles=2, size=1099, veterans=62, readiness=3, armored=1, deserters=31, origin='Texas'),
 Row(regiment='Nighthawks', company='2nd', deaths=616, battles=2, size=1400, veterans=26, readiness=3, armored=1, deserters=2, origin='Florida')]

In [46]:
# Select the 3rd column up to the 7th column.
army.select(*army.columns[3:7]).show()

+-------+----+--------+---------+
|battles|size|veterans|readiness|
+-------+----+--------+---------+
|      5|1045|       1|        1|
|     42| 957|       5|        2|
|      2|1099|      62|        3|
|      2|1400|      26|        3|
|      4|1592|      73|        2|
|      7|1006|      37|        1|
|      8| 987|     949|        2|
|      3| 849|      48|        3|
|      4| 973|      48|        2|
|      7|1005|     435|        1|
|      8|1099|      63|        2|
|      9|1523|     345|        3|
+-------+----+--------+---------+



In [47]:
# Select rows where df.deaths is greater than 50.
army.filter(army.deaths > 50).show()

+----------+-------+------+-------+----+--------+---------+-------+---------+----------+
|  regiment|company|deaths|battles|size|veterans|readiness|armored|deserters|    origin|
+----------+-------+------+-------+----+--------+---------+-------+---------+----------+
|Nighthawks|    1st|   523|      5|1045|       1|        1|      1|        4|   Arizona|
|Nighthawks|    1st|    52|     42| 957|       5|        2|      0|       24|California|
|Nighthawks|    2nd|   616|      2|1400|      26|        3|      1|        2|   Florida|
|  Dragoons|    1st|   234|      7|1006|      37|        1|      1|        4|      Iowa|
|  Dragoons|    2nd|   523|      8| 987|     949|        2|      0|       24|    Alaska|
|  Dragoons|    2nd|    62|      3| 849|      48|        3|      1|       31|Washington|
|    Scouts|    1st|    62|      4| 973|      48|        2|      0|        2|    Oregon|
|    Scouts|    1st|    73|      7|1005|     435|        1|      0|        3|   Wyoming|
+----------+-------+-

In [48]:
# Select rows where df.deaths is greater than 500 or less than 50.
army.filter((army.deaths < 50) | (army.deaths > 500)).show()

+----------+-------+------+-------+----+--------+---------+-------+---------+--------+
|  regiment|company|deaths|battles|size|veterans|readiness|armored|deserters|  origin|
+----------+-------+------+-------+----+--------+---------+-------+---------+--------+
|Nighthawks|    1st|   523|      5|1045|       1|        1|      1|        4| Arizona|
|Nighthawks|    2nd|    25|      2|1099|      62|        3|      1|       31|   Texas|
|Nighthawks|    2nd|   616|      2|1400|      26|        3|      1|        2| Florida|
|  Dragoons|    1st|    43|      4|1592|      73|        2|      0|        3|   Maine|
|  Dragoons|    2nd|   523|      8| 987|     949|        2|      0|       24|  Alaska|
|    Scouts|    2nd|    37|      8|1099|      63|        2|      1|        2|Louisana|
|    Scouts|    2nd|    35|      9|1523|     345|        3|      1|        3| Georgia|
+----------+-------+------+-------+----+--------+---------+-------+---------+--------+



In [49]:
# Select all the regiments not name "Dragoons".
army.filter(army.regiment != "Dragoons").show()

+----------+-------+------+-------+----+--------+---------+-------+---------+----------+
|  regiment|company|deaths|battles|size|veterans|readiness|armored|deserters|    origin|
+----------+-------+------+-------+----+--------+---------+-------+---------+----------+
|Nighthawks|    1st|   523|      5|1045|       1|        1|      1|        4|   Arizona|
|Nighthawks|    1st|    52|     42| 957|       5|        2|      0|       24|California|
|Nighthawks|    2nd|    25|      2|1099|      62|        3|      1|       31|     Texas|
|Nighthawks|    2nd|   616|      2|1400|      26|        3|      1|        2|   Florida|
|    Scouts|    1st|    62|      4| 973|      48|        2|      0|        2|    Oregon|
|    Scouts|    1st|    73|      7|1005|     435|        1|      0|        3|   Wyoming|
|    Scouts|    2nd|    37|      8|1099|      63|        2|      1|        2|  Louisana|
|    Scouts|    2nd|    35|      9|1523|     345|        3|      1|        3|   Georgia|
+----------+-------+-

In [51]:
# Select the rows called Texas and Arizona.
army.filter(army.origin.isin("Texas", "Arizona")).show()

+----------+-------+------+-------+----+--------+---------+-------+---------+-------+
|  regiment|company|deaths|battles|size|veterans|readiness|armored|deserters| origin|
+----------+-------+------+-------+----+--------+---------+-------+---------+-------+
|Nighthawks|    1st|   523|      5|1045|       1|        1|      1|        4|Arizona|
|Nighthawks|    2nd|    25|      2|1099|      62|        3|      1|       31|  Texas|
+----------+-------+------+-------+----+--------+---------+-------+---------+-------+



In [55]:
# Select the third cell in the row named Arizona.
army.select(army.columns[2]).filter(army.origin.isin("Arizona")).collect()

[Row(deaths=523)]

In [57]:
# Select the third cell down in the column named deaths.
army.select("deaths").collect()[2]

Row(deaths=25)