In [1]:

from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [2]:
conf = SparkConf()
spark_context = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:

spark = SparkSession.builder.master("spark://spark-master:7077").config("spark.jars.packages", 
                                                                        "org.apache.hadoop:hadoop-aws-2.7.3").appName("spark-example").getOrCreate()

In [4]:
csv_file = "/home/lec_14/data/oscars.csv"
df_csv = spark.read.csv(csv_file, header=True, inferSchema=True)

df_csv.show()

+--------+--------+------------+-------------------+--------------------+---+-------------+----------+--------+-------+-------+
|oscar_no|oscar_yr|       award|               name|               movie|age|     birth_pl|birth_date|birth_mo|birth_d|birth_y|
+--------+--------+------------+-------------------+--------------------+---+-------------+----------+--------+-------+-------+
|       1|    1929|Best actress|       Janet Gaynor|          7th Heaven| 22| Pennsylvania|1906-10-06|      10|      6|   1906|
|       2|    1930|Best actress|      Mary Pickford|            Coquette| 37|       Canada|1892-04-08|       4|      8|   1892|
|       3|    1931|Best actress|      Norma Shearer|        The Divorcee| 28|       Canada|1902-08-10|       8|     10|   1902|
|       4|    1932|Best actress|     Marie Dressler|        Min and Bill| 63|       Canada|1868-11-09|      11|      9|   1868|
|       5|    1933|Best actress|        Helen Hayes|The Sin of Madelo...| 32|Washington DC|1900-10-10|  

In [5]:
df_csv.explain(True)

== Parsed Logical Plan ==
Relation [oscar_no#17,oscar_yr#18,award#19,name#20,movie#21,age#22,birth_pl#23,birth_date#24,birth_mo#25,birth_d#26,birth_y#27] csv

== Analyzed Logical Plan ==
oscar_no: int, oscar_yr: int, award: string, name: string, movie: string, age: int, birth_pl: string, birth_date: date, birth_mo: int, birth_d: int, birth_y: int
Relation [oscar_no#17,oscar_yr#18,award#19,name#20,movie#21,age#22,birth_pl#23,birth_date#24,birth_mo#25,birth_d#26,birth_y#27] csv

== Optimized Logical Plan ==
Relation [oscar_no#17,oscar_yr#18,award#19,name#20,movie#21,age#22,birth_pl#23,birth_date#24,birth_mo#25,birth_d#26,birth_y#27] csv

== Physical Plan ==
FileScan csv [oscar_no#17,oscar_yr#18,award#19,name#20,movie#21,age#22,birth_pl#23,birth_date#24,birth_mo#25,birth_d#26,birth_y#27] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/lec_14/data/oscars.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<oscar_no:int,oscar_yr:int

In [6]:
df_csv.printSchema()

root
 |-- oscar_no: integer (nullable = true)
 |-- oscar_yr: integer (nullable = true)
 |-- award: string (nullable = true)
 |-- name: string (nullable = true)
 |-- movie: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- birth_pl: string (nullable = true)
 |-- birth_date: date (nullable = true)
 |-- birth_mo: integer (nullable = true)
 |-- birth_d: integer (nullable = true)
 |-- birth_y: integer (nullable = true)



In [7]:
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("year", StringType(), True),
    StructField("age", IntegerType(), True) #Wrong type
])

df_csv_with_schema = spark.read.schema(schema).csv(csv_file, header=True)

df_csv_with_schema.show()

+---+----+----+
| id|year| age|
+---+----+----+
|  1|1929|NULL|
|  2|1930|NULL|
|  3|1931|NULL|
|  4|1932|NULL|
|  5|1933|NULL|
|  6|1934|NULL|
|  7|1935|NULL|
|  8|1936|NULL|
|  9|1937|NULL|
| 10|1938|NULL|
| 11|1939|NULL|
| 12|1940|NULL|
| 13|1941|NULL|
| 14|1942|NULL|
| 15|1943|NULL|
| 16|1944|NULL|
| 17|1945|NULL|
| 18|1946|NULL|
| 19|1947|NULL|
| 20|1948|NULL|
+---+----+----+
only showing top 20 rows



In [8]:

df_csv_with_schema.printSchema()

root
 |-- id: integer (nullable = true)
 |-- year: string (nullable = true)
 |-- age: integer (nullable = true)



In [9]:
# Load CSV file with specific options
df_csv_options = spark.read.options(header='True', 
                                    inferSchema='True', 
                                    delimiter=',',
                                    quote='"',
                                    dateFormat='yyyy-MM-dd',
                                    escape='\\').csv(csv_file)

df_csv_options.show()

+--------+--------+------------+-------------------+--------------------+---+-------------+----------+--------+-------+-------+
|oscar_no|oscar_yr|       award|               name|               movie|age|     birth_pl|birth_date|birth_mo|birth_d|birth_y|
+--------+--------+------------+-------------------+--------------------+---+-------------+----------+--------+-------+-------+
|       1|    1929|Best actress|       Janet Gaynor|          7th Heaven| 22| Pennsylvania|1906-10-06|      10|      6|   1906|
|       2|    1930|Best actress|      Mary Pickford|            Coquette| 37|       Canada|1892-04-08|       4|      8|   1892|
|       3|    1931|Best actress|      Norma Shearer|        The Divorcee| 28|       Canada|1902-08-10|       8|     10|   1902|
|       4|    1932|Best actress|     Marie Dressler|        Min and Bill| 63|       Canada|1868-11-09|      11|      9|   1868|
|       5|    1933|Best actress|        Helen Hayes|The Sin of Madelo...| 32|Washington DC|1900-10-10|  

In [10]:
json_file = "/home/lec_14/data/sales.json"
df_json = spark.read.json(json_file, multiLine=True)

df_json.show()

+-----------------+-----+--------------+-------------+
|           client|price|       product|purchase_date|
+-----------------+-----+--------------+-------------+
|     Norma Fisher|  121|Vacuum cleaner|   2022-08-09|
|     Norma Fisher|  348|Microwave oven|   2022-08-13|
|     Norma Fisher| 1126|         Phone|   2022-08-12|
|   Jorge Sullivan|  171|Microwave oven|   2022-08-10|
|  Elizabeth Woods| 1766|            TV|   2022-08-26|
|     Susan Wagner|  461|Microwave oven|   2022-08-26|
|     Susan Wagner|  561|Microwave oven|   2022-08-05|
| Peter Montgomery| 1994|            TV|   2022-08-03|
| Peter Montgomery| 2804|coffee machine|   2022-08-16|
|Stephanie Collins|  403|Vacuum cleaner|   2022-08-20|
|Stephanie Collins| 1775|coffee machine|   2022-08-18|
| Stephanie Sutton|  613|Vacuum cleaner|   2022-08-09|
| Stephanie Sutton| 2148|            TV|   2022-08-30|
| Stephanie Sutton|  568|Microwave oven|   2022-08-01|
|       Susan Levy|  109|coffee machine|   2022-08-20|
|       Su

In [11]:
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("details", StructType([
        StructField("age", IntegerType(), True),
        StructField("address", StructType([
            StructField("city", StringType(), True),
            StructField("state", StringType(), True)
        ]), True)
    ]), True)
])

nested_json_file = "/home/lec_14/data/few_files/*.json"
df_nested_json = spark.read.schema(schema).json(nested_json_file, multiLine=True)

df_nested_json.show(truncate=False)

+---+---------------+-----------------------+
|id |name           |details                |
+---+---------------+-----------------------+
|2  |Jane Smith     |{25, {Los Angeles, CA}}|
|3  |Michael Johnson|{40, {Chicago, IL}}    |
|1  |John Doe       |{30, {New York, NY}}   |
+---+---------------+-----------------------+



In [12]:
df_nested_json.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- details: struct (nullable = true)
 |    |-- age: integer (nullable = true)
 |    |-- address: struct (nullable = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- state: string (nullable = true)



In [13]:
parquet_file = "/home/lec_14/data/titanic.parquet"
df_parquet = spark.read.parquet(parquet_file)

# Show DataFrame
df_parquet.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|     3|    Moran, Mr. James|  male|NULL|    0|    0|      

In [14]:
text_file = "/home/lec_14/data/android.txt"
df_text = spark.read.text(text_file)

# Show DataFrame
df_text.show()

+--------------------+
|               value|
+--------------------+
|03-17 16:13:38.81...|
|03-17 16:13:38.81...|
|03-17 16:13:38.82...|
|03-17 16:13:38.83...|
|03-17 16:13:38.85...|
|03-17 16:13:38.86...|
|03-17 16:13:38.86...|
|03-17 16:13:38.87...|
|03-17 16:13:38.87...|
|03-17 16:13:38.87...|
|03-17 16:13:38.88...|
|03-17 16:13:38.88...|
|03-17 16:13:38.88...|
|03-17 16:13:38.88...|
|03-17 16:13:38.90...|
|03-17 16:13:38.90...|
|03-17 16:13:38.91...|
|03-17 16:13:38.92...|
|03-17 16:13:38.92...|
|03-17 16:13:38.93...|
+--------------------+
only showing top 20 rows



In [15]:
sc = spark.sparkContext

# Load text file into an RDD
rdd = sc.textFile(text_file)

In [16]:
rdd.take(10)

['03-17 16:13:38.811  1702  2395 D WindowManager: printFreezingDisplayLogsopening app wtoken = AppWindowToken{9f4ef63 token=Token{a64f992 ActivityRecord{de9231d u0 com.tencent.qt.qtl/.activity.info.NewsDetailXmlActivity t761}}}, allDrawn= false, startingDisplayed =  false, startingMoved =  false, isRelaunching =  false',
 '03-17 16:13:38.819  1702  8671 D PowerManagerService: acquire lock=233570404, flags=0x1, tag="View Lock", name=com.android.systemui, ws=null, uid=10037, pid=2227',
 '03-17 16:13:38.820  1702  8671 D PowerManagerService: ready=true,policy=3,wakefulness=1,wksummary=0x23,uasummary=0x1,bootcompleted=true,boostinprogress=false,waitmodeenable=false,mode=false,manual=38,auto=-1,adj=0.0userId=0',
 '03-17 16:13:38.839  1702  2113 V WindowManager: Skipping AppWindowToken{df0798e token=Token{78af589 ActivityRecord{3b04890 u0 com.tencent.qt.qtl/com.tencent.video.player.activity.PlayerActivity t761}}} -- going to hide',
 '03-17 16:13:38.859  2227  2227 D TextView: visible is syst

In [20]:
rdd.take(10)

['03-17 16:13:38.811  1702  2395 D WindowManager: printFreezingDisplayLogsopening app wtoken = AppWindowToken{9f4ef63 token=Token{a64f992 ActivityRecord{de9231d u0 com.tencent.qt.qtl/.activity.info.NewsDetailXmlActivity t761}}}, allDrawn= false, startingDisplayed =  false, startingMoved =  false, isRelaunching =  false',
 '03-17 16:13:38.819  1702  8671 D PowerManagerService: acquire lock=233570404, flags=0x1, tag="View Lock", name=com.android.systemui, ws=null, uid=10037, pid=2227',
 '03-17 16:13:38.820  1702  8671 D PowerManagerService: ready=true,policy=3,wakefulness=1,wksummary=0x23,uasummary=0x1,bootcompleted=true,boostinprogress=false,waitmodeenable=false,mode=false,manual=38,auto=-1,adj=0.0userId=0',
 '03-17 16:13:38.839  1702  2113 V WindowManager: Skipping AppWindowToken{df0798e token=Token{78af589 ActivityRecord{3b04890 u0 com.tencent.qt.qtl/com.tencent.video.player.activity.PlayerActivity t761}}} -- going to hide',
 '03-17 16:13:38.859  2227  2227 D TextView: visible is syst

In [21]:
line_count = rdd.count()
print(f"Number of lines: {line_count}")

Number of lines: 2000


In [22]:
first_line = rdd.first()
print(f"First line: {first_line}")

First line: 03-17 16:13:38.811  1702  2395 D WindowManager: printFreezingDisplayLogsopening app wtoken = AppWindowToken{9f4ef63 token=Token{a64f992 ActivityRecord{de9231d u0 com.tencent.qt.qtl/.activity.info.NewsDetailXmlActivity t761}}}, allDrawn= false, startingDisplayed =  false, startingMoved =  false, isRelaunching =  false


In [23]:
filtered_rdd = rdd.filter(lambda line: 'WindowManager:' in line)
filtered_rdd.take(10)

['03-17 16:13:38.811  1702  2395 D WindowManager: printFreezingDisplayLogsopening app wtoken = AppWindowToken{9f4ef63 token=Token{a64f992 ActivityRecord{de9231d u0 com.tencent.qt.qtl/.activity.info.NewsDetailXmlActivity t761}}}, allDrawn= false, startingDisplayed =  false, startingMoved =  false, isRelaunching =  false',
 '03-17 16:13:38.839  1702  2113 V WindowManager: Skipping AppWindowToken{df0798e token=Token{78af589 ActivityRecord{3b04890 u0 com.tencent.qt.qtl/com.tencent.video.player.activity.PlayerActivity t761}}} -- going to hide',
 '03-17 16:13:38.915  1702  3693 V WindowManager: Skipping AppWindowToken{df0798e token=Token{78af589 ActivityRecord{3b04890 u0 com.tencent.qt.qtl/com.tencent.video.player.activity.PlayerActivity t761}}} -- going to hide',
 '03-17 16:13:38.994  1702 27365 I WindowManager: Destroying surface Surface(name=SurfaceView - com.tencent.qt.qtl/com.tencent.video.player.activity.PlayerActivity) called by com.android.server.wm.WindowStateAnimator.destroyDeferre

In [24]:
line_lengths = rdd.map(lambda line: len(line))
line_lengths.take(10)

[318, 161, 223, 218, 74, 71, 72, 71, 76, 71]

In [25]:
# Reduce to get the total number of characters in the text file
total_characters = line_lengths.reduce(lambda a, b: a + b)
print(f"Total number of characters: {total_characters}")

Total number of characters: 275078


In [26]:
rdd.flatMap(lambda line: line.split(" ")).take(10)

['03-17',
 '16:13:38.811',
 '',
 '1702',
 '',
 '2395',
 'D',
 'WindowManager:',
 'printFreezingDisplayLogsopening',
 'app']

In [27]:
words = rdd.flatMap(lambda line: line.split(" "))
word_pairs = words.map(lambda word: (word, 1))
word_counts = word_pairs.reduceByKey(lambda a, b: a + b)

word_counts.take(10)

[('03-17', 2000),
 ('', 3819),
 ('D', 650),
 ('WindowManager:', 86),
 ('app', 6),
 ('token=Token{a64f992', 3),
 ('ActivityRecord{de9231d', 3),
 ('com.tencent.qt.qtl/.activity.info.NewsDetailXmlActivity', 3),
 ('allDrawn=', 1),
 ('false,', 66)]

In [28]:
schema = StructType([
    StructField("word", StringType(), True),
    StructField("count", IntegerType(), True)
])

word_counts_df = spark.createDataFrame(word_counts, schema).show()

+--------------------+-----+
|                word|count|
+--------------------+-----+
|               03-17| 2000|
|                    | 3819|
|                   D|  650|
|      WindowManager:|   86|
|                 app|    6|
| token=Token{a64f992|    3|
|ActivityRecord{de...|    3|
|com.tencent.qt.qt...|    3|
|           allDrawn=|    1|
|              false,|   66|
|   startingDisplayed|    1|
|       startingMoved|    1|
|PowerManagerService:|  387|
|     lock=233570404,|    1|
|              Lock",|    2|
|name=com.android....|    1|
|        16:13:38.839|    1|
|                2113|   29|
|ActivityRecord{3b...|    2|
|com.tencent.qt.qt...|    2|
+--------------------+-----+
only showing top 20 rows



In [29]:
import re

In [30]:
# Define the regular expression pattern for real words
pattern = re.compile(r'\b[A-Za-z]+\b')

def extract_words(line):
    return pattern.findall(line)

words_rdd = rdd.flatMap(extract_words)

words_rdd.take(10)

['D',
 'WindowManager',
 'printFreezingDisplayLogsopening',
 'app',
 'wtoken',
 'AppWindowToken',
 'token',
 'Token',
 'ActivityRecord',
 'com']

In [31]:
word_pairs = words_rdd.map(lambda word: (word, 1))
word_counts = word_pairs.reduceByKey(lambda a, b: a + b)

word_counts.sortByKey().take(20)

[('ACTIVITY', 2),
 ('AMP', 1),
 ('Acquiring', 33),
 ('Activity', 1),
 ('ActivityInfo', 4),
 ('ActivityManager', 253),
 ('ActivityManagerService', 9),
 ('ActivityRecord', 10),
 ('ActivityStack', 5),
 ('Alarm', 5),
 ('AlarmManager', 13),
 ('AndroidRuntimeException', 3),
 ('Animating', 85),
 ('ApStaDisabledState', 1),
 ('AppWindowToken', 16),
 ('Application', 3),
 ('AudioManager', 66),
 ('AudioMix', 11),
 ('Bad', 1),
 ('Binder', 1)]

In [32]:
word_counts.sortBy(lambda x: x[1], ascending=False).take(20)

[('I', 920),
 ('false', 914),
 ('true', 655),
 ('D', 650),
 ('PhoneStatusBar', 507),
 ('PowerManagerService', 453),
 ('com', 411),
 ('Rect', 404),
 ('android', 369),
 ('V', 257),
 ('DisplayPowerController', 255),
 ('ActivityManager', 253),
 ('vis', 221),
 ('policy', 201),
 ('adj', 200),
 ('newVal', 200),
 ('dockedStackBounds', 200),
 ('setSystemUiVisibility', 200),
 ('mask', 200),
 ('oldVal', 200)]

In [33]:
word_counts.distinct().take(20)

[('D', 650),
 ('AppWindowToken', 16),
 ('Token', 6),
 ('ActivityRecord', 10),
 ('tencent', 116),
 ('View', 2),
 ('Lock', 2),
 ('name', 78),
 ('systemui', 24),
 ('uid', 148),
 ('adj', 200),
 ('video', 4),
 ('player', 4),
 ('going', 4),
 ('hide', 4),
 ('TextView', 10),
 ('show', 2),
 ('release', 26),
 ('I', 920),
 ('StackScrollAlgorithm', 156)]

In [34]:
word_counts.count()

579

In [35]:
word_counts.collect()

word_counts.getNumPartitions()

2

In [36]:
rdd1 = sc.parallelize([("a", 1), ("b", 2)])
rdd2 = sc.parallelize([("a", 3), ("b", 4), ("c", 5)])
result = rdd1.join(rdd2)
result.take(10)

[('a', (1, 3)), ('b', (2, 4))]

In [37]:
rdd1 = sc.parallelize([1, 2, 3])
rdd2 = sc.parallelize([3, 4, 5])
result = rdd1.union(rdd2)

result.take(10)

[1, 2, 3, 3, 4, 5]

In [38]:
df_parquet.select("name").show()
df_parquet.select("name", "age").show()

+--------------------+
|                name|
+--------------------+
|Braund, Mr. Owen ...|
|Cumings, Mrs. Joh...|
|Heikkinen, Miss. ...|
|Futrelle, Mrs. Ja...|
|Allen, Mr. Willia...|
|    Moran, Mr. James|
|McCarthy, Mr. Tim...|
|Palsson, Master. ...|
|Johnson, Mrs. Osc...|
|Nasser, Mrs. Nich...|
|Sandstrom, Miss. ...|
|Bonnell, Miss. El...|
|Saundercock, Mr. ...|
|Andersson, Mr. An...|
|Vestrom, Miss. Hu...|
|Hewlett, Mrs. (Ma...|
|Rice, Master. Eugene|
|Williams, Mr. Cha...|
|Vander Planke, Mr...|
|Masselmani, Mrs. ...|
+--------------------+
only showing top 20 rows

+--------------------+----+
|                name| age|
+--------------------+----+
|Braund, Mr. Owen ...|22.0|
|Cumings, Mrs. Joh...|38.0|
|Heikkinen, Miss. ...|26.0|
|Futrelle, Mrs. Ja...|35.0|
|Allen, Mr. Willia...|35.0|
|    Moran, Mr. James|NULL|
|McCarthy, Mr. Tim...|54.0|
|Palsson, Master. ...| 2.0|
|Johnson, Mrs. Osc...|27.0|
|Nasser, Mrs. Nich...|14.0|
|Sandstrom, Miss. ...| 4.0|
|Bonnell, Miss. El...|58.0|
|S

In [39]:
df_parquet.filter(df_parquet["age"] > 25).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|           17463|51.8625|  E46|       S|
|          9|       1|     3|Johnson, Mrs. Osc...|female|27.0|    0|    2|      

In [40]:
df_parquet.filter("age > 25").show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|           17463|51.8625|  E46|       S|
|          9|       1|     3|Johnson, Mrs. Osc...|female|27.0|    0|    2|      

In [41]:
df_parquet.filter(F.col("age") > 25).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|           17463|51.8625|  E46|       S|
|          9|       1|     3|Johnson, Mrs. Osc...|female|27.0|    0|    2|      

In [42]:
df_parquet.filter((F.col("age") > 25) & (F.col("age") < 30)).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          9|       1|     3|Johnson, Mrs. Osc...|female|27.0|    0|    2|          347742|11.1333| NULL|       S|
|         24|       1|     1|Sloper, Mr. Willi...|  male|28.0|    0|    0|          113788|   35.5|   A6|       S|
|         35|       0|     1|Meyer, Mr. Edgar ...|  male|28.0|    1|    0|        PC 17604|82.1708| NULL|       C|
|         42|       0|     2|Turpin, Mrs. Will...|female|27.0|    1|    0|           11668|   21.0| NULL|       S|
|         54|       1|     2|Faunthorpe, Mrs. ...|female|29.0|    1|    0|      

In [43]:
df_grouped = df_parquet.groupBy("age").count()
df_grouped.show()

+----+-----+
| age|count|
+----+-----+
| 8.0|    4|
|70.0|    2|
| 7.0|    3|
|20.5|    1|
|49.0|    6|
|29.0|   20|
|40.5|    2|
|64.0|    2|
|47.0|    9|
|42.0|   13|
|24.5|    1|
|44.0|    9|
|35.0|   18|
|NULL|  177|
|62.0|    4|
|18.0|   26|
|80.0|    1|
|34.5|    1|
|39.0|   14|
| 1.0|    7|
+----+-----+
only showing top 20 rows



In [44]:
df_parquet.groupBy("age")

GroupedData[grouping expressions: [age], value: [PassengerId: bigint, Survived: bigint ... 10 more fields], type: GroupBy]

In [45]:
df_parquet.agg(F.avg("age")).show()

+-----------------+
|         avg(age)|
+-----------------+
|29.69911764705882|
+-----------------+



In [46]:
df_parquet.agg(F.min("age")).show()

+--------+
|min(age)|
+--------+
|    0.42|
+--------+



In [47]:
df_parquet.agg(F.max("age")).show()

+--------+
|max(age)|
+--------+
|    80.0|
+--------+



In [48]:
df_parquet.withColumn("age_in_10_years", F.col("age") + 10).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+---------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|age_in_10_years|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+---------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|           32.0|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|           48.0|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|           36.0|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|           45.0|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|         

In [49]:
data = [("John", 30), ("Jane", 25), ("Doe", 22)]
df = spark.createDataFrame(data, ["name", "age"])

data2 = [("John", "USA"), ("Jane", "UK"), ("Doe", "Canada")]
df2 = spark.createDataFrame(data2, ["name", "country"])

df_joined = df.join(df2, on="name", how="inner")
df_joined.show()

+----+---+-------+
|name|age|country|
+----+---+-------+
| Doe| 22| Canada|
|Jane| 25|     UK|
|John| 30|    USA|
+----+---+-------+



In [50]:
df_parquet.orderBy("age").show()
df_parquet.orderBy(F.col("age").desc()).show()

+-----------+--------+------+--------------------+------+----+-----+-----+---------------+--------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|         Ticket|    Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+---------------+--------+-----+--------+
|         88|       0|     3|Slocovski, Mr. Se...|  male|NULL|    0|    0|SOTON/OQ 392086|    8.05| NULL|       S|
|        199|       1|     3|Madigan, Miss. Ma...|female|NULL|    0|    0|         370370|    7.75| NULL|       Q|
|         96|       0|     3|Shorney, Mr. Char...|  male|NULL|    0|    0|         374910|    8.05| NULL|       S|
|         83|       1|     3|McDermott, Miss. ...|female|NULL|    0|    0|         330932|  7.7875| NULL|       Q|
|        102|       0|     3|Petroff, Mr. Past...|  male|NULL|    0|    0|         349215|  7.8958| NULL|       S|
|         46|       0|     3|Rogers, Mr. Willi...|  male|NULL|    0|    0|S.C./A

In [51]:
df_parquet.drop("age").show()

+-----------+--------+------+--------------------+------+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|     3|    Moran, Mr. James|  male|    0|    0|          330877| 8.4583| NULL|       Q|
|         

In [52]:
df_parquet.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|     3|    Moran, Mr. James|  male|NULL|    0|    0|      

In [53]:
df_parquet.distinct().show()

+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+-----------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|  Ticket|    Fare|      Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+-----------+--------+
|        680|       1|     1|Cardeza, Mr. Thom...|  male|36.0|    0|    1|PC 17755|512.3292|B51 B53 B55|       C|
|        700|       0|     3|Humblen, Mr. Adol...|  male|42.0|    0|    0|  348121|    7.65|      F G63|       S|
|        873|       0|     1|Carlsson, Mr. Fra...|  male|33.0|    0|    0|     695|     5.0|B51 B53 B55|       S|
|        310|       1|     1|Francatelli, Miss...|female|30.0|    0|    0|PC 17485| 56.9292|        E36|       C|
|        436|       1|     1|Carter, Miss. Luc...|female|14.0|    1|    2|  113760|   120.0|    B96 B98|       S|
|        888|       1|     1|Graham, Miss. Mar...|female|19.0|    0|    0|  112053|    3

In [54]:
df_parquet.dropDuplicates().show()

+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+-----------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|  Ticket|    Fare|      Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+-----------+--------+
|        680|       1|     1|Cardeza, Mr. Thom...|  male|36.0|    0|    1|PC 17755|512.3292|B51 B53 B55|       C|
|        700|       0|     3|Humblen, Mr. Adol...|  male|42.0|    0|    0|  348121|    7.65|      F G63|       S|
|        873|       0|     1|Carlsson, Mr. Fra...|  male|33.0|    0|    0|     695|     5.0|B51 B53 B55|       S|
|        310|       1|     1|Francatelli, Miss...|female|30.0|    0|    0|PC 17485| 56.9292|        E36|       C|
|        436|       1|     1|Carter, Miss. Luc...|female|14.0|    1|    2|  113760|   120.0|    B96 B98|       S|
|        888|       1|     1|Graham, Miss. Mar...|female|19.0|    0|    0|  112053|    3

In [55]:
df_parquet.dropDuplicates(["age"]).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------+-------+-------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|    Ticket|   Fare|  Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------+-------+-------+--------+
|          6|       0|     3|    Moran, Mr. James|  male|NULL|    0|    0|    330877| 8.4583|   NULL|       Q|
|        804|       1|     3|Thomas, Master. A...|  male|0.42|    0|    1|      2625| 8.5167|   NULL|       C|
|        756|       1|     2|Hamalainen, Maste...|  male|0.67|    1|    1|    250649|   14.5|   NULL|       S|
|        470|       1|     3|Baclini, Miss. He...|female|0.75|    2|    1|      2666|19.2583|   NULL|       C|
|         79|       1|     2|Caldwell, Master....|  male|0.83|    0|    2|    248738|   29.0|   NULL|       S|
|        306|       1|     1|Allison, Master. ...|  male|0.92|    1|    2|    113781| 151.55|C22 C26|       S|
|

In [56]:
df_parquet.fillna({"age": 0}).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|     3|    Moran, Mr. James|  male| 0.0|    0|    0|      

In [57]:
df_parquet.replace({3: 30, 1: 10}, subset='Pclass').show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|    30|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|    10|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|    30|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|    10|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|    30|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|    30|    Moran, Mr. James|  male|NULL|    0|    0|      

In [58]:
df_parquet.sample(fraction=0.5).show()

+-----------+--------+------+--------------------+------+----+-----+-----+---------+-------+-----------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|   Ticket|   Fare|      Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+---------+-------+-----------+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|A/5 21171|   7.25|       NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|   113803|   53.1|       C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|   373450|   8.05|       NULL|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|    17463|51.8625|        E46|       S|
|          9|       1|     3|Johnson, Mrs. Osc...|female|27.0|    0|    2|   347742|11.1333|       NULL|       S|
|         10|       1|     2|Nasser, Mrs. Nich...|female|14.0|    1|    0|   237736|30.0

In [59]:
data1 = [("John", 30), ("Jane", 25)]
data2 = [("John", 30), ("Doe", 22)]
df1 = spark.createDataFrame(data1, ["name", "age"])
df2 = spark.createDataFrame(data2, ["name", "age"])

df1.intersect(df2).show()

+----+---+
|name|age|
+----+---+
|John| 30|
+----+---+



In [60]:
data1 = [("John", 30), ("Jane", 25)]
data2 = [("Doe", 22), ("Smith", 35)]
df1 = spark.createDataFrame(data1, ["name", "age"])
df2 = spark.createDataFrame(data2, ["name", "age"])

df1.union(df2).show()

+-----+---+
| name|age|
+-----+---+
| John| 30|
| Jane| 25|
|  Doe| 22|
|Smith| 35|
+-----+---+



In [61]:
data1 = [("John", 30), ("Jane", 25)]
data2 = [("John", 30), ("Doe", 22)]
df1 = spark.createDataFrame(data1, ["name", "age"])
df2 = spark.createDataFrame(data2, ["name", "age"])

df1.exceptAll(df2).show()

+----+---+
|name|age|
+----+---+
|Jane| 25|
+----+---+



In [62]:
df_parquet.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                NULL|  NULL| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [63]:
df_parquet.select(df_parquet["name"].alias("full_name"), "age").show()

+--------------------+----+
|           full_name| age|
+--------------------+----+
|Braund, Mr. Owen ...|22.0|
|Cumings, Mrs. Joh...|38.0|
|Heikkinen, Miss. ...|26.0|
|Futrelle, Mrs. Ja...|35.0|
|Allen, Mr. Willia...|35.0|
|    Moran, Mr. James|NULL|
|McCarthy, Mr. Tim...|54.0|
|Palsson, Master. ...| 2.0|
|Johnson, Mrs. Osc...|27.0|
|Nasser, Mrs. Nich...|14.0|
|Sandstrom, Miss. ...| 4.0|
|Bonnell, Miss. El...|58.0|
|Saundercock, Mr. ...|20.0|
|Andersson, Mr. An...|39.0|
|Vestrom, Miss. Hu...|14.0|
|Hewlett, Mrs. (Ma...|55.0|
|Rice, Master. Eugene| 2.0|
|Williams, Mr. Cha...|NULL|
|Vander Planke, Mr...|31.0|
|Masselmani, Mrs. ...|NULL|
+--------------------+----+
only showing top 20 rows



In [64]:
pandas_df = df_parquet.toPandas()
pandas_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [65]:
df_parquet.withColumnRenamed("name", "full_name").show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|           full_name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|     3|    Moran, Mr. James|  male|NULL|    0|    0|      

In [66]:
df_parquet.sort(df_parquet["age"].desc()).show()

+-----------+--------+------+--------------------+------+----+-----+-----+-----------+-------+-----------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|     Ticket|   Fare|      Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+-----------+-------+-----------+--------+
|        631|       1|     1|Barkworth, Mr. Al...|  male|80.0|    0|    0|      27042|   30.0|        A23|       S|
|        852|       0|     3| Svensson, Mr. Johan|  male|74.0|    0|    0|     347060|  7.775|       NULL|       S|
|         97|       0|     1|Goldschmidt, Mr. ...|  male|71.0|    0|    0|   PC 17754|34.6542|         A5|       C|
|        494|       0|     1|Artagaveytia, Mr....|  male|71.0|    0|    0|   PC 17609|49.5042|       NULL|       C|
|        117|       0|     3|Connors, Mr. Patrick|  male|70.5|    0|    0|     370369|   7.75|       NULL|       Q|
|        673|       0|     2|Mitchell, Mr. Hen...|  male|70.0|    0|    

In [67]:
df_parquet.rdd.take(
    10
)

[Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S'),
 Row(PassengerId=2, Survived=1, Pclass=1, Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age=38.0, SibSp=1, Parch=0, Ticket='PC 17599', Fare=71.2833, Cabin='C85', Embarked='C'),
 Row(PassengerId=3, Survived=1, Pclass=3, Name='Heikkinen, Miss. Laina', Sex='female', Age=26.0, SibSp=0, Parch=0, Ticket='STON/O2. 3101282', Fare=7.925, Cabin=None, Embarked='S'),
 Row(PassengerId=4, Survived=1, Pclass=1, Name='Futrelle, Mrs. Jacques Heath (Lily May Peel)', Sex='female', Age=35.0, SibSp=1, Parch=0, Ticket='113803', Fare=53.1, Cabin='C123', Embarked='S'),
 Row(PassengerId=5, Survived=0, Pclass=3, Name='Allen, Mr. William Henry', Sex='male', Age=35.0, SibSp=0, Parch=0, Ticket='373450', Fare=8.05, Cabin=None, Embarked='S'),
 Row(PassengerId=6, Survived=0, Pclass=3, Name='Moran, Mr. James', Sex='male',

In [69]:
df_parquet.limit(2).show()

+-----------+--------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|   Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0| PC 17599|71.2833|  C85|       C|
+-----------+--------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+



In [70]:
for part in df_parquet.randomSplit([0.3, 0.4, 0.3]):
    part.show()

+-----------+--------+------+--------------------+------+----+-----+-----+---------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|         Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+---------------+-------+-----+--------+
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|         113803|   53.1| C123|       S|
|         10|       1|     2|Nasser, Mrs. Nich...|female|14.0|    1|    0|         237736|30.0708| NULL|       C|
|         11|       1|     3|Sandstrom, Miss. ...|female| 4.0|    1|    1|        PP 9549|   16.7|   G6|       S|
|         12|       1|     1|Bonnell, Miss. El...|female|58.0|    0|    0|         113783|  26.55| C103|       S|
|         13|       0|     3|Saundercock, Mr. ...|  male|20.0|    0|    0|      A/5. 2151|   8.05| NULL|       S|
|         20|       1|     3|Masselmani, Mrs. ...|female|NULL|    0|    0|           264

In [72]:
df_parquet.write.mode("overwrite").parquet("/home/lec_14/output/new_data.parquet")

In [73]:

df_parquet.filter(F.col("age").isNotNull()).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|      

In [74]:
df_parquet.filter(F.col("age").isNull()).show()

+-----------+--------+------+--------------------+------+----+-----+-----+---------------+--------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|         Ticket|    Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+---------------+--------+-----+--------+
|          6|       0|     3|    Moran, Mr. James|  male|NULL|    0|    0|         330877|  8.4583| NULL|       Q|
|         18|       1|     2|Williams, Mr. Cha...|  male|NULL|    0|    0|         244373|    13.0| NULL|       S|
|         20|       1|     3|Masselmani, Mrs. ...|female|NULL|    0|    0|           2649|   7.225| NULL|       C|
|         27|       0|     3|Emir, Mr. Farred ...|  male|NULL|    0|    0|           2631|   7.225| NULL|       C|
|         29|       1|     3|O'Dwyer, Miss. El...|female|NULL|    0|    0|         330959|  7.8792| NULL|       Q|
|         30|       0|     3| Todoroff, Mr. Lalio|  male|NULL|    0|    0|      

In [75]:

df_parquet.dropna().show()

+-----------+--------+------+--------------------+------+----+-----+-----+-----------+--------+-----------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|     Ticket|    Fare|      Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+-----------+--------+-----------+--------+
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|   PC 17599| 71.2833|        C85|       C|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|     113803|    53.1|       C123|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|      17463| 51.8625|        E46|       S|
|         11|       1|     3|Sandstrom, Miss. ...|female| 4.0|    1|    1|    PP 9549|    16.7|         G6|       S|
|         12|       1|     1|Bonnell, Miss. El...|female|58.0|    0|    0|     113783|   26.55|       C103|       S|
|         22|       1|     2|Beesley, Mr. Lawr...|  male|34.0|  

In [76]:
df_parquet.select(F.countDistinct("name")).show()

+--------------------+
|count(DISTINCT name)|
+--------------------+
|                 891|
+--------------------+



In [77]:
df_parquet.corr("age", "Fare")

0.135515853527051

In [78]:
df_parquet.cov("age", "Fare")

118.49631587080923

In [79]:
quantiles = df_parquet.approxQuantile("age", [0.25, 0.5, 0.75], 0.01)
print(f"Approximate quantiles: {quantiles}")

Approximate quantiles: [20.0, 28.0, 38.0]


In [80]:
df_parquet.sampleBy("name", fractions={"a": 0.5, "b": 1.0, "c": 0.2}).show()

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+



In [81]:

df_parquet.rollup("age").sum("Fare").show()

+----+------------------+
| age|         sum(Fare)|
+----+------------------+
|62.0|             143.6|
| 2.0|          375.3625|
|NULL| 3922.066300000001|
|20.5|              7.25|
|23.0| 569.9207999999999|
|74.0|             7.775|
|27.0| 546.5041000000001|
|40.0|482.42910000000006|
|36.0|         1319.2291|
|14.0|          255.7542|
|19.0| 696.7374000000001|
|55.5|              8.05|
|36.5|              26.0|
|32.0| 437.8208000000001|
|10.0|             52.05|
|21.0|          757.5749|
|13.0|           26.7292|
|30.5|              15.8|
| 8.0|             113.2|
|60.0|             220.0|
+----+------------------+
only showing top 20 rows



In [82]:
def add_years(df):
    return df.withColumn("age_plus_5", F.col("age") + 5)
    
df_parquet.transform(add_years).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|age_plus_5|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|      27.0|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|      43.0|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|      31.0|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|      40.0|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|    

In [83]:
data1 = [("John", 30), ("Jane", 25)]
data2 = [("USA",), ("UK",)]
df1 = spark.createDataFrame(data1, ["name", "age"])
df2 = spark.createDataFrame(data2, ["country"])

# Perform a cross join
df_cross_join = df1.crossJoin(df2)
df_cross_join.show()

+----+---+-------+
|name|age|country|
+----+---+-------+
|John| 30|    USA|
|John| 30|     UK|
|Jane| 25|    USA|
|Jane| 25|     UK|
+----+---+-------+



In [84]:
df_parquet.createOrReplaceTempView("people")

# Run SQL queries on the temporary view
spark.sql("SELECT * FROM people WHERE age > 25").show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|           17463|51.8625|  E46|       S|
|          9|       1|     3|Johnson, Mrs. Osc...|female|27.0|    0|    2|      

In [85]:
df_parquet.selectExpr("name as full_name", "age + 5 as age_in_5_years").show()

+--------------------+--------------+
|           full_name|age_in_5_years|
+--------------------+--------------+
|Braund, Mr. Owen ...|          27.0|
|Cumings, Mrs. Joh...|          43.0|
|Heikkinen, Miss. ...|          31.0|
|Futrelle, Mrs. Ja...|          40.0|
|Allen, Mr. Willia...|          40.0|
|    Moran, Mr. James|          NULL|
|McCarthy, Mr. Tim...|          59.0|
|Palsson, Master. ...|           7.0|
|Johnson, Mrs. Osc...|          32.0|
|Nasser, Mrs. Nich...|          19.0|
|Sandstrom, Miss. ...|           9.0|
|Bonnell, Miss. El...|          63.0|
|Saundercock, Mr. ...|          25.0|
|Andersson, Mr. An...|          44.0|
|Vestrom, Miss. Hu...|          19.0|
|Hewlett, Mrs. (Ma...|          60.0|
|Rice, Master. Eugene|           7.0|
|Williams, Mr. Cha...|          NULL|
|Vander Planke, Mr...|          36.0|
|Masselmani, Mrs. ...|          NULL|
+--------------------+--------------+
only showing top 20 rows



In [86]:
df_parquet.inputFiles()

['file:///home/lec_14/data/titanic.parquet']

In [87]:

df_parquet.isEmpty()

False

In [88]:
df_parquet.isStreaming

False

In [89]:

df_parquet.summary().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                NULL|  NULL| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [90]:
window_spec = Window.partitionBy("sex").orderBy("Survived")

In [91]:

# Apply row_number
df_parquet.withColumn("row_number", F.row_number().over(window_spec)).where("row_number = 1").show()

# Apply rank
df_parquet.withColumn("rank", F.rank().over(window_spec)).where("rank = 1").show()

# Apply dense_rank
df_parquet.withColumn("dense_rank", F.dense_rank().over(window_spec)).where(F.col("dense_rank").isin([1, 2])).show()

+-----------+--------+------+--------------------+------+----+-----+-----+---------+------+-----+--------+----------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|   Ticket|  Fare|Cabin|Embarked|row_number|
+-----------+--------+------+--------------------+------+----+-----+-----+---------+------+-----+--------+----------+
|         15|       0|     3|Vestrom, Miss. Hu...|female|14.0|    0|    0|   350406|7.8542| NULL|       S|         1|
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|A/5 21171|  7.25| NULL|       S|         1|
+-----------+--------+------+--------------------+------+----+-----+-----+---------+------+-----+--------+----------+

+-----------+--------+------+--------------------+------+----+-----+-----+----------+-------+-----+--------+----+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|    Ticket|   Fare|Cabin|Embarked|rank|
+-----------+--------+------+--------------------+------+----+-

In [92]:
# Filter names that contain the substring 'a'
df_parquet.filter(F.col("name").like("%a%")).show()

# Filter names that start with 'A'
df_parquet.filter(F.col("name").like("A%")).show()

# Filter names that end with 'e'
df_parquet.filter(F.col("name").like("%e")).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|     3|    Moran, Mr. James|  male|NULL|    0|    0|      

In [93]:
df_parquet.filter(F.col("name").endswith("A")).show()

df_parquet.filter(F.col("name").startswith("D")).show()

+-----------+--------+------+--------------------+------+----+-----+-----+--------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|  Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+-------+-----+--------+
|         65|       0|     1|Stewart, Mr. Albe...|  male|NULL|    0|    0|PC 17605|27.7208| NULL|       C|
|        304|       1|     2| Keane, Miss. Nora A|female|NULL|    0|    0|  226593|  12.35| E101|       Q|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+-------+-----+--------+

+-----------+--------+------+--------------------+------+----+-----+-----+------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|      Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+------------+-------+-----+--------+
|         45|       1|  

In [94]:
spark.stop()