In [1]:
# Import SparkSession from pyspark.sql

from pyspark.sql.session import SparkSession

In [2]:
# 1 — Creating a SparkSession

spark = SparkSession.builder\
       .master("local[*]")\
       .appName("Spark_App")\
       .getOrCreate()

In [3]:
from pyspark.sql.functions import lit, expr, col, when, coalesce, greatest, least, months_between, trunc, row_number
from pyspark.sql.functions import sum as f_sum, min as f_min, max as f_max, mean as f_mean, count as f_count

In [4]:
# 2 — Loading DataFrames

df = spark.read.options(header='True', inferSchema='True', delimiter=',') \
.csv("smoking_driking_dataset_Ver01.csv")

df = df.select('sex', 'age', 'height', 'weight', 'urine_protein', 'serum_creatinine')

In [5]:
# 3 — Inspecting Data

#print the schema of df
df.printSchema()

#display the content of df
df.show(truncate = False)

#display the first 5 rows of df
df.show(5, truncate = False)

# Print spark
print(spark)

# Print the tables in the catalog
print(spark.catalog.listTables())

root
 |-- sex: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- height: integer (nullable = true)
 |-- weight: integer (nullable = true)
 |-- urine_protein: double (nullable = true)
 |-- serum_creatinine: double (nullable = true)

+------+---+------+------+-------------+----------------+
|sex   |age|height|weight|urine_protein|serum_creatinine|
+------+---+------+------+-------------+----------------+
|Male  |35 |170   |75    |1.0          |1.0             |
|Male  |30 |180   |80    |1.0          |0.9             |
|Male  |40 |165   |75    |1.0          |0.9             |
|Male  |50 |175   |80    |1.0          |1.1             |
|Male  |50 |165   |60    |1.0          |0.8             |
|Male  |50 |165   |55    |3.0          |0.8             |
|Female|45 |150   |55    |1.0          |0.8             |
|Male  |35 |175   |65    |1.0          |0.8             |
|Male  |55 |170   |75    |1.0          |0.8             |
|Male  |40 |175   |75    |1.0          |0.9             

In [6]:
# 4 — Creating columns

# Creating or replacing a local temporary view with this DataFrame.
df.createOrReplaceTempView("smoking_driking")

# Define my query
query = "SELECT *, round(weight /(height*height), 5) as bmi from smoking_driking"
newdf = spark.sql(query)

# Creating columns
newdf = newdf.withColumn("age_minus_mean", col("age") - 47)

newdf = newdf.select("*", newdf.height.alias("height2"))

#display the content of new dataframe
newdf.show(truncate = False)

+------+---+------+------+-------------+----------------+-------+--------------+-------+
|sex   |age|height|weight|urine_protein|serum_creatinine|bmi    |age_minus_mean|height2|
+------+---+------+------+-------------+----------------+-------+--------------+-------+
|Male  |35 |170   |75    |1.0          |1.0             |0.0026 |-12           |170    |
|Male  |30 |180   |80    |1.0          |0.9             |0.00247|-17           |180    |
|Male  |40 |165   |75    |1.0          |0.9             |0.00275|-7            |165    |
|Male  |50 |175   |80    |1.0          |1.1             |0.00261|3             |175    |
|Male  |50 |165   |60    |1.0          |0.8             |0.0022 |3             |165    |
|Male  |50 |165   |55    |3.0          |0.8             |0.00202|3             |165    |
|Female|45 |150   |55    |1.0          |0.8             |0.00244|-2            |150    |
|Male  |35 |175   |65    |1.0          |0.8             |0.00212|-12           |175    |
|Male  |55 |170   |75

In [7]:
# Print the tables in the catalog
print(spark.catalog.listTables())

spark.sql("show databases").show(truncate=False)

spark.sql('show tables in default').show(20, truncate=False)

spark.sql('describe smoking_driking').show(20, truncate=False)

[Table(name='smoking_driking', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]
+---------+
|namespace|
+---------+
|default  |
+---------+

+---------+---------------+-----------+
|namespace|tableName      |isTemporary|
+---------+---------------+-----------+
|         |smoking_driking|true       |
+---------+---------------+-----------+

+----------------+---------+-------+
|col_name        |data_type|comment|
+----------------+---------+-------+
|sex             |string   |null   |
|age             |int      |null   |
|height          |int      |null   |
|weight          |int      |null   |
|urine_protein   |double   |null   |
|serum_creatinine|double   |null   |
+----------------+---------+-------+



In [8]:
# 5 — Selecting

df.select('sex', 'age', 'height', 'weight', 'urine_protein', 'serum_creatinine') \
.drop('urine_protein', 'serum_creatinine') \
.show(truncate = False)

+------+---+------+------+
|sex   |age|height|weight|
+------+---+------+------+
|Male  |35 |170   |75    |
|Male  |30 |180   |80    |
|Male  |40 |165   |75    |
|Male  |50 |175   |80    |
|Male  |50 |165   |60    |
|Male  |50 |165   |55    |
|Female|45 |150   |55    |
|Male  |35 |175   |65    |
|Male  |55 |170   |75    |
|Male  |40 |175   |75    |
|Male  |45 |155   |55    |
|Male  |65 |155   |75    |
|Female|55 |150   |55    |
|Male  |30 |175   |75    |
|Female|30 |160   |50    |
|Male  |40 |170   |65    |
|Female|25 |160   |65    |
|Male  |25 |170   |65    |
|Male  |50 |170   |85    |
|Male  |60 |165   |60    |
+------+---+------+------+
only showing top 20 rows



In [9]:
# 6 — Filtering

df.filter("age >= 27 and age < 47 and height > 155").show(truncate = False)

+------+---+------+------+-------------+----------------+
|sex   |age|height|weight|urine_protein|serum_creatinine|
+------+---+------+------+-------------+----------------+
|Male  |35 |170   |75    |1.0          |1.0             |
|Male  |30 |180   |80    |1.0          |0.9             |
|Male  |40 |165   |75    |1.0          |0.9             |
|Male  |35 |175   |65    |1.0          |0.8             |
|Male  |40 |175   |75    |1.0          |0.9             |
|Male  |30 |175   |75    |1.0          |0.9             |
|Female|30 |160   |50    |1.0          |0.6             |
|Male  |40 |170   |65    |2.0          |0.9             |
|Female|35 |170   |50    |1.0          |0.9             |
|Male  |40 |165   |75    |1.0          |1.0             |
|Female|35 |160   |55    |1.0          |0.8             |
|Female|30 |170   |55    |1.0          |0.8             |
|Male  |40 |175   |75    |1.0          |0.9             |
|Male  |40 |165   |60    |1.0          |1.1             |
|Male  |45 |17

In [10]:
# 7 — Aggregating

print(df.count())

df.groupBy("sex").agg({"age": "max"}).show(truncate = False)

df.groupBy("sex") \
    .agg(f_count(lit(1)).alias("count_obs"), \
         f_sum(lit(1)).alias("sum_obs"), \
         f_mean("age").alias("mean_age"), \
         f_min("weight").alias("min_weight"), \
         f_max("weight").alias("max_weight") \
     ).show(truncate=False)

991346
+------+--------+
|sex   |max(age)|
+------+--------+
|Female|85      |
|Male  |85      |
+------+--------+

+------+---------+-------+----------------+----------+----------+
|sex   |count_obs|sum_obs|mean_age        |min_weight|max_weight|
+------+---------+-------+----------------+----------+----------+
|Female|464931   |464931 |48.8703485033263|25        |120       |
|Male  |526415   |526415 |46.5053142482642|40        |140       |
+------+---------+-------+----------------+----------+----------+



In [11]:
df.select('sex') \
.dropDuplicates() \
.orderBy(col('sex').desc()) \
.show(truncate = False)

df.select('sex') \
.distinct() \
.orderBy(col('sex').asc()) \
.show(truncate = False)

+------+
|sex   |
+------+
|Male  |
|Female|
+------+

+------+
|sex   |
+------+
|Female|
|Male  |
+------+



In [12]:
# 9 — Running Queries Programmatically

# Creating or replacing a local temporary view with this DataFrame.
df.createOrReplaceTempView("smoking_driking")

# SQL statements can be run by using the sql method
query = """SELECT count(1) as people_count
            FROM smoking_driking
            WHERE age >= 27 
            and age < 47 
            and height > 155 """

peopleCountDf = spark.sql(query)

# Display the content of df
peopleCountDf.show(truncate=False)

+------------+
|people_count|
+------------+
|326788      |
+------------+



In [13]:
# 10 — Saving DataFrame to a parquet file
df.write.mode("overwrite").parquet("smoking_drikin.parquet")

df_parquet = spark.read.parquet("smoking_drikin.parquet")
df_parquet.show(truncate=False)

+------+---+------+------+-------------+----------------+
|sex   |age|height|weight|urine_protein|serum_creatinine|
+------+---+------+------+-------------+----------------+
|Male  |30 |170   |70    |1.0          |0.8             |
|Female|40 |160   |60    |1.0          |0.6             |
|Male  |50 |160   |70    |1.0          |1.3             |
|Female|65 |145   |50    |3.0          |0.6             |
|Male  |50 |175   |70    |1.0          |0.9             |
|Male  |40 |175   |80    |1.0          |1.0             |
|Female|45 |155   |45    |1.0          |0.7             |
|Male  |70 |160   |65    |1.0          |0.9             |
|Female|55 |160   |55    |1.0          |0.7             |
|Male  |50 |160   |65    |1.0          |0.9             |
|Male  |35 |175   |70    |1.0          |0.9             |
|Male  |40 |155   |55    |1.0          |1.1             |
|Female|35 |165   |50    |1.0          |0.7             |
|Male  |55 |155   |65    |1.0          |0.8             |
|Male  |55 |16

In [14]:
spark.stop()