In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port','0'). \
config("spark.sql.warehouse.dir",f"/user/itv016269/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [2]:
sample_list = [("Spring",12.3),("Summer",10.5),("Autumn",8.2),("Winter",15.1)]

In [3]:
list_schema = "season string, windspeed float"

In [4]:
new_df = spark.createDataFrame(sample_list,list_schema)

In [5]:
new_df.show()

+------+---------+
|season|windspeed|
+------+---------+
|Spring|     12.3|
|Summer|     10.5|
|Autumn|      8.2|
|Winter|     15.1|
+------+---------+



In [8]:
from pyspark.sql.types import *

library_schema=StructType([StructField("library_name", StringType()),
                           StructField("location",StringType()),
                           StructField("books",ArrayType(StructType([
                               StructField("bookid",StringType()),
                               StructField("book_name",StringType()),
                               StructField("author_name",StringType()),
                               StructField("copies_available",IntegerType())
                           ]))),
                           StructField("members",ArrayType(StructType([
                               StructField("member_id",StringType()),
                               StructField("member_name",StringType()),
                               StructField("age",IntegerType()),
                               StructField("books_borrowed",StringType())
                           ]))),
                                       ])

In [12]:
library_df=spark.read \
.format('json') \
.option("header","true") \
.schema(library_schema) \
.load("/public/trendytech/datasets/library_data.json")

In [13]:
library_df.show()

+-----------------+-----------+--------------------+--------------------+
|     library_name|   location|               books|             members|
+-----------------+-----------+--------------------+--------------------+
|  Central Library|City Center|[{null, The Great...|[{M001, John Smit...|
|Community Library|     Suburb|[{null, 1984, nul...|[{M003, Michael B...|
+-----------------+-----------+--------------------+--------------------+



In [14]:
library_df.printSchema()

root
 |-- library_name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- books: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- bookid: string (nullable = true)
 |    |    |-- book_name: string (nullable = true)
 |    |    |-- author_name: string (nullable = true)
 |    |    |-- copies_available: integer (nullable = true)
 |-- members: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- member_id: string (nullable = true)
 |    |    |-- member_name: string (nullable = true)
 |    |    |-- age: integer (nullable = true)
 |    |    |-- books_borrowed: string (nullable = true)



In [15]:
train_df=spark.read \
.format("csv") \
.option("header","true") \
.load("/public/trendytech/datasets/train.csv")

In [16]:
train_df.show()

+------------+----------+---------------+--------------+---+-------------+-----------+
|train_number|train_name|seats_available|passenger_name|age|ticket_number|seat_number|
+------------+----------+---------------+--------------+---+-------------+-----------+
|         123|   Express|            100|          John| 25|         T123|         A1|
|         123|   Express|            100|          Emma| 30|         T124|         B2|
|         456| Superfast|            150|       Michael| 35|         T125|         C3|
|         456| Superfast|            150|        Sophia| 40|         T126|         D4|
|         789|     Local|             50|       William| 28|         T127|         E5|
|         789|     Local|             50|        Sophia| 32|         T128|         F6|
|         789|     Local|             50|        Oliver| 45|         T129|         G7|
+------------+----------+---------------+--------------+---+-------------+-----------+



In [17]:
new_train_df=train_df.drop("Passenger_name","age")

In [40]:
new_train_df.show()

+------------+----------+---------------+-------------+-----------+
|train_number|train_name|seats_available|ticket_number|seat_number|
+------------+----------+---------------+-------------+-----------+
|         123|   Express|            100|         T123|         A1|
|         123|   Express|            100|         T124|         B2|
|         456| Superfast|            150|         T125|         C3|
|         456| Superfast|            150|         T126|         D4|
|         789|     Local|             50|         T127|         E5|
|         789|     Local|             50|         T128|         F6|
|         789|     Local|             50|         T129|         G7|
+------------+----------+---------------+-------------+-----------+



In [18]:
new_train_df.count()

7

In [19]:
df_dropduplicates = train_df.dropDuplicates(["train_number","ticket_number"])
df_dropduplicates.count()

7

In [20]:
distinct_trains_df = train_df.select("train_name").distinct()
distinct_trains_df.count()

3

In [21]:
sales_df=spark.read \
.format('json') \
.option("header","true") \
.option("inferSchema", "true") \
.option("mode","permissive") \
.load("/public/trendytech/datasets/sales_data.json")

In [22]:
sales_df.show()

+---------------+----------+--------+-------+--------+
|_corrupt_record|   product|quantity|revenue|store_id|
+---------------+----------+--------+-------+--------+
|           null|     Apple|      10|  100.0|       1|
|           null|    Banana|      15|   75.0|       2|
|           null|    Orange|      12|   90.0|       3|
|           null|     Mango|       8|  120.0|       4|
|           null|     Grape|      20|  150.0|       5|
|           null|Watermelon|       5|   50.0|       6|
|           null|Strawberry|      18|  108.0|       7|
|           null| Pineapple|      14|  140.0|       8|
|           null|    Cherry|       7|  105.0|       9|
|           null|      Pear|       9|   81.0|      10|
|           null| Blueberry|      11|   88.0|      11|
|           null|      Kiwi|      16|  128.0|      12|
|           null|     Peach|      13|   91.0|      13|
|           null|      Plum|       6|   54.0|      14|
|           null|     Lemon|      10|   70.0|      15|
|         

In [23]:
no_of_records = sales_df.count()
print("Number of records on Permissive Mode:",no_of_records)

Number of records on Permissive Mode: 22


In [24]:
sales_df=spark.read \
.format('json') \
.option("header","true") \
.option("inferSchema", "true") \
.option("mode","dropmalformed") \
.load("/public/trendytech/datasets/sales_data.json")

In [25]:
no_of_records = sales_df.count()
print("Number of records on dropmalformed Mode:",no_of_records)

Number of records on dropmalformed Mode: 21


In [26]:
hospital_schema = "pateint_id int, admission_date date, discharge_date date, diagnosis string, doctor_id int, total_cost float"

In [27]:
hospital_df=spark.read \
.format("csv") \
.option("header","true") \
.option("dateFormat","MM-dd-yyyy") \
.schema(hospital_schema) \
.load("/public/trendytech/datasets/hospital.csv")

In [28]:
hospital_df.show()

+----------+--------------+--------------+-------------+---------+----------+
|pateint_id|admission_date|discharge_date|    diagnosis|doctor_id|total_cost|
+----------+--------------+--------------+-------------+---------+----------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|      101|    5000.0|
|         2|    2022-02-05|    2022-02-09| Appendicitis|      102|    7000.0|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|      103|    3500.0|
|         4|    2022-04-02|    2022-04-08| Heart Attack|      104|   15000.0|
|         5|    2022-05-05|    2022-05-07|    Influenza|      105|    2500.0|
|         6|    2022-06-10|    2022-06-15| Appendicitis|      106|    8000.0|
|         7|    2022-07-20|    2022-07-25|    Pneumonia|      107|    5500.0|
|         8|    2022-08-25|    2022-09-01| Heart Attack|      108|   20000.0|
|         9|    2022-09-15|    2022-09-22|Fractured Leg|      109|    6000.0|
|        10|    2022-10-05|    2022-10-10| Appendicitis|      11

In [29]:
hospital_df.printSchema()

root
 |-- pateint_id: integer (nullable = true)
 |-- admission_date: date (nullable = true)
 |-- discharge_date: date (nullable = true)
 |-- diagnosis: string (nullable = true)
 |-- doctor_id: integer (nullable = true)
 |-- total_cost: float (nullable = true)



In [30]:
dropped_df = hospital_df.drop("doctor_id")
dropped_df.show()

+----------+--------------+--------------+-------------+----------+
|pateint_id|admission_date|discharge_date|    diagnosis|total_cost|
+----------+--------------+--------------+-------------+----------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|    5000.0|
|         2|    2022-02-05|    2022-02-09| Appendicitis|    7000.0|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|    3500.0|
|         4|    2022-04-02|    2022-04-08| Heart Attack|   15000.0|
|         5|    2022-05-05|    2022-05-07|    Influenza|    2500.0|
|         6|    2022-06-10|    2022-06-15| Appendicitis|    8000.0|
|         7|    2022-07-20|    2022-07-25|    Pneumonia|    5500.0|
|         8|    2022-08-25|    2022-09-01| Heart Attack|   20000.0|
|         9|    2022-09-15|    2022-09-22|Fractured Leg|    6000.0|
|        10|    2022-10-05|    2022-10-10| Appendicitis|    7500.0|
|        11|    2022-11-02|    2022-11-05|    Influenza|    2800.0|
|        12|    2022-12-10|    2022-12-18|    Pn

In [31]:
renamed_df = hospital_df.withColumnRenamed("total_cost","hospital_bill")
renamed_df.show()

+----------+--------------+--------------+-------------+---------+-------------+
|pateint_id|admission_date|discharge_date|    diagnosis|doctor_id|hospital_bill|
+----------+--------------+--------------+-------------+---------+-------------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|      101|       5000.0|
|         2|    2022-02-05|    2022-02-09| Appendicitis|      102|       7000.0|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|      103|       3500.0|
|         4|    2022-04-02|    2022-04-08| Heart Attack|      104|      15000.0|
|         5|    2022-05-05|    2022-05-07|    Influenza|      105|       2500.0|
|         6|    2022-06-10|    2022-06-15| Appendicitis|      106|       8000.0|
|         7|    2022-07-20|    2022-07-25|    Pneumonia|      107|       5500.0|
|         8|    2022-08-25|    2022-09-01| Heart Attack|      108|      20000.0|
|         9|    2022-09-15|    2022-09-22|Fractured Leg|      109|       6000.0|
|        10|    2022-10-05| 

In [32]:
from pyspark.sql.functions import *

In [33]:
new_column_df = hospital_df.withColumn("DurationofStay", expr("datediff(discharge_date,admission_date)"))
new_column_df.show()

+----------+--------------+--------------+-------------+---------+----------+--------------+
|pateint_id|admission_date|discharge_date|    diagnosis|doctor_id|total_cost|DurationofStay|
+----------+--------------+--------------+-------------+---------+----------+--------------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|      101|    5000.0|             9|
|         2|    2022-02-05|    2022-02-09| Appendicitis|      102|    7000.0|             4|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|      103|    3500.0|             6|
|         4|    2022-04-02|    2022-04-08| Heart Attack|      104|   15000.0|             6|
|         5|    2022-05-05|    2022-05-07|    Influenza|      105|    2500.0|             2|
|         6|    2022-06-10|    2022-06-15| Appendicitis|      106|    8000.0|             5|
|         7|    2022-07-20|    2022-07-25|    Pneumonia|      107|    5500.0|             5|
|         8|    2022-08-25|    2022-09-01| Heart Attack|      108|   2

In [34]:
adjusted_cost_df = renamed_df.withColumn("adjusted_total_cost",expr("CASE WHEN diagnosis LIKE 'HeartAttack' THEN hospital_bill*1.5 WHEN diagnosis LIKE 'Appendicitis' THEN hospital_bill*1.2 ELSE hospital_bill END"))
adjusted_cost_df.show()

+----------+--------------+--------------+-------------+---------+-------------+-------------------+
|pateint_id|admission_date|discharge_date|    diagnosis|doctor_id|hospital_bill|adjusted_total_cost|
+----------+--------------+--------------+-------------+---------+-------------+-------------------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|      101|       5000.0|             5000.0|
|         2|    2022-02-05|    2022-02-09| Appendicitis|      102|       7000.0|             8400.0|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|      103|       3500.0|             3500.0|
|         4|    2022-04-02|    2022-04-08| Heart Attack|      104|      15000.0|            15000.0|
|         5|    2022-05-05|    2022-05-07|    Influenza|      105|       2500.0|             2500.0|
|         6|    2022-06-10|    2022-06-15| Appendicitis|      106|       8000.0|             9600.0|
|         7|    2022-07-20|    2022-07-25|    Pneumonia|      107|       5500.0|           

In [35]:
adjusted_cost_df.select("pateint_id","diagnosis","hospital_bill","adjusted_total_cost").show()

+----------+-------------+-------------+-------------------+
|pateint_id|    diagnosis|hospital_bill|adjusted_total_cost|
+----------+-------------+-------------+-------------------+
|         1|    Pneumonia|       5000.0|             5000.0|
|         2| Appendicitis|       7000.0|             8400.0|
|         3|Fractured Arm|       3500.0|             3500.0|
|         4| Heart Attack|      15000.0|            15000.0|
|         5|    Influenza|       2500.0|             2500.0|
|         6| Appendicitis|       8000.0|             9600.0|
|         7|    Pneumonia|       5500.0|             5500.0|
|         8| Heart Attack|      20000.0|            20000.0|
|         9|Fractured Leg|       6000.0|             6000.0|
|        10| Appendicitis|       7500.0|             9000.0|
|        11|    Influenza|       2800.0|             2800.0|
|        12|    Pneumonia|       6000.0|             6000.0|
|        13| Heart Attack|      18000.0|            18000.0|
|        14| Appendiciti

In [36]:
spark.stop()