In [3]:
pip install pyspark




# **FITNESS TRACKER** **DATA**

In [61]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, mean, count,avg

# Initialize SparkSession
spark = SparkSession.builder.appName("Fitness Tracker").getOrCreate()

# Load dataset
df = spark.createDataFrame([
    (1, '2023-07-01', 12000, 500, 8.5, 90),
    (2, '2023-07-01', 8000, 350, 5.6, 60),
    (3, '2023-07-01', 15000, 600, 10.2, 120),
    (1, '2023-07-02', 11000, 480, 7.9, 85),
    (2, '2023-07-02', 9000, 400, 6.2, 70),
    (3, '2023-07-02', 13000, 520, 9.0, 100),
    (1, '2023-07-03', 10000, 450, 7.1, 80),
    (2, '2023-07-03', 7000, 320, 4.9, 55),
    (3, '2023-07-03', 16000, 620, 11.0, 130),
], ["user_id", "date", "steps", "calories", "distance_km", "active_minutes"])

df.show


In [5]:
# 1. Find the Total Steps Taken by Each User
total_steps = df.groupBy("user_id").agg(sum("steps").alias("total_steps"))
total_steps.show()

+-------+-----------+
|user_id|total_steps|
+-------+-----------+
|      1|      33000|
|      3|      44000|
|      2|      24000|
+-------+-----------+



In [6]:
# 2. Filter Days Where a User Burned More Than 500 Calories
high_calorie_days = df.filter(col("calories") > 500)
high_calorie_days.show()

+-------+----------+-----+--------+-----------+--------------+
|user_id|      date|steps|calories|distance_km|active_minutes|
+-------+----------+-----+--------+-----------+--------------+
|      3|2023-07-01|15000|     600|       10.2|           120|
|      3|2023-07-02|13000|     520|        9.0|           100|
|      3|2023-07-03|16000|     620|       11.0|           130|
+-------+----------+-----+--------+-----------+--------------+



In [12]:
# 3. Calculate the Average Distance Traveled by Each User
avg_distance = df.groupBy("user_id").avg("distance_km").withColumnRenamed('avg(distance_km)',"avg_distance")
avg_distance.show()

+-------+------------------+
|user_id|      avg_distance|
+-------+------------------+
|      1| 7.833333333333333|
|      3|10.066666666666666|
|      2| 5.566666666666667|
+-------+------------------+



In [14]:
# 4. Identify the Day with the Maximum Steps for Each User
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

windowSpec = Window.partitionBy("user_id").orderBy(col("steps").desc())
max_steps_day = df.withColumn("rank", row_number().over(windowSpec)).filter(col("rank") == 1)
max_steps_day.show()

+-------+----------+-----+--------+-----------+--------------+----+
|user_id|      date|steps|calories|distance_km|active_minutes|rank|
+-------+----------+-----+--------+-----------+--------------+----+
|      1|2023-07-01|12000|     500|        8.5|            90|   1|
|      2|2023-07-02| 9000|     400|        6.2|            70|   1|
|      3|2023-07-03|16000|     620|       11.0|           130|   1|
+-------+----------+-----+--------+-----------+--------------+----+



In [15]:
# 5. Find Users Who Were Active for More Than 100 Minutes on Any Day
active_users = df.filter(col("active_minutes") > 100).select("user_id").distinct()
active_users.show()

+-------+
|user_id|
+-------+
|      3|
+-------+



In [16]:
# 6. Calculate the Total Calories Burned per Day
total_calories_per_day = df.groupBy("date").agg(sum("calories").alias("total_calories"))
total_calories_per_day.show()

+----------+--------------+
|      date|total_calories|
+----------+--------------+
|2023-07-02|          1400|
|2023-07-01|          1450|
|2023-07-03|          1390|
+----------+--------------+



In [17]:
# 7. Calculate the Average Steps per Day
avg_steps_per_day = df.groupBy("date").agg(mean("steps").alias("avg_steps"))
avg_steps_per_day.show()

+----------+------------------+
|      date|         avg_steps|
+----------+------------------+
|2023-07-02|           11000.0|
|2023-07-01|11666.666666666666|
|2023-07-03|           11000.0|
+----------+------------------+



In [18]:
# 8. Rank Users by Total Distance Traveled
total_distance = df.groupBy("user_id").agg(sum("distance_km").alias("total_distance"))
ranked_users = total_distance.orderBy(col("total_distance").desc())
ranked_users.show()

+-------+------------------+
|user_id|    total_distance|
+-------+------------------+
|      3|              30.2|
|      1|              23.5|
|      2|16.700000000000003|
+-------+------------------+



In [19]:
# 9. Find the Most Active User by Total Active Minutes
total_active_minutes = df.groupBy("user_id").agg(sum("active_minutes").alias("total_active_minutes"))
most_active_user = total_active_minutes.orderBy(col("total_active_minutes").desc()).limit(1)
most_active_user.show()

+-------+--------------------+
|user_id|total_active_minutes|
+-------+--------------------+
|      3|                 350|
+-------+--------------------+



In [20]:
# 10. Create a New Column for Calories Burned per Kilometer
df = df.withColumn("calories_per_km", col("calories") / col("distance_km"))
df.show()

+-------+----------+-----+--------+-----------+--------------+-----------------+
|user_id|      date|steps|calories|distance_km|active_minutes|  calories_per_km|
+-------+----------+-----+--------+-----------+--------------+-----------------+
|      1|2023-07-01|12000|     500|        8.5|            90| 58.8235294117647|
|      2|2023-07-01| 8000|     350|        5.6|            60|62.50000000000001|
|      3|2023-07-01|15000|     600|       10.2|           120|58.82352941176471|
|      1|2023-07-02|11000|     480|        7.9|            85|60.75949367088607|
|      2|2023-07-02| 9000|     400|        6.2|            70|64.51612903225806|
|      3|2023-07-02|13000|     520|        9.0|           100|57.77777777777778|
|      1|2023-07-03|10000|     450|        7.1|            80|63.38028169014085|
|      2|2023-07-03| 7000|     320|        4.9|            55| 65.3061224489796|
|      3|2023-07-03|16000|     620|       11.0|           130|56.36363636363637|
+-------+----------+-----+--

# **BOOK SALES DATA**

In [21]:
# Initialize SparkSession
spark = SparkSession.builder.appName("Book Sales").getOrCreate()

# Load dataset
df = spark.createDataFrame([
    (1, "The Catcher in the Rye", "J.D. Salinger", "Fiction", 15.99, 2, '2023-01-05'),
    (2, "To Kill a Mockingbird", "Harper Lee", "Fiction", 18.99, 1, '2023-01-10'),
    (3, "Becoming", "Michelle Obama", "Biography", 20.00, 3, '2023-02-12'),
    (4, "Sapiens", "Yuval Noah Harari", "Non-Fiction", 22.50, 1, '2023-02-15'),
    (5, "Educated", "Tara Westover", "Biography", 17.99, 2, '2023-03-10'),
    (6, "The Great Gatsby", "F. Scott Fitzgerald", "Fiction", 10.99, 5, '2023-03-15'),
    (7, "Atomic Habits", "James Clear", "Self-Help", 16.99, 3, '2023-04-01'),
    (8, "Dune", "Frank Herbert", "Science Fiction", 25.99, 1, '2023-04-10'),
    (9, "1984", "George Orwell", "Fiction", 14.99, 2, '2023-04-12'),
    (10, "The Power of Habit", "Charles Duhigg", "Self-Help", 18.00, 1, '2023-05-01')
], ["sale_id", "book_title", "author", "genre", "sale_price", "quantity", "date"])


In [22]:
# 1. Find Total Sales Revenue per Genre
df = df.withColumn("total_sales", col("sale_price") * col("quantity"))
total_sales_per_genre = df.groupBy("genre").agg(sum("total_sales").alias("total_sales"))
total_sales_per_genre.show()

+---------------+-----------------+
|          genre|      total_sales|
+---------------+-----------------+
|        Fiction|            135.9|
|    Non-Fiction|             22.5|
|      Biography|95.97999999999999|
|      Self-Help|            68.97|
|Science Fiction|            25.99|
+---------------+-----------------+



In [23]:
# 2. Filter Books Sold in the "Fiction" Genre
fiction_books = df.filter(col("genre") == "Fiction")
fiction_books.show()

+-------+--------------------+-------------------+-------+----------+--------+----------+-----------+
|sale_id|          book_title|             author|  genre|sale_price|quantity|      date|total_sales|
+-------+--------------------+-------------------+-------+----------+--------+----------+-----------+
|      1|The Catcher in th...|      J.D. Salinger|Fiction|     15.99|       2|2023-01-05|      31.98|
|      2|To Kill a Mocking...|         Harper Lee|Fiction|     18.99|       1|2023-01-10|      18.99|
|      6|    The Great Gatsby|F. Scott Fitzgerald|Fiction|     10.99|       5|2023-03-15|      54.95|
|      9|                1984|      George Orwell|Fiction|     14.99|       2|2023-04-12|      29.98|
+-------+--------------------+-------------------+-------+----------+--------+----------+-----------+



In [24]:
# 3. Find the Book with the Highest Sale Price
highest_price_book = df.orderBy(col("sale_price").desc()).limit(1)
highest_price_book.show()

+-------+----------+-------------+---------------+----------+--------+----------+-----------+
|sale_id|book_title|       author|          genre|sale_price|quantity|      date|total_sales|
+-------+----------+-------------+---------------+----------+--------+----------+-----------+
|      8|      Dune|Frank Herbert|Science Fiction|     25.99|       1|2023-04-10|      25.99|
+-------+----------+-------------+---------------+----------+--------+----------+-----------+



In [25]:
# 4. Calculate Total Quantity of Books Sold by Author
total_quantity_per_author = df.groupBy("author").agg(sum("quantity").alias("total_quantity"))
total_quantity_per_author.show()

+-------------------+--------------+
|             author|total_quantity|
+-------------------+--------------+
|         Harper Lee|             1|
|     Michelle Obama|             3|
|      J.D. Salinger|             2|
|  Yuval Noah Harari|             1|
|      Tara Westover|             2|
|      Frank Herbert|             1|
|      George Orwell|             2|
|F. Scott Fitzgerald|             5|
|     Charles Duhigg|             1|
|        James Clear|             3|
+-------------------+--------------+



In [26]:
# 5. Identify Sales Transactions Worth More Than $50
high_value_sales = df.filter(col("total_sales") > 50)
high_value_sales.show()

+-------+----------------+-------------------+---------+----------+--------+----------+-----------+
|sale_id|      book_title|             author|    genre|sale_price|quantity|      date|total_sales|
+-------+----------------+-------------------+---------+----------+--------+----------+-----------+
|      3|        Becoming|     Michelle Obama|Biography|      20.0|       3|2023-02-12|       60.0|
|      6|The Great Gatsby|F. Scott Fitzgerald|  Fiction|     10.99|       5|2023-03-15|      54.95|
|      7|   Atomic Habits|        James Clear|Self-Help|     16.99|       3|2023-04-01|      50.97|
+-------+----------------+-------------------+---------+----------+--------+----------+-----------+



In [27]:
# 6. Find the Average Sale Price per Genre
avg_price_per_genre = df.groupBy("genre").agg(mean("sale_price").alias("avg_sale_price"))
avg_price_per_genre.show()

+---------------+------------------+
|          genre|    avg_sale_price|
+---------------+------------------+
|        Fiction|15.239999999999998|
|    Non-Fiction|              22.5|
|      Biography|18.994999999999997|
|      Self-Help|17.494999999999997|
|Science Fiction|             25.99|
+---------------+------------------+



In [28]:
# 7. Count the Number of Unique Authors in the Dataset
unique_authors_count = df.select("author").distinct().count()
print(f"Unique authors count: {unique_authors_count}")

Unique authors count: 10


In [29]:
# 8. Find the Top 3 Best-Selling Books by Quantity
total_quantity_per_book = df.groupBy("book_title").agg(sum("quantity").alias("total_quantity"))
top_3_books = total_quantity_per_book.orderBy(col("total_quantity").desc()).limit(3)
top_3_books.show()

+----------------+--------------+
|      book_title|total_quantity|
+----------------+--------------+
|The Great Gatsby|             5|
|        Becoming|             3|
|   Atomic Habits|             3|
+----------------+--------------+



In [30]:
# 9. Calculate Total Sales for Each Month
from pyspark.sql.functions import date_format

df = df.withColumn("month", date_format("date", "yyyy-MM"))
total_sales_per_month = df.groupBy("month").agg(sum("total_sales").alias("total_sales"))
total_sales_per_month.show()

+-------+-----------+
|  month|total_sales|
+-------+-----------+
|2023-03|      90.93|
|2023-02|       82.5|
|2023-01|      50.97|
|2023-04|     106.94|
|2023-05|       18.0|
+-------+-----------+



In [31]:
# 10. Create a New Column for Total Sales Amount
df = df.withColumn("total_sales", col("sale_price") * col("quantity"))
df.show()


+-------+--------------------+-------------------+---------------+----------+--------+----------+-----------+-------+
|sale_id|          book_title|             author|          genre|sale_price|quantity|      date|total_sales|  month|
+-------+--------------------+-------------------+---------------+----------+--------+----------+-----------+-------+
|      1|The Catcher in th...|      J.D. Salinger|        Fiction|     15.99|       2|2023-01-05|      31.98|2023-01|
|      2|To Kill a Mocking...|         Harper Lee|        Fiction|     18.99|       1|2023-01-10|      18.99|2023-01|
|      3|            Becoming|     Michelle Obama|      Biography|      20.0|       3|2023-02-12|       60.0|2023-02|
|      4|             Sapiens|  Yuval Noah Harari|    Non-Fiction|      22.5|       1|2023-02-15|       22.5|2023-02|
|      5|            Educated|      Tara Westover|      Biography|     17.99|       2|2023-03-10|      35.98|2023-03|
|      6|    The Great Gatsby|F. Scott Fitzgerald|      

# **FOOD DELIVERY ORDERS**

In [52]:
# Initialize SparkSession
spark = SparkSession.builder.appName("Food Delivery Orders").getOrCreate()

# Sample Data
data = [
    (1, 201, "McDonald's", "Burger", 2, 5.99, 30, '2023-06-15'),
    (2, 202, "Pizza Hut", "Pizza", 1, 12.99, 45, '2023-06-16'),
    (3, 203, "KFC", "Fried Chicken", 3, 8.99, 25, '2023-06-17'),
    (4, 201, "Subway", "Sandwich", 2, 6.50, 20, '2023-06-17'),
    (5, 204, "Domino's", "Pizza", 2, 11.99, 40, '2023-06-18'),
    (6, 205, "Starbucks", "Coffee", 1, 4.50, 15, '2023-06-18'),
    (7, 202, "KFC", "Fried Chicken", 1, 8.99, 25, '2023-06-19'),
    (8, 206, "McDonald's", "Fries", 3, 2.99, 15, '2023-06-19'),
    (9, 207, "Burger King", "Burger", 1, 6.99, 30, '2023-06-20'),
    (10, 203, "Starbucks", "Coffee", 2, 4.50, 20, '2023-06-20')
]
columns = ["order_id", "customer_id", "restaurant_name", "food_item", "quantity", "price", "delivery_time_mins", "order_date"]

# Create DataFrame
df = spark.createDataFrame(data, columns)
df.show()

+--------+-----------+---------------+-------------+--------+-----+------------------+----------+
|order_id|customer_id|restaurant_name|    food_item|quantity|price|delivery_time_mins|order_date|
+--------+-----------+---------------+-------------+--------+-----+------------------+----------+
|       1|        201|     McDonald's|       Burger|       2| 5.99|                30|2023-06-15|
|       2|        202|      Pizza Hut|        Pizza|       1|12.99|                45|2023-06-16|
|       3|        203|            KFC|Fried Chicken|       3| 8.99|                25|2023-06-17|
|       4|        201|         Subway|     Sandwich|       2|  6.5|                20|2023-06-17|
|       5|        204|       Domino's|        Pizza|       2|11.99|                40|2023-06-18|
|       6|        205|      Starbucks|       Coffee|       1|  4.5|                15|2023-06-18|
|       7|        202|            KFC|Fried Chicken|       1| 8.99|                25|2023-06-19|
|       8|        20

In [33]:
# 1. Calculate Total Revenue per Restaurant
df = df.withColumn("total_order_value", col("price") * col("quantity"))
total_revenue_per_restaurant = df.groupBy("restaurant_name").agg(sum("total_order_value").alias("total_revenue"))
total_revenue_per_restaurant.show()

+---------------+------------------+
|restaurant_name|     total_revenue|
+---------------+------------------+
|         Subway|              13.0|
|      Pizza Hut|             12.99|
|            KFC|             35.96|
|       Domino's|             23.98|
|     McDonald's|20.950000000000003|
|    Burger King|              6.99|
|      Starbucks|              13.5|
+---------------+------------------+



In [34]:
# 2. Find the Fastest Delivery
fastest_delivery = df.orderBy("delivery_time_mins").limit(1)
fastest_delivery.show()

+--------+-----------+---------------+---------+--------+-----+------------------+----------+-----------------+
|order_id|customer_id|restaurant_name|food_item|quantity|price|delivery_time_mins|order_date|total_order_value|
+--------+-----------+---------------+---------+--------+-----+------------------+----------+-----------------+
|       6|        205|      Starbucks|   Coffee|       1|  4.5|                15|2023-06-18|              4.5|
+--------+-----------+---------------+---------+--------+-----+------------------+----------+-----------------+



In [35]:
# 3. Calculate Average Delivery Time per Restaurant
avg_delivery_time_per_restaurant = df.groupBy("restaurant_name").agg(mean("delivery_time_mins").alias("avg_delivery_time"))
avg_delivery_time_per_restaurant.show()

+---------------+-----------------+
|restaurant_name|avg_delivery_time|
+---------------+-----------------+
|         Subway|             20.0|
|      Pizza Hut|             45.0|
|            KFC|             25.0|
|       Domino's|             40.0|
|     McDonald's|             22.5|
|    Burger King|             30.0|
|      Starbucks|             17.5|
+---------------+-----------------+



In [36]:
# 4. Filter Orders for a Specific Customer
customer_orders = df.filter(col("customer_id") == 201)
customer_orders.show()

+--------+-----------+---------------+---------+--------+-----+------------------+----------+-----------------+
|order_id|customer_id|restaurant_name|food_item|quantity|price|delivery_time_mins|order_date|total_order_value|
+--------+-----------+---------------+---------+--------+-----+------------------+----------+-----------------+
|       1|        201|     McDonald's|   Burger|       2| 5.99|                30|2023-06-15|            11.98|
|       4|        201|         Subway| Sandwich|       2|  6.5|                20|2023-06-17|             13.0|
+--------+-----------+---------------+---------+--------+-----+------------------+----------+-----------------+



In [37]:
# 5. Find Orders Where Total Amount Spent is Greater Than $20
high_value_orders = df.filter(col("total_order_value") > 20)
high_value_orders.show()

+--------+-----------+---------------+-------------+--------+-----+------------------+----------+-----------------+
|order_id|customer_id|restaurant_name|    food_item|quantity|price|delivery_time_mins|order_date|total_order_value|
+--------+-----------+---------------+-------------+--------+-----+------------------+----------+-----------------+
|       3|        203|            KFC|Fried Chicken|       3| 8.99|                25|2023-06-17|            26.97|
|       5|        204|       Domino's|        Pizza|       2|11.99|                40|2023-06-18|            23.98|
+--------+-----------+---------------+-------------+--------+-----+------------------+----------+-----------------+



In [38]:
# 6. Calculate the Total Quantity of Each Food Item Sold
total_quantity_per_food_item = df.groupBy("food_item").agg(sum("quantity").alias("total_quantity"))
total_quantity_per_food_item.show()

+-------------+--------------+
|    food_item|total_quantity|
+-------------+--------------+
|       Burger|             3|
|     Sandwich|             2|
|        Pizza|             3|
|Fried Chicken|             4|
|        Fries|             3|
|       Coffee|             3|
+-------------+--------------+



In [48]:
# 7. Find the Top 3 Most Popular Restaurants by Number of Orders
order_per_restaurant = dff.groupBy("restaurant_name").count()
top_3_restaurants = order_per_restaurant.orderBy(col("count").desc()).limit(3)
top_3_restaurants.show()

+---------------+-----+
|restaurant_name|count|
+---------------+-----+
|            KFC|    2|
|     McDonald's|    2|
|      Starbucks|    2|
+---------------+-----+



In [55]:
# 8. Calculate Total Revenue per Day
total_revenue_per_day = df.groupBy("order_date").agg(sum("price").alias("total_revenue"))
total_revenue_per_day.show()

+----------+------------------+
|order_date|     total_revenue|
+----------+------------------+
|2023-06-16|             12.99|
|2023-06-18|16.490000000000002|
|2023-06-15|              5.99|
|2023-06-17|             15.49|
|2023-06-20|             11.49|
|2023-06-19|             11.98|
+----------+------------------+



In [57]:
# 9. Find the Longest Delivery Time for Each Restaurant
longest_delivery_time_per_restaurant = df.groupBy("restaurant_name").agg(max("delivery_time_mins").alias("longest_delivery_time"))
longest_delivery_time_per_restaurant.show()

+---------------+---------------------+
|restaurant_name|longest_delivery_time|
+---------------+---------------------+
|         Subway|                   20|
|      Pizza Hut|                   45|
|            KFC|                   25|
|       Domino's|                   40|
|     McDonald's|                   30|
|    Burger King|                   30|
|      Starbucks|                   20|
+---------------+---------------------+



In [58]:
# 10. Create a New Column for Total Order Value
df = df.withColumn("total_order_value", col("price") * col("quantity"))
df.show()


+--------+-----------+---------------+-------------+--------+-----+------------------+----------+-----------------+
|order_id|customer_id|restaurant_name|    food_item|quantity|price|delivery_time_mins|order_date|total_order_value|
+--------+-----------+---------------+-------------+--------+-----+------------------+----------+-----------------+
|       1|        201|     McDonald's|       Burger|       2| 5.99|                30|2023-06-15|            11.98|
|       2|        202|      Pizza Hut|        Pizza|       1|12.99|                45|2023-06-16|            12.99|
|       3|        203|            KFC|Fried Chicken|       3| 8.99|                25|2023-06-17|            26.97|
|       4|        201|         Subway|     Sandwich|       2|  6.5|                20|2023-06-17|             13.0|
|       5|        204|       Domino's|        Pizza|       2|11.99|                40|2023-06-18|            23.98|
|       6|        205|      Starbucks|       Coffee|       1|  4.5|     

# **WEATHER DATA**

In [59]:
# Initialize SparkSession
spark = SparkSession.builder.appName("Weather Data").getOrCreate()

# Sample Data with Alternate Column Names
data = [
    ('2023-01-01', 'New York', 5, 60, 20, 'Cloudy'),
    ('2023-01-01', 'Los Angeles', 15, 40, 10, 'Sunny'),
    ('2023-01-01', 'Chicago', -2, 75, 25, 'Snow'),
    ('2023-01-02', 'New York', 3, 65, 15, 'Rain'),
    ('2023-01-02', 'Los Angeles', 18, 35, 8, 'Sunny'),
    ('2023-01-02', 'Chicago', -5, 80, 30, 'Snow'),
    ('2023-01-03', 'New York', 6, 55, 22, 'Sunny'),
    ('2023-01-03', 'Los Angeles', 20, 38, 12, 'Sunny'),
    ('2023-01-03', 'Chicago', -1, 70, 18, 'Cloudy')
]
columns = ["date", "city_name", "temp_celsius", "humidity_percent", "wind_speed_kph", "weather_condition"]

# Create DataFrame
weather_df = spark.createDataFrame(data, columns)
weather_df.show()

+----------+-----------+------------+----------------+--------------+-----------------+
|      date|  city_name|temp_celsius|humidity_percent|wind_speed_kph|weather_condition|
+----------+-----------+------------+----------------+--------------+-----------------+
|2023-01-01|   New York|           5|              60|            20|           Cloudy|
|2023-01-01|Los Angeles|          15|              40|            10|            Sunny|
|2023-01-01|    Chicago|          -2|              75|            25|             Snow|
|2023-01-02|   New York|           3|              65|            15|             Rain|
|2023-01-02|Los Angeles|          18|              35|             8|            Sunny|
|2023-01-02|    Chicago|          -5|              80|            30|             Snow|
|2023-01-03|   New York|           6|              55|            22|            Sunny|
|2023-01-03|Los Angeles|          20|              38|            12|            Sunny|
|2023-01-03|    Chicago|        

In [62]:
# 1. Find the Average Temperature for Each City
avg_temperature_per_city = weather_df.groupBy("city_name").agg(avg("temp_celsius").alias("avg_temperature"))
avg_temperature_per_city.show()

+-----------+-------------------+
|  city_name|    avg_temperature|
+-----------+-------------------+
|Los Angeles| 17.666666666666668|
|    Chicago|-2.6666666666666665|
|   New York|  4.666666666666667|
+-----------+-------------------+



In [63]:
# 2. Filter Days with Temperature Below Freezing
below_freezing_days = weather_df.filter(col("temp_celsius") < 0)
below_freezing_days.show()

+----------+---------+------------+----------------+--------------+-----------------+
|      date|city_name|temp_celsius|humidity_percent|wind_speed_kph|weather_condition|
+----------+---------+------------+----------------+--------------+-----------------+
|2023-01-01|  Chicago|          -2|              75|            25|             Snow|
|2023-01-02|  Chicago|          -5|              80|            30|             Snow|
|2023-01-03|  Chicago|          -1|              70|            18|           Cloudy|
+----------+---------+------------+----------------+--------------+-----------------+



In [64]:
# 3. Find the City with the Highest Wind Speed on a Specific Day
highest_wind_speed_day = weather_df.filter(col("date") == "2023-01-02").orderBy(col("wind_speed_kph").desc()).limit(1)
highest_wind_speed_day.show()

+----------+---------+------------+----------------+--------------+-----------------+
|      date|city_name|temp_celsius|humidity_percent|wind_speed_kph|weather_condition|
+----------+---------+------------+----------------+--------------+-----------------+
|2023-01-02|  Chicago|          -5|              80|            30|             Snow|
+----------+---------+------------+----------------+--------------+-----------------+



In [65]:
# 4. Calculate the Total Number of Days with Rainy Weather
rainy_days_count = weather_df.filter(col("weather_condition") == "Rain").count()
print(f"Total number of days with rainy weather: {rainy_days_count}")

Total number of days with rainy weather: 1


In [66]:
# 5. Calculate the Average Humidity for Each Weather Condition
avg_humidity_per_condition = weather_df.groupBy("weather_condition").agg(avg("humidity_percent").alias("avg_humidity"))
avg_humidity_per_condition.show()

+-----------------+------------+
|weather_condition|avg_humidity|
+-----------------+------------+
|           Cloudy|        65.0|
|            Sunny|        42.0|
|             Snow|        77.5|
|             Rain|        65.0|
+-----------------+------------+



In [67]:
# 6. Find the Hottest Day in Each City
hottest_day_per_city = weather_df.groupBy("city_name").agg(max("temp_celsius").alias("hottest_temperature"))
hottest_day_per_city.show()

+-----------+-------------------+
|  city_name|hottest_temperature|
+-----------+-------------------+
|Los Angeles|                 20|
|    Chicago|                 -1|
|   New York|                  6|
+-----------+-------------------+



In [69]:
# 7. Identify Cities That Experienced Snow
cities_with_snow = weather_df.filter(col("weather_condition") == "Snow").select("city_name").distinct()
cities_with_snow.show()

+---------+
|city_name|
+---------+
|  Chicago|
+---------+



In [71]:
# 8. Calculate the Average Wind Speed for Days When the Condition was Sunny
avg_wind_speed_sunny_days = weather_df.filter(col("weather_condition") == "Sunny").agg(avg("wind_speed_kph").alias("avg_wind_speed"))
avg_wind_speed_sunny_days.show()

+--------------+
|avg_wind_speed|
+--------------+
|          13.0|
+--------------+



In [72]:
# 9. Find the Coldest Day Across All Cities
coldest_day = weather_df.orderBy(col("temp_celsius").asc()).limit(1)
coldest_day.show()

+----------+---------+------------+----------------+--------------+-----------------+
|      date|city_name|temp_celsius|humidity_percent|wind_speed_kph|weather_condition|
+----------+---------+------------+----------------+--------------+-----------------+
|2023-01-02|  Chicago|          -5|              80|            30|             Snow|
+----------+---------+------------+----------------+--------------+-----------------+



In [73]:
# 10. Create a New Column for Wind Chill
weather_df = weather_df.withColumn("wind_chill",
                                   13.12 + 0.6215 * col("temp_celsius") -
                                   11.37 * (col("wind_speed_kph") ** 0.16) +
                                   0.3965 * col("temp_celsius") * (col("wind_speed_kph") ** 0.16))
weather_df.show()

+----------+-----------+------------+----------------+--------------+-----------------+-------------------+
|      date|  city_name|temp_celsius|humidity_percent|wind_speed_kph|weather_condition|         wind_chill|
+----------+-----------+------------+----------------+--------------+-----------------+-------------------+
|2023-01-01|   New York|           5|              60|            20|           Cloudy| 1.0669572525115663|
|2023-01-01|Los Angeles|          15|              40|            10|            Sunny| 14.604602843130213|
|2023-01-01|    Chicago|          -2|              75|            25|             Snow| -8.479874917414646|
|2023-01-02|   New York|           3|              65|            15|             Rain|-0.7170927775232809|
|2023-01-02|Los Angeles|          18|              35|             8|            Sunny| 18.403050060338735|
|2023-01-02|    Chicago|          -5|              80|            30|             Snow|-12.996724811921073|
|2023-01-03|   New York|    

# **AIRLINE FLIGHT DATA**

In [74]:
# Initialize SparkSession
spark = SparkSession.builder.appName("Airline Flight Data").getOrCreate()

# Sample Data with Alternate Column Names
data = [
    (1, "Delta", "DL123", "JFK", "LAX", "08:00", "11:00", 30, 3970, '2023-07-01'),
    (2, "United", "UA456", "SFO", "ORD", "09:30", "15:00", 45, 2960, '2023-07-01'),
    (3, "Southwest", "SW789", "DAL", "ATL", "06:00", "08:30", 0, 1150, '2023-07-01'),
    (4, "Delta", "DL124", "LAX", "JFK", "12:00", "20:00", 20, 3970, '2023-07-02'),
    (5, "American", "AA101", "MIA", "DEN", "07:00", "10:00", 15, 2770, '2023-07-02'),
    (6, "United", "UA457", "ORD", "SFO", "11:00", "14:30", 0, 2960, '2023-07-02'),
    (7, "JetBlue", "JB302", "BOS", "LAX", "06:30", "09:45", 10, 4180, '2023-07-03')
]
columns = ["flight_id", "airline_name", "flight_code", "departure_city", "arrival_city", "departure_time", "arrival_time", "delay_minutes", "distance_km", "flight_date"]

# Create DataFrame
flight_df = spark.createDataFrame(data, columns)
flight_df.show()


+---------+------------+-----------+--------------+------------+--------------+------------+-------------+-----------+-----------+
|flight_id|airline_name|flight_code|departure_city|arrival_city|departure_time|arrival_time|delay_minutes|distance_km|flight_date|
+---------+------------+-----------+--------------+------------+--------------+------------+-------------+-----------+-----------+
|        1|       Delta|      DL123|           JFK|         LAX|         08:00|       11:00|           30|       3970| 2023-07-01|
|        2|      United|      UA456|           SFO|         ORD|         09:30|       15:00|           45|       2960| 2023-07-01|
|        3|   Southwest|      SW789|           DAL|         ATL|         06:00|       08:30|            0|       1150| 2023-07-01|
|        4|       Delta|      DL124|           LAX|         JFK|         12:00|       20:00|           20|       3970| 2023-07-02|
|        5|    American|      AA101|           MIA|         DEN|         07:00|    

In [75]:
# 1. Find the Total Distance Traveled by Each Airline
total_distance_per_airline = flight_df.groupBy("airline_name").agg(sum("distance_km").alias("total_distance"))
total_distance_per_airline.show()

+------------+--------------+
|airline_name|total_distance|
+------------+--------------+
|       Delta|          7940|
|      United|          5920|
|   Southwest|          1150|
|     JetBlue|          4180|
|    American|          2770|
+------------+--------------+



In [76]:
# 2. Filter Flights with Delays Greater than 30 Minutes
flights_with_delays = flight_df.filter(col("delay_minutes") > 30)
flights_with_delays.show()

+---------+------------+-----------+--------------+------------+--------------+------------+-------------+-----------+-----------+
|flight_id|airline_name|flight_code|departure_city|arrival_city|departure_time|arrival_time|delay_minutes|distance_km|flight_date|
+---------+------------+-----------+--------------+------------+--------------+------------+-------------+-----------+-----------+
|        2|      United|      UA456|           SFO|         ORD|         09:30|       15:00|           45|       2960| 2023-07-01|
+---------+------------+-----------+--------------+------------+--------------+------------+-------------+-----------+-----------+



In [77]:
# 3. Find the Flight with the Longest Distance
longest_distance_flight = flight_df.orderBy(col("distance_km").desc()).limit(1)
longest_distance_flight.show()

+---------+------------+-----------+--------------+------------+--------------+------------+-------------+-----------+-----------+
|flight_id|airline_name|flight_code|departure_city|arrival_city|departure_time|arrival_time|delay_minutes|distance_km|flight_date|
+---------+------------+-----------+--------------+------------+--------------+------------+-------------+-----------+-----------+
|        7|     JetBlue|      JB302|           BOS|         LAX|         06:30|       09:45|           10|       4180| 2023-07-03|
+---------+------------+-----------+--------------+------------+--------------+------------+-------------+-----------+-----------+



In [78]:
# 4. Calculate the Average Delay Time for Each Airline
avg_delay_per_airline = flight_df.groupBy("airline_name").agg(avg("delay_minutes").alias("avg_delay"))
avg_delay_per_airline.show()

+------------+---------+
|airline_name|avg_delay|
+------------+---------+
|       Delta|     25.0|
|      United|     22.5|
|   Southwest|      0.0|
|     JetBlue|     10.0|
|    American|     15.0|
+------------+---------+



In [79]:
# 5. Identify Flights That Were Not Delayed
on_time_flights = flight_df.filter(col("delay_minutes") == 0)
on_time_flights.show()

+---------+------------+-----------+--------------+------------+--------------+------------+-------------+-----------+-----------+
|flight_id|airline_name|flight_code|departure_city|arrival_city|departure_time|arrival_time|delay_minutes|distance_km|flight_date|
+---------+------------+-----------+--------------+------------+--------------+------------+-------------+-----------+-----------+
|        3|   Southwest|      SW789|           DAL|         ATL|         06:00|       08:30|            0|       1150| 2023-07-01|
|        6|      United|      UA457|           ORD|         SFO|         11:00|       14:30|            0|       2960| 2023-07-02|
+---------+------------+-----------+--------------+------------+--------------+------------+-------------+-----------+-----------+



In [80]:
# 6. Find the Top 3 Most Frequent Routes
top_routes = flight_df.groupBy("departure_city", "arrival_city").agg(count("flight_id").alias("route_count"))
top_3_routes = top_routes.orderBy(col("route_count").desc()).limit(3)
top_3_routes.show()

+--------------+------------+-----------+
|departure_city|arrival_city|route_count|
+--------------+------------+-----------+
|           SFO|         ORD|          1|
|           JFK|         LAX|          1|
|           DAL|         ATL|          1|
+--------------+------------+-----------+



In [81]:
# 7. Calculate the Total Number of Flights per Day
flights_per_day = flight_df.groupBy("flight_date").agg(count("flight_id").alias("total_flights"))
flights_per_day.show()

+-----------+-------------+
|flight_date|total_flights|
+-----------+-------------+
| 2023-07-01|            3|
| 2023-07-03|            1|
| 2023-07-02|            3|
+-----------+-------------+



In [82]:
# 8. Find the Airline with the Most Flights
most_flights_airline = flight_df.groupBy("airline_name").agg(count("flight_id").alias("total_flights")).orderBy(col("total_flights").desc()).limit(1)
most_flights_airline.show()


+------------+-------------+
|airline_name|total_flights|
+------------+-------------+
|       Delta|            2|
+------------+-------------+



In [83]:
# 9. Calculate the Average Flight Distance per Day
avg_distance_per_day = flight_df.groupBy("flight_date").agg(avg("distance_km").alias("avg_distance"))
avg_distance_per_day.show()

+-----------+------------------+
|flight_date|      avg_distance|
+-----------+------------------+
| 2023-07-01|2693.3333333333335|
| 2023-07-03|            4180.0|
| 2023-07-02|3233.3333333333335|
+-----------+------------------+



In [84]:
# 10. Create a New Column for On-Time Status
flight_df = flight_df.withColumn("on_time", col("delay_minutes") == 0)
flight_df.show()

+---------+------------+-----------+--------------+------------+--------------+------------+-------------+-----------+-----------+-------+
|flight_id|airline_name|flight_code|departure_city|arrival_city|departure_time|arrival_time|delay_minutes|distance_km|flight_date|on_time|
+---------+------------+-----------+--------------+------------+--------------+------------+-------------+-----------+-----------+-------+
|        1|       Delta|      DL123|           JFK|         LAX|         08:00|       11:00|           30|       3970| 2023-07-01|  false|
|        2|      United|      UA456|           SFO|         ORD|         09:30|       15:00|           45|       2960| 2023-07-01|  false|
|        3|   Southwest|      SW789|           DAL|         ATL|         06:00|       08:30|            0|       1150| 2023-07-01|   true|
|        4|       Delta|      DL124|           LAX|         JFK|         12:00|       20:00|           20|       3970| 2023-07-02|  false|
|        5|    American|   