# In this tutorial you will learn 

- how to create spark session
- how to load data and check schema
- how to check duplicate records
- how to check missing values
- how to extract date and time from datetime column
- how to create new columns
- how to group, aggregate, and filter data
- how to make meaningful business insights

@Author: Ivy Wang





In [3]:
!pip3 install pyspark




## 1. Create spark session

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('tutorial').getOrCreate()


## 2. Load dataset

In [5]:
# read dataset and show the header
df = spark.read.option('header','true').csv('/Users/ivyw/pyspark/bike_rental.csv')

In [6]:
# check schema
df.printSchema()

root
 |-- rental_id: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- duration_ms: string (nullable = true)
 |-- bike_id: string (nullable = true)
 |-- bike_model: string (nullable = true)
 |-- end_date: string (nullable = true)
 |-- end_station_id: string (nullable = true)
 |-- end_station_name: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- start_station_id: string (nullable = true)
 |-- start_station_name: string (nullable = true)
 |-- end_station_logical_terminal: string (nullable = true)
 |-- start_station_logical_terminal: string (nullable = true)
 |-- end_station_priority_id: string (nullable = true)



In [7]:
df.show()

+---------+--------+-----------+-------+----------+--------------------+--------------+--------------------+--------------------+----------------+--------------------+----------------------------+------------------------------+-----------------------+
|rental_id|duration|duration_ms|bike_id|bike_model|            end_date|end_station_id|    end_station_name|          start_date|start_station_id|  start_station_name|end_station_logical_terminal|start_station_logical_terminal|end_station_priority_id|
+---------+--------+-----------+-------+----------+--------------------+--------------+--------------------+--------------------+----------------+--------------------+----------------------------+------------------------------+-----------------------+
|125727958|     751|     751211|  55054|   CLASSIC|2022-10-06 21:00:...|        200243|St. John's Park, ...|2022-10-06 20:48:...|          200163|Jubilee Plaza, Ca...|                        NULL|                          NULL|                 

## 3. Check duplicates and missing values

In [8]:
# check duplicated records

#df.columns
df_dup = df.groupBy('rental_id',
 'duration',
 'duration_ms',
 'bike_id',
 'bike_model',
 'end_date',
 'end_station_id',
 'end_station_name',
 'start_date',
 'start_station_id',
 'start_station_name',
 'end_station_logical_terminal',
 'start_station_logical_terminal',
 'end_station_priority_id').count().filter('count>1').show()


[Stage 2:>                                                          (0 + 2) / 2]

+---------+--------+-----------+-------+----------+--------+--------------+----------------+----------+----------------+------------------+----------------------------+------------------------------+-----------------------+-----+
|rental_id|duration|duration_ms|bike_id|bike_model|end_date|end_station_id|end_station_name|start_date|start_station_id|start_station_name|end_station_logical_terminal|start_station_logical_terminal|end_station_priority_id|count|
+---------+--------+-----------+-------+----------+--------+--------------+----------------+----------+----------------+------------------+----------------------------+------------------------------+-----------------------+-----+
+---------+--------+-----------+-------+----------+--------+--------------+----------------+----------+----------------+------------------+----------------------------+------------------------------+-----------------------+-----+



                                                                                

In [9]:
# check missing values
from pyspark.sql.functions import col, sum

# Check for missing values in all columns
missing_values_count = df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns])

# Display the results
missing_values_count.show()

+---------+--------+-----------+-------+----------+--------+--------------+----------------+----------+----------------+------------------+----------------------------+------------------------------+-----------------------+
|rental_id|duration|duration_ms|bike_id|bike_model|end_date|end_station_id|end_station_name|start_date|start_station_id|start_station_name|end_station_logical_terminal|start_station_logical_terminal|end_station_priority_id|
+---------+--------+-----------+-------+----------+--------+--------------+----------------+----------+----------------+------------------+----------------------------+------------------------------+-----------------------+
|        0|       0|          0|      0|     13432|       0|           502|               0|         0|               0|                 0|                       47368|                         47368|                  47368|
+---------+--------+-----------+-------+----------+--------+--------------+----------------+----------+-

## 4. Extract date and time from datetime column

In [10]:
# parse datetime and extract year, month, day of week, hour of day
from pyspark.sql.functions import year, month, dayofweek, hour

# Extract year
df = df.withColumn("year", year("start_date"))

# Extract month
df = df.withColumn("month", month("start_date"))

# Extract day of the week (1 = Sunday, 2 = Monday, ..., 7 = Saturday)
df = df.withColumn("day_of_week", dayofweek("start_date"))

# Extract hour of the day
df = df.withColumn("hour_of_day", hour("start_date"))

In [11]:
# add a new column called duration_min
from pyspark.sql.functions import col, round
df = df.withColumn("duration_min",df["duration"]/60)
df = df.withColumn("duration_min", round(col("duration_min"),2))

In [12]:
df.show()

+---------+--------+-----------+-------+----------+--------------------+--------------+--------------------+--------------------+----------------+--------------------+----------------------------+------------------------------+-----------------------+----+-----+-----------+-----------+------------+
|rental_id|duration|duration_ms|bike_id|bike_model|            end_date|end_station_id|    end_station_name|          start_date|start_station_id|  start_station_name|end_station_logical_terminal|start_station_logical_terminal|end_station_priority_id|year|month|day_of_week|hour_of_day|duration_min|
+---------+--------+-----------+-------+----------+--------------------+--------------+--------------------+--------------------+----------------+--------------------+----------------------------+------------------------------+-----------------------+----+-----+-----------+-----------+------------+
|125727958|     751|     751211|  55054|   CLASSIC|2022-10-06 21:00:...|        200243|St. John's Pa

In [13]:
# check the distribution of duration_min

df.describe(['duration_min']).show()

+-------+------------------+
|summary|      duration_min|
+-------+------------------+
|  count|             47368|
|   mean| 36.81580645161297|
| stddev|228.91379025049127|
|    min|              0.05|
|    max|          24055.77|
+-------+------------------+



## 5. Group, aggregate, and filter data

In [14]:
# find out the insights of date time and rental

df.groupby('year').count().show()

+----+-----+
|year|count|
+----+-----+
|2023| 3415|
|2022|43953|
+----+-----+



In [15]:
# find out the insights of date time and rental

df.groupby(['year','month']).count().show()

+----+-----+-----+
|year|month|count|
+----+-----+-----+
|2022|   10|12038|
|2022|    2|  779|
|2022|    7| 2159|
|2022|   11| 8424|
|2022|    3| 1425|
|2022|    1|  796|
|2022|    5| 1859|
|2022|    6| 2270|
|2022|    9| 4608|
|2022|    4| 1670|
|2022|   12| 5845|
|2023|    1| 3415|
|2022|    8| 2080|
+----+-----+-----+



In [16]:
# with filter
df.filter(df['year']== '2022').groupby(['year','month']).count().orderBy('month').show()


+----+-----+-----+
|year|month|count|
+----+-----+-----+
|2022|    1|  796|
|2022|    2|  779|
|2022|    3| 1425|
|2022|    4| 1670|
|2022|    5| 1859|
|2022|    6| 2270|
|2022|    7| 2159|
|2022|    8| 2080|
|2022|    9| 4608|
|2022|   10|12038|
|2022|   11| 8424|
|2022|   12| 5845|
+----+-----+-----+



In [17]:
# which time of day that people prefer renting a bike?
df.filter(df['year']== '2022').groupby(['hour_of_day']).count().orderBy('hour_of_day').show()



+-----------+-----+
|hour_of_day|count|
+-----------+-----+
|          0| 1015|
|          1|  687|
|          2|  478|
|          3|  310|
|          4|  190|
|          5|  159|
|          6|  153|
|          7|  385|
|          8| 1255|
|          9| 2467|
|         10| 2839|
|         11| 2019|
|         12| 2002|
|         13| 2445|
|         14| 2599|
|         15| 2741|
|         16| 2771|
|         17| 3008|
|         18| 3618|
|         19| 4133|
+-----------+-----+
only showing top 20 rows



                                                                                

## 6. Make business insights

In [18]:
# it's not clear, i ll add a new column called time_of_day

from pyspark.sql.functions import when

df = df.withColumn(
    "time_of_day",
    when((col("hour_of_day") >= 6) & (col("hour_of_day") < 12), "morning")
    .when((col("hour_of_day") >= 12) & (col("hour_of_day") < 18), "afternoon")
    .when((col("hour_of_day") >= 18) & (col("hour_of_day") < 24), "evening")
    .otherwise("night")
)


In [19]:
df.filter(df['year']== '2022').groupby(['time_of_day']).count().orderBy('time_of_day').show()

+-----------+-----+
|time_of_day|count|
+-----------+-----+
|  afternoon|15566|
|    evening|16430|
|    morning| 9118|
|      night| 2839|
+-----------+-----+



In [28]:
# which bikes are the most popular ones 


data = df.filter(df['year'] == '2022') \
              .groupBy('bike_id') \
              .count() \
              .orderBy('count', ascending=False) \
              .withColumnRenamed('count', 'Bike_rent') \
              .groupBy('Bike_rent').count().orderBy('Bike_rent').show()



+---------+-----+
|Bike_rent|count|
+---------+-----+
|        1| 6402|
|        2| 4386|
|        3| 2836|
|        4| 1799|
|        5| 1056|
|        6|  583|
|        7|  302|
|        8|  142|
|        9|   68|
|       10|   28|
|       11|   13|
|       12|    1|
+---------+-----+



In [29]:
# close the session and clean up
spark.stop()