# Supervised ML

The goal of this model is to predict the ridership that occurs within the University of Chicago Lyft Program Area. We will do this by using as features the ridership counts of other Chicago community areas, as well as using weather. The labels are the daily ridership counts within the program area.

We will create the model that functions up until the introduction of the University Lyft program and then look at the difference between the predictions and the actual ridership as a rough estimate of the effect of the program on rideshare usage in the area. We will do this by looking at both the change when the program was introduced, as well as when the program was reduced from 10 rides of up to 15 dollars each, to 7 rides up to 10 dollars. 

In [1]:
# read in packages create spark environment
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

import matplotlib.pyplot as plt
%matplotlib inline

spark = SparkSession.builder.appName('supervised').getOrCreate()

#change configuration settings on Spark 
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '4g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','4g')])

#print spark configuration settings
spark.sparkContext.getConf().getAll()

[('spark.stage.maxConsecutiveAttempts', '10'),
 ('spark.dynamicAllocation.minExecutors', '1'),
 ('spark.eventLog.enabled', 'true'),
 ('spark.submit.pyFiles',
  '/root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,/root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,/root/.ivy2/jars/com.typesafe_config-1.4.2.jar,/root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,/root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,/root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,/root/.ivy2/jars/com.navigamez_greex-1.0.jar,/root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,/root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,/root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,/root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,/root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,/root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-

### Reading in cleaned data, partitioning

In [2]:
# read in rideshare data for all years, concatenate, create appropriate partitioning
# we are dropping 2020 because covid will affect the performance of our model

df_2018 = spark.read.csv("gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2018.csv", inferSchema=True, header=True)
df_2019 = spark.read.csv("gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2019.csv", inferSchema=True, header=True)
df_2021 = spark.read.csv("gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2021.csv", inferSchema=True, header=True)
df_2022 = spark.read.csv("gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2022.csv", inferSchema=True, header=True)
df_2023 = spark.read.csv("gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2023.csv", inferSchema=True, header=True)

# dropping new columns in 2023
df_2023 = df_2023.drop('Shared Trip Match','Percent Time Chicago','Percent Distance Chicago')

df_all = df_2018.union(df_2019).union(df_2021).union(df_2022).union(df_2023)
df_all.show(5)

                                                                                

+--------------------+-------------------+-------------------+-------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+-------------+--------------+-----+------------+----+---+
|                  ID|    start_timestamp|      end_timestamp|seconds|miles|pickup_tract|dropoff_tract|pickup_area|dropoff_area|Fare|Tip|total|   pickup_lat|    pickup_lon|  dropoff_lat|   dropoff_lon|month|day_of_month|hour|day|
+--------------------+-------------------+-------------------+-------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+-------------+--------------+-----+------------+----+---+
|625e77ae6e0ff7191...|2018-11-06 19:00:00|2018-11-06 19:15:00|   1142|  5.8| 17031063400|  17031010400|          6|           1|12.5|  0| 15.0|41.9346591566|-87.6467297286| 42.004764559| -87.659122427|   11|           6|  19|  3|
|62945fdb2e70957f0...|2018-11-06 19:00:00|2018-11-06 19:00:00|    341|  1.2| 170

In [4]:
#display number of records by partition
def displaypartitions(df):
    #number of records by partition
    num = df.rdd.getNumPartitions()
    print("Partitions:", num)
    df.withColumn("partitionId", F.spark_partition_id())\
        .groupBy("partitionId")\
        .count()\
        .orderBy(F.asc("count"))\
        .show(num)

df_all.rdd.getNumPartitions()
displaypartitions(df_all)


Partitions: 534




+-----------+------+
|partitionId| count|
+-----------+------+
|         33|152646|
|        233|328837|
|        232|328975|
|        231|329131|
|        230|329163|
|        229|329209|
|        227|329245|
|        228|329263|
|        225|329263|
|        224|329311|
|        226|329315|
|        222|329332|
|        223|329344|
|        221|329373|
|        218|329389|
|        219|329390|
|        217|329399|
|        216|329410|
|        215|329410|
|        214|329418|
|        220|329427|
|        213|329428|
|        210|329461|
|        212|329481|
|        211|329505|
|        207|329507|
|        208|329513|
|        209|329519|
|        206|329523|
|        204|329533|
|        203|329555|
|        205|329574|
|        201|329587|
|        202|329591|
|        198|329607|
|        200|329623|
|        196|329624|
|        199|329630|
|        197|329633|
|        195|329646|
|        192|329654|
|        194|329673|
|        193|329678|
|        184|329704|
|        191|

                                                                                

In [6]:
# repartitioning to 600 partitions, seems to be balanced now. 
df_all = df_all.repartition(600)
displaypartitions(df_all)



Partitions: 600




+-----------+------+
|partitionId| count|
+-----------+------+
|        263|362150|
|        258|362151|
|        265|362151|
|        256|362151|
|        259|362151|
|        255|362152|
|        267|362152|
|        266|362152|
|        257|362152|
|        262|362152|
|        260|362152|
|        264|362152|
|        254|362153|
|        261|362153|
|        181|362154|
|        268|362154|
|        179|362154|
|        188|362154|
|        180|362154|
|        250|362155|
|        183|362155|
|        269|362155|
|        272|362155|
|        189|362155|
|        251|362155|
|        186|362155|
|        253|362155|
|        252|362155|
|        172|362155|
|        173|362155|
|        178|362155|
|        182|362155|
|        273|362155|
|        184|362155|
|        187|362155|
|        277|362156|
|        285|362156|
|        232|362156|
|        245|362156|
|        190|362156|
|        219|362156|
|        554|362156|
|        287|362156|
|        177|362156|
|        286|

                                                                                

In [4]:
# we will need a year column in this model
df_all = df_all.withColumn('year', F.year(df_all.start_timestamp))

In [8]:
df_all.printSchema()

root
 |-- ID: string (nullable = true)
 |-- start_timestamp: timestamp (nullable = true)
 |-- end_timestamp: timestamp (nullable = true)
 |-- seconds: integer (nullable = true)
 |-- miles: double (nullable = true)
 |-- pickup_tract: long (nullable = true)
 |-- dropoff_tract: long (nullable = true)
 |-- pickup_area: integer (nullable = true)
 |-- dropoff_area: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Tip: integer (nullable = true)
 |-- total: double (nullable = true)
 |-- pickup_lat: double (nullable = true)
 |-- pickup_lon: double (nullable = true)
 |-- dropoff_lat: double (nullable = true)
 |-- dropoff_lon: string (nullable = true)
 |-- month: integer (nullable = true)
 |-- day_of_month: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- year: integer (nullable = true)



## Next steps

I'm assuming we are predicting using the full dataset and not restricting ourselves to being within the program hours.

I started writing code that goes through the steps that I think will probably be necessary. The code is unfinished because I ran out of time to test in all or formally think through the problems I was seeing. Feel free to change things or make your own assumptions.

Here is the process that I was thinking of. I was trying all this on a sample dataframe so I could code faster.
1. Get Daily counts for each community area
2. pivot so that there is a column for each community area (y is when hyde park or woodlawn or kenwood are 1, otherwise the column is a feature)
3. merge with daily weather data
4. separate out y (counts for every day in program area) and X (column of counts for each community area outside of the program area)
5. filter for pre-program rides.
- Research which model works best and which one is the most parallelizable
6. create supervised model on all that data
7. predict the next month or so of counts after sept 29 2021
8. Graph predictions versus reality
9. maybe do the same thing in 2023 once data is available

In [5]:
# take a sample to test these operations out on first
sample_df = df_all.sample(fraction=1/1000000)

# get only the columns needed for the model
selected_columns = ["pickup_area","dropoff_area","day","month","year","ID"]
sample_selected = sample_df.select(selected_columns)


# group the rideshare data by day and community area and create counts
#sample_df = sample_df.groupby('day',"month","year",'pickup_area','dropoff_area').agg({'ID':'count'})
#sample_df.show(5)

In [21]:
# grouping by community area getting daily counts - one where the community is the pickup area
pickup_counts = sample_df.groupby('day', 'month', 'year', 'pickup_area').agg({'ID':'count'})

In [25]:
pickup_counts.select('count(ID)').distinct().show(5)

                                                                                

+---------+
|count(ID)|
+---------+
|        1|
|        2|
+---------+



In [26]:
dropoff_counts = sample_df.groupby('day', 'month', 'year', 'dropoff_area').agg({'ID':'count'})

In [27]:
dropoff_counts.select('count(ID)').distinct().show(5)

                                                                                

+---------+
|count(ID)|
+---------+
|        1|
|        2|
+---------+



**Daily counts for each community area**

we had to group by pickup area and dropoff area seperately- daily counts of number of trips to that particular community area when it was either a pickup or dropoff area

In [6]:
# Calculate daily counts for pickup areas
pickup_counts = sample_df.groupby('day', 'month', 'year', 'pickup_area').count().withColumnRenamed('count', 'pickup_count')
pickup_counts = pickup_counts.withColumnRenamed('pickup_area', 'area')

In [7]:
# Calculate daily counts for dropoff areas
dropoff_counts = sample_df.groupby('day', 'month', 'year', 'dropoff_area').count().withColumnRenamed('count', 'dropoff_count')
dropoff_counts = dropoff_counts.withColumnRenamed('dropoff_area', 'area')
#dropoff_counts.printSchema()

In [8]:
pickup_counts = pickup_counts.withColumn('dropoff_count', F.lit(0))
dropoff_counts = dropoff_counts.withColumn('pickup_count', F.lit(0))

# ensuring same column order
pickup_counts = pickup_counts.select('day', 'month', 'year', 'area', 'pickup_count', 'dropoff_count')
dropoff_counts = dropoff_counts.select('day', 'month', 'year', 'area', 'pickup_count', 'dropoff_count')

# Union the pickup and dropoff dataframes
combined_df = pickup_counts.union(dropoff_counts)

# Group by day, month, year, and area, summing up the counts
daily_counts_by_area = combined_df.groupby('day', 'month', 'year', 'area').sum('pickup_count', 'dropoff_count')

# the relatively smaller numbers are mostly a result of the sample size, should be fine when we 
# make it to the entire dataframe
daily_counts_by_area = daily_counts_by_area.withColumn('total_counts', F.col('sum(pickup_count)') + F.col('sum(dropoff_count)'))
daily_counts_by_area.drop('sum(pickup_count)','sum(dropoff_count)')
#daily_counts_by_area.show(10)

DataFrame[day: int, month: int, year: int, area: int, total_counts: bigint]

                                                                                

+---+-----+----+----+-----------------+------------------+------------+
|day|month|year|area|sum(pickup_count)|sum(dropoff_count)|total_counts|
+---+-----+----+----+-----------------+------------------+------------+
|  2|    5|2023|  29|                1|                 0|           1|
|  7|    2|2019|  28|                1|                 0|           1|
|  3|    9|2021|  32|                1|                 0|           1|
|  5|    1|2022|  38|                0|                 1|           1|
|  2|    6|2019|  28|                1|                 0|           1|
|  3|    4|2021|  32|                1|                 0|           1|
|  3|    1|2023|   4|                1|                 0|           1|
|  3|   11|2018|  15|                0|                 1|           1|
|  3|    3|2021|   6|                1|                 0|           1|
|  7|    7|2021|  43|                1|                 0|           1|
+---+-----+----+----+-----------------+------------------+------

Pivoting the dataset for community areas

In [9]:
#keep datetime as part of the pivot. 

# pivot so that each community area is a column
# one row for each day, each column represents a community area (with its entry being daily count of rides for that area).

# Pivot the DataFrame
pivoted_df = daily_counts_by_area.groupBy("day", "month", "year").pivot("area").sum("total_counts")

# Show the results
pivoted_df.show()

23/11/22 22:08:31 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|day|month|year|   1|   2|   3|   4|   5|   6|   7|   8|   9|  10|  11|  13|  14|  16|  19|  20|  21|  22|  23|  24|  25|  26|  27|  28|  29|  30|  31|  32|  33|  34|  35|  36|  38|  39|  41|  42|  43|  44|  46|  50|  52|  56|  59|  60|  61|  62|  63|  65|  66|  67|  68|  69|  70|  71|  72|  75|  76|  77|
+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|  2|    7|2021|null|null|null|null|null|null|null|null|null|null|null|null|nul

In [44]:
# the output of the sample df above looks off. investigate

#pivoted_df = sample_df.groupBy("day","month","year").pivot("dropoff_area").agg({"count": "first"})

                                                                                

AnalysisException: Cannot resolve column name "count" among (ID, start_timestamp, end_timestamp, seconds, miles, pickup_tract, dropoff_tract, pickup_area, dropoff_area, Fare, Tip, total, pickup_lat, pickup_lon, dropoff_lat, dropoff_lon, month, day_of_month, hour, day, year, Hyde_Park, Kenwood, Woodlawn)

In [None]:
# read in weather data, merge with rideshare data

In [11]:
df_weather_1 = spark.read.csv("gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2018-01-01 to 2020-01-01.csv", inferSchema=True, header=True)
df_weather_2 = spark.read.csv("gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2020-01-01 to 2022-08-31.csv", inferSchema=True, header=True)
df_weather_3 = spark.read.csv("gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2022-09-01 to 2022-12-31.csv", inferSchema=True, header=True)
# add 2023 data
df_weather = df_weather_1.union(df_weather_2).union(df_weather_3)

                                                                                

In [13]:
df_weather = df_weather.select('name', 'datetime', 'temp','precip','snow','snowdepth','sunset')
df_weather = df_weather.withColumn('datetime',F.to_date(df_weather['datetime'], "yyyy-MM-dd"))
df_weather.printSchema()
# name, datetime, temp, precip, snow, snowdepth, sunset.
# merge on datetime- keep datetime as part of the pivot. 

root
 |-- name: string (nullable = true)
 |-- datetime: date (nullable = true)
 |-- temp: double (nullable = true)
 |-- precip: double (nullable = true)
 |-- snow: double (nullable = true)
 |-- snowdepth: double (nullable = true)
 |-- sunset: timestamp (nullable = true)



In [None]:
# Create a new column 'date_of_rides' by combining day, month, and year columns
pivoted_df_with_date = pivoted_df.withColumn('date_of_rides', F.to_date(
    F.concat_ws('-', F.col('year'), F.col('month'), F.col('day')),
    'yyyy-MM-dd'
))

# Show the DataFrame with the new date column
pivoted_df_with_date.show()

