In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
spark = SparkSession.builder.appName('2021EDA').getOrCreate()

#change configuration settings on Spark 
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '4g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','4g')])

#print spark configuration settings
spark.sparkContext.getConf().getAll()

[('spark.stage.maxConsecutiveAttempts', '10'),
 ('spark.dynamicAllocation.minExecutors', '1'),
 ('spark.eventLog.enabled', 'true'),
 ('spark.submit.pyFiles',
  '/root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,/root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,/root/.ivy2/jars/com.typesafe_config-1.4.2.jar,/root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,/root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,/root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,/root/.ivy2/jars/com.navigamez_greex-1.0.jar,/root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,/root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,/root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,/root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,/root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,/root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-

In [31]:
df_2021 = spark.read.csv("gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/2021", inferSchema=True, header=True)
# figure out how to read in shp file msca-bdp-student-gcs/bdp-rideshare-project/neighborhoods/shp files
df_weather = spark.read.csv("gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2020-01-01 to 2022-08-31.csv", inferSchema=True, header=True)
df_2021.printSchema()
df_weather.printSchema()

[Stage 180:>                                                        (0 + 1) / 1]

root
 |-- Trip ID: string (nullable = true)
 |-- Trip Start Timestamp: string (nullable = true)
 |-- Trip End Timestamp: string (nullable = true)
 |-- Trip Seconds: integer (nullable = true)
 |-- Trip Miles: double (nullable = true)
 |-- Pickup Census Tract: long (nullable = true)
 |-- Dropoff Census Tract: long (nullable = true)
 |-- Pickup Community Area: integer (nullable = true)
 |-- Dropoff Community Area: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Tip: integer (nullable = true)
 |-- Additional Charges: string (nullable = true)
 |-- Trip Total: double (nullable = true)
 |-- Shared Trip Authorized: boolean (nullable = true)
 |-- Trips Pooled: integer (nullable = true)
 |-- Pickup Centroid Latitude: double (nullable = true)
 |-- Pickup Centroid Longitude: double (nullable = true)
 |-- Pickup Centroid Location: string (nullable = true)
 |-- Dropoff Centroid Latitude: double (nullable = true)
 |-- Dropoff Centroid Longitude: double (nullable = true)
 |-- Dropof

                                                                                

In [4]:
#display number of records by partition
def displaypartitions(df):
    #number of records by partition
    num = df.rdd.getNumPartitions()
    print("Partitions:", num)
    df.withColumn("partitionId", F.spark_partition_id())\
        .groupBy("partitionId")\
        .count()\
        .orderBy(F.asc("count"))\
        .show(num)

df_2021.rdd.getNumPartitions()

6

In [5]:
displaypartitions(df_2021)

Partitions: 6




+-----------+------+
|partitionId| count|
+-----------+------+
|          5| 69847|
|          4|523726|
|          3|527064|
|          1|527581|
|          2|528719|
|          0|531719|
+-----------+------+



                                                                                

In [32]:
df_2021 = df_2021.repartition(10)

In [7]:
displaypartitions(df_2021)



Partitions: 10




+-----------+------+
|partitionId| count|
+-----------+------+
|          5|270864|
|          0|270865|
|          6|270865|
|          7|270865|
|          1|270866|
|          4|270866|
|          8|270866|
|          9|270866|
|          2|270866|
|          3|270867|
+-----------+------+



                                                                                

In [8]:
df_2021.describe().show()



+-------+----------------+--------------------+--------------------+-----------------+-----------------+--------------------+--------------------+---------------------+----------------------+------------------+------------------+------------------+------------------+--------------------+------------------------+-------------------------+------------------------+-------------------------+--------------------------+-------------------------+
|summary|         Trip ID|Trip Start Timestamp|  Trip End Timestamp|     Trip Seconds|       Trip Miles| Pickup Census Tract|Dropoff Census Tract|Pickup Community Area|Dropoff Community Area|              Fare|               Tip|Additional Charges|        Trip Total|        Trips Pooled|Pickup Centroid Latitude|Pickup Centroid Longitude|Pickup Centroid Location|Dropoff Centroid Latitude|Dropoff Centroid Longitude|Dropoff Centroid Location|
+-------+----------------+--------------------+--------------------+-----------------+-----------------+--------

                                                                                

In [9]:
#Find the number of missing values for each column
from pyspark.sql.functions import isnan, when, count, col
df_2021.select([count(when(df_2021[c].isNull(), c)).alias(c) for c in df_2021.columns]).show()



+-------+--------------------+------------------+------------+----------+-------------------+--------------------+---------------------+----------------------+----+---+------------------+----------+----------------------+------------+------------------------+-------------------------+------------------------+-------------------------+--------------------------+-------------------------+
|Trip ID|Trip Start Timestamp|Trip End Timestamp|Trip Seconds|Trip Miles|Pickup Census Tract|Dropoff Census Tract|Pickup Community Area|Dropoff Community Area|Fare|Tip|Additional Charges|Trip Total|Shared Trip Authorized|Trips Pooled|Pickup Centroid Latitude|Pickup Centroid Longitude|Pickup Centroid Location|Dropoff Centroid Latitude|Dropoff Centroid Longitude|Dropoff Centroid Location|
+-------+--------------------+------------------+------------+----------+-------------------+--------------------+---------------------+----------------------+----+---+------------------+----------+----------------------

                                                                                

In [10]:
# number of observations with all the data in each column
df_2021.dropna(how='any').count()

                                                                                

921081

In [11]:
# Working with just data that contains full information and check for dupes
df_2021 = df_2021.dropna(how='any')
df_2021 = df_2021.dropDuplicates()

In [16]:
# Drop columns unlikely to be useful for analysis for speed of computation and rename columns to remove spacing for ease of code writing
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

df_2021 = df_2021.drop('Trip Seconds','Trips Pooled','Additional Charges','Shared Trip Authorized')
df_2021 = df_2021.withColumnRenamed("Trip ID","ID").withColumnRenamed("Trip Start Timestamp","start_timestamp").withColumnRenamed("Trip End Timestamp","end_timestamp").withColumnRenamed("Trip Miles",\
                "miles").withColumnRenamed("Pickup Census Tract","pickup_tract").withColumnRenamed("Dropoff Census Tract","dropoff_tract").withColumnRenamed("Pickup Community Area","pickup_area"\
                ).withColumnRenamed("Dropoff Community Area","dropoff_area").withColumnRenamed("Trip Total","total").withColumnRenamed("Pickup Centroid Latitude","pickup_lat").withColumnRenamed(\
                "Pickup Centroid Longitude","pickup_lon").withColumnRenamed("Pickup Centroid Location","pickup_location").withColumnRenamed("Dropoff Centroid Latitude","dropoff_lat").withColumnRenamed(\
                "Dropoff Centroid Longitude","dropoff_lon").withColumnRenamed("Dropoff Centroid Location","dropoff_location")
# fix datatypes
df_2021 = df_2021.withColumn('start_timestamp', F.to_timestamp(df_2021['start_timestamp'], 'MM/DD/YYYY HH:mm:ss AM/PM')).withColumn('end_timestamp', F.to_timestamp(df_2021['end_timestamp'], 'MM/DD/YYYY HH:mm:ss AM/PM'))
df_weather = df_weather.withColumn('datetime',F.to_date(df_weather['datetime'], "MM/dd/yyyy"))


In [23]:
df_2021 = df_2021.withColumn('month', F.month(df_2021.start_timestamp))

In [17]:
df_2021.show(5)



+--------------------+-------------------+-------------------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+--------------------+-------------+--------------+--------------------+
|                  ID|    start_timestamp|      end_timestamp|miles|pickup_tract|dropoff_tract|pickup_area|dropoff_area|Fare|Tip|total|   pickup_lat|    pickup_lon|     pickup_location|  dropoff_lat|   dropoff_lon|    dropoff_location|
+--------------------+-------------------+-------------------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+--------------------+-------------+--------------+--------------------+
|960742b2998c7908e...|2021-01-04 01:45:00|2021-01-04 02:00:00|  5.9| 17031063000|  17031830600|          6|           1|15.0|  0|16.23|41.9363101308|-87.6515625922|POINT (-87.651562...|42.0016981937|-87.6735740325|POINT (-87.673574...|
|97d44c73c92c79cc4...|2021-01-04 04:30:00|2021-01-04 05:

                                                                                

In [20]:
df_weather = df_weather.withColumn('datetime',F.to_date(df_weather['datetime'], "yyyy-mm-dd"))
df_weather.show(5)

+-------+----------+-------+-------+----+------------+------------+---------+----+--------+------+----------+-----------+----------+----+---------+--------+---------+-------+----------------+----------+----------+--------------+-----------+-------+----------+-------------------+-------------------+---------+--------------------+--------------------+-----------------+--------------------+
|   name|  datetime|tempmax|tempmin|temp|feelslikemax|feelslikemin|feelslike| dew|humidity|precip|precipprob|precipcover|preciptype|snow|snowdepth|windgust|windspeed|winddir|sealevelpressure|cloudcover|visibility|solarradiation|solarenergy|uvindex|severerisk|            sunrise|             sunset|moonphase|          conditions|         description|             icon|            stations|
+-------+----------+-------+-------+----+------------+------------+---------+----+--------+------+----------+-----------+----------+----+---------+--------+---------+-------+----------------+----------+----------+-----

In [24]:
hp_census_tracts_2010_2020 = [17031411000,17031410900,17031410100,17031411100,17031410800,17031410200,17031410700,17031411200,17031836200,17031410600,17031836300,17031410500,
                    17031410300,17031410400,17031410600,17031410800,17031411300,17031411400]
df_hp = df_2021.filter((df_2021.pickup_tract.isin(hp_census_tracts_2010_2020)) & (df_2021.dropoff_tract.isin(hp_census_tracts_2010_2020)))

In [25]:
df_hp.show(5)



+--------------------+-------------------+-------------------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+--------------------+-------------+--------------+--------------------+-----+
|                  ID|    start_timestamp|      end_timestamp|miles|pickup_tract|dropoff_tract|pickup_area|dropoff_area|Fare|Tip|total|   pickup_lat|    pickup_lon|     pickup_location|  dropoff_lat|   dropoff_lon|    dropoff_location|month|
+--------------------+-------------------+-------------------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+--------------------+-------------+--------------+--------------------+-----+
|edb44ac5f0ca61fcd...|2021-01-17 04:15:00|2021-01-17 04:15:00|  1.4| 17031410800|  17031836200|         41|          41| 5.0|  0|  8.1|41.7979652088|-87.5896070309|POINT (-87.589607...|41.7904693995|-87.6012851221|POINT (-87.601285...|    1|
|a6ff5154d2a4e8b45...|2021-01-22

                                                                                

In [27]:
df_hp.groupby("month").agg({'ID':'count'}).orderBy(F.col('month').asc()).toPandas() #.plot(x="month",y="count(ID)")

                                                                                

Unnamed: 0,month,count(ID)
0,1,1825


In [28]:
df_2021.count()

                                                                                

921081

In [29]:
df_hp.count()

                                                                                

1825

In [30]:
df_2021.select('month').distinct().show()
df_hp.select('month').distinct().show()


                                                                                

+-----+
|month|
+-----+
|    1|
+-----+



                                                                                

+-----+
|month|
+-----+
|    1|
+-----+

