In [7]:
!pip install pyspark
!pip install kafka-python
from pyspark.sql import SparkSession



In [11]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.getOrCreate()

# Approximate fraction of rows to keep in order to get ~2 GB out of ~27 GB.
sample_size = 2 / 27

# The column types are inferred once, from the first monthly file, and then
# reused for the remaining files. With inferSchema enabled Spark needs an extra
# pass over the input, so inferring separately for all 12 files would scan the
# whole data set an additional time just to detect types.
# NOTE(review): this assumes every monthly file has the same column layout as
# trip_data_1.csv — confirm before relying on it for new data.
trip_schema = None
sampled_df = None
for i in range(1, 13):
    path = f"input/trip_data/trip_data_{i}.csv"
    reader = spark.read.option("header", True)
    if trip_schema is None:
        df = reader.option("inferSchema", True).csv(path)
        trip_schema = df.schema
    else:
        df = reader.schema(trip_schema).csv(path)
    # Random sample without replacement from each file (~7.4% of its rows);
    # the fixed seed keeps the sample reproducible between runs.
    sampled_part = df.sample(withReplacement=False, fraction=sample_size, seed=80085)
    if sampled_df is None:
        sampled_df = sampled_part
    else:
        sampled_df = sampled_df.union(sampled_part)
# Each monthly file is sampled individually and the samples are unioned
# (by column position) into a single DataFrame, sampled_df.

In [12]:
# Print the column names, types and nullability of the combined sample.
sampled_df.printSchema()

root
 |-- medallion: string (nullable = true)
 |-- hack_license: string (nullable = true)
 |-- vendor_id: string (nullable = true)
 |-- rate_code: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_time_in_secs: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)



In [13]:
# Preview: take at most 20 rows and convert them to a pandas DataFrame so the
# notebook renders them as a table.
sampled_df.limit(20).toPandas()

Unnamed: 0,medallion,hack_license,vendor_id,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,passenger_count,trip_time_in_secs,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,312E0CB058D7FC1A6494EDB66D360CD2,7B5156F38990963332B33298C8BAE25E,CMT,1,N,2013-01-05 11:54:49,2013-01-05 12:03:48,1,539,0.8,-73.977127,40.74831,-73.990913,40.751053
1,0F9E0728AB1E40D5CEB0C6EDBF805CCB,8434E8A33D8C0150573FAA00B8A9ABF5,CMT,1,N,2013-01-05 19:04:43,2013-01-05 19:13:58,1,555,2.8,-73.966682,40.761139,-73.938515,40.792332
2,4A4DA06C65CFA356CD538AF4B899430E,7CE1605151178F02B4EBBD8472C86A13,CMT,1,N,2013-01-02 08:12:21,2013-01-02 08:24:43,1,742,2.5,-73.99115,40.750023,-73.952614,40.74115
3,02BFD1B64C2C80B433D3A282C828912B,375201058CFE70CE40B69903041C2310,CMT,1,N,2013-01-01 19:41:43,2013-01-01 19:49:17,2,453,2.1,-73.984734,40.76931,-73.997787,40.744205
4,1944EB168702ED8E6B9FD94E988D0197,1D24DD6F12731B59E72DD7CF387F5011,CMT,1,N,2013-01-08 08:30:16,2013-01-08 08:41:03,1,647,0.8,-73.976288,40.765472,-73.975105,40.757393
5,D5367E940A20B9D2550BF7CF7AE01681,5B34A2589D1D2106FC0C47564A4833F0,CMT,1,N,2013-01-08 10:08:49,2013-01-08 10:18:43,2,594,1.6,-73.972366,40.762138,-73.957542,40.782902
6,37BDEA2E54A3B70CBFA5B0D1C3A75FE2,730A179A7F97126B694CABFB93CC3A0C,CMT,1,N,2013-01-08 07:49:28,2013-01-08 07:57:15,2,467,2.6,-74.00679,40.754345,-73.980316,40.784081
7,251012565308E6E0E67550CFBBD253AB,8B204068A2EB2F13F762C05AC1F0B075,CMT,1,N,2013-01-05 19:52:03,2013-01-05 20:08:02,4,959,3.5,-73.999146,40.734352,-73.983322,40.773598
8,1E65B7E2D1297CF3B2CA87888C05FE43,F9ABCCCC4483152C248634ADE2435CF0,VTS,1,,2013-01-13 04:26:00,2013-01-13 04:46:00,1,1200,2.46,-73.956451,40.771442,-73.972733,40.74345
9,90D83E0D0B4FF8DE2923C2977EF22C36,92153937578731DA2B1EC83D91E7FA3E,VTS,1,,2013-01-13 04:37:00,2013-01-13 04:44:00,2,420,2.21,-73.998657,40.74015,-73.98526,40.763435


In [14]:
# Persist the sample as CSV with a header row, replacing any previous output.
# coalesce(1) reduces the data to a single partition so one part file is written;
# Spark treats the given path as an output directory that holds that part file.
# NOTE(review): the target folder is "input/trip data" (with a space) while the
# monthly inputs are read from "input/trip_data" — confirm this is intentional.
(
    sampled_df.coalesce(1)
    .write
    .csv("input/trip data/trip_data_sampled.csv", mode="overwrite", header=True)
)