In [None]:
import pyspark

In [None]:
### Student Section - Please complete all the functions below

In [None]:
#export
def load_data(gcp_storage_path):
    """
    Read a CSV file from a GCP storage bucket into a Spark DataFrame.

    The first row of the file is used as the header. No schema is supplied
    and schema inference is not requested, so Spark's CSV reader returns
    every column as a string column.

    :param gcp_storage_path: string (full gs path including file name,
        e.g. gs://bucket_name/data.csv)
    :return: spark dataframe
    """
    # Imported locally so the function does not depend on a `spark` global
    # injected by the notebook environment; this file only imports `pyspark`.
    from pyspark.sql import SparkSession

    # getOrCreate() hands back the already-active session when one exists
    # (for example the one a managed notebook kernel starts) and only builds
    # a new session otherwise.
    session = SparkSession.builder.getOrCreate()

    return session.read.csv(path=gcp_storage_path, header=True)

In [None]:
#export
def exclude_no_pickuplocations(df):
    """
    Drop trips that have no pickup location.

    A trip is treated as having no pickup location when ``pulocationid``
    is null or equal to zero.

    :param df: spark dataframe of NYC taxi trips
    :return: spark dataframe holding only trips with a pickup location
    """
    # Registers the incoming frame as the temporary view "df" before filtering.
    df.createOrReplaceTempView("df")

    has_pickup_location = "pulocationid is not null and pulocationid != 0"
    trips_with_pickup = df.filter(has_pickup_location)
    return trips_with_pickup
    

#### Function to exclude trips with no distance

In [None]:
#export
def exclude_no_tripdistance(df):
    """
    Drop trips that have no trip distance.

    A trip is treated as having no distance when ``trip_distance`` is null
    or numerically equal to zero.

    :param df: spark dataframe of NYC taxi trips
    :return: spark dataframe holding only trips with a non-zero distance
    """
    # Registers the incoming frame as the temporary view "df" before filtering.
    df.createOrReplaceTempView("df")

    # The distance is compared as a double on purpose. When the column is a
    # string (e.g. a CSV read without a schema), Spark's default coercion for
    # `string != 0` casts the string to an integer, which truncates "0.50" to
    # 0 and would wrongly discard every trip shorter than one mile.
    has_distance = (
        "trip_distance is not null and cast(trip_distance as double) != 0"
    )
    return df.filter(has_distance)

In [None]:
#export
def include_fare_range(df):
    """
    Keep only trips whose fare lies in the inclusive range 20 to 60 dollars.

    :param df: spark dataframe of NYC taxi trips
    :return: spark dataframe holding only trips with 20 <= fare_amount <= 60
    """
    # Registers the incoming frame as the temporary view "df" before filtering.
    df.createOrReplaceTempView("df")

    # The fare is compared as a double on purpose. When the column is a
    # string (e.g. a CSV read without a schema), Spark's default coercion for
    # `string <= 60` casts the string to an integer, which truncates "60.50"
    # to 60 and would wrongly keep fares above the upper bound.
    fare_in_range = (
        "cast(fare_amount as double) >= 20 "
        "and cast(fare_amount as double) <= 60"
    )
    return df.filter(fare_in_range)

In [None]:
#export
def get_highest_tip(df):
    """
    Find the largest tip amount across all trips.

    ``tip_amount`` is cast to ``decimal(38, 10)`` before aggregating so the
    value never passes through a float.

    :param df: spark dataframe of NYC taxi trips
    :return: decimal (rounded to 2 digits)  (NOTE: DON'T USE FLOAT)
    """
    decimal_type = "decimal(38, 10)"

    # Registers the incoming frame as the temporary view "df" before aggregating.
    df.createOrReplaceTempView("df")

    # Convert the tip column to a decimal, then take the global maximum.
    tip_as_decimal = df["tip_amount"].cast(decimal_type)
    typed_trips = df.withColumn("tip_amount", tip_as_decimal)
    peak = typed_trips.groupby().max("tip_amount")
    peak = peak.withColumnRenamed("max(tip_amount)", "max_tip_amount")

    # Re-apply the decimal type to the aggregated column.
    peak_as_decimal = peak["max_tip_amount"].cast(decimal_type)
    peak = peak.withColumn("max_tip_amount", peak_as_decimal)

    # The aggregate has a single row with a single column.
    first_row = peak.collect()[0]
    return round(first_row[0], 2)

In [None]:
#export
def get_total_toll(df):
    """
    Add up the toll amount across all trips.

    ``tolls_amount`` is cast to ``decimal(38, 10)`` before aggregating so the
    value never passes through a float.

    :param df: spark dataframe of NYC taxi trips
    :return: decimal (rounded to 2 digits)  (NOTE: DON'T USE FLOAT)
    """
    decimal_type = "decimal(38, 10)"

    # Registers the incoming frame as the temporary view "df" before aggregating.
    df.createOrReplaceTempView("df")

    # Convert the toll column to a decimal, then take the global sum.
    toll_as_decimal = df["tolls_amount"].cast(decimal_type)
    typed_trips = df.withColumn("tolls_amount", toll_as_decimal)
    total = typed_trips.groupby().sum("tolls_amount")
    total = total.withColumnRenamed("sum(tolls_amount)", "sum_tolls_amount")

    # Re-apply the decimal type to the aggregated column.
    total_as_decimal = total["sum_tolls_amount"].cast(decimal_type)
    total = total.withColumn("sum_tolls_amount", total_as_decimal)

    # The aggregate has a single row with a single column.
    first_row = total.collect()[0]
    return round(first_row[0], 2)

In [None]:
# gcp_storage_path = "gs://hhong96/yellow_tripdata09-08-2021.csv"
# df = load_data(gcp_storage_path)
# df.printSchema()

#### Print total number of rows in the dataframe

In [None]:
# df.count()

#### Print total number of rows in the dataframe after excluding trips with no pickup location

In [None]:
# df_no_pickup_locations = exclude_no_pickuplocations(df)
# df_no_pickup_locations.count()

#### Print total number of rows in the dataframe after excluding trips with no distance

In [None]:
# df_no_trip_distance = exclude_no_tripdistance(df_no_pickup_locations)
# df_no_trip_distance.count()

#### Print total number of rows in the dataframe after including trips with a fare amount in the range of 20 to 60 dollars

In [None]:
# df_include_fare_range = include_fare_range(df_no_trip_distance)
# df_include_fare_range.count()

#### Print the highest tip amount

In [None]:
# max_tip = get_highest_tip(df_include_fare_range)
# print(max_tip)

#### Print the total toll amount

In [None]:
# total_toll = get_total_toll(df_include_fare_range)
# print(total_toll)