In [25]:
from pathlib import Path

from pyspark.sql import Column
from pyspark.sql import DataFrame
from pyspark.sql.session import SparkSession

# Memory budget handed to both the driver and the executor settings below.
MAX_MEMORY = "5g"

# Build (or reuse) a SparkSession that runs locally on all available cores.
spark = (
    SparkSession.builder.master("local[*]")
    .appName("TripAnaliysis")
    .config("spark.sql.adaptive.enabled", "true")
    # The key used to be misspelled "spark.excutor.memory"; an unknown key is
    # stored without complaint but never takes effect.
    # NOTE(review): with a local[*] master the work presumably runs inside the
    # driver JVM, so spark.driver.memory is likely the setting that matters.
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

In [26]:
import os
from pathlib import Path
from pyspark.sql import DataFrame
from pyspark.sql.session import SparkSession


class FileLoadInParquet:
    """Locate and load the Parquet files of one taxi type.

    Files are looked up under ``<parent of the current working
    directory>/data/<taxi_type>``.
    """

    def __init__(self, year: int, taxi_type: str) -> None:
        """Remember which data set to load.

        Args:
            year: Year label of the data set. It is stored on the instance but
                is not used when building the path or reading the files.
            taxi_type: Name of the sub-directory of ``data`` to read from.
        """
        self.year = year
        self.taxi_type = taxi_type

    def location(self) -> str:
        """Return the data directory for ``taxi_type`` as a string path."""
        return str(Path.cwd().parent.joinpath(f"data/{self.taxi_type}"))

    def read_parquet_data(self, spark: SparkSession) -> DataFrame:
        """Read everything inside the data directory as one DataFrame.

        Args:
            spark: Active session used to perform the read.

        Returns:
            The DataFrame produced by ``spark.read.parquet`` for the glob
            ``<location>/*``.
        """
        # Parquet file path. Use forward slashes and drop the leading slash so
        # the URI always has exactly three slashes after "file:" (an absolute
        # POSIX path used to yield "file:////...", a Windows path kept its
        # backslashes).
        data: str = Path(self.location()).as_posix().lstrip("/")
        return spark.read.parquet(f"file:///{data}/*")

In [27]:
from pyspark.sql.functions import col 
from pyspark.sql import functions as F

In [28]:
# Build the loader first, then read the YellowTaxi directory into a DataFrame.
yellow_taxi_loader = FileLoadInParquet(year=2019, taxi_type="YellowTaxi")
data = yellow_taxi_loader.read_parquet_data(spark)

                                                                                

In [29]:
# Display the column names of the loaded DataFrame.
data.columns

['vendor_name',
 'Trip_Pickup_DateTime',
 'Trip_Dropoff_DateTime',
 'Passenger_Count',
 'Trip_Distance',
 'Start_Lon',
 'Start_Lat',
 'Rate_Code',
 'store_and_forward',
 'End_Lon',
 'End_Lat',
 'Payment_Type',
 'Fare_Amt',
 'surcharge',
 'mta_tax',
 'Tip_Amt',
 'Tolls_Amt',
 'Total_Amt']

In [123]:
from pyspark.sql.types import IntegerType

def month_classification(name: str) -> Column:
    """Return the text before the first space of column ``name``.

    Despite its name, this function does not extract a month: it splits the
    column on a single space and keeps element 0.
    NOTE(review): presumably the input looks like ``"<date> <time>"`` so the
    result is the date part — confirm against the data.

    Args:
        name: Name of the column to split.

    Returns:
        A column expression (not a DataFrame) aliased ``"datetime"``.
    """
    return F.split(col(name), " ")[0].alias("datetime")
    
from pyspark.sql.functions import lit 
col_name = "Trip_Pickup_DateTime"


def _na_or_date_part(extract):
    """Return "NA" where the pickup value is "NA", else ``extract`` of its date part.

    Args:
        extract: A function such as ``F.month`` that maps a date column to a
            number.

    Returns:
        A column expression mixing the "NA" placeholder with the extracted
        number.
        NOTE(review): because one branch is text, the resulting column is
        presumably text as well — confirm for the Spark version in use.
    """
    return F.when(col(col_name) == lit("NA"), "NA").otherwise(
        extract(month_classification(col_name)).cast(IntegerType())
    )


# Count trips per (month, day) with missing pickup timestamps grouped under "NA".
# Only the pickup column is read, so only that column is filled; the former
# "year" column and the intermediate select never reached the result and were
# dropped.
null_check = (
    data.na.fill("NA", subset=[col_name])
    .withColumn("month", _na_or_date_part(F.month))
    # "day" holds the day-of-week number from F.dayofweek, not the day of month.
    .withColumn("day", _na_or_date_part(F.dayofweek))
    .groupBy(col("month"), col("day"))
    .agg(F.count("*").alias("count"))
    .cache()
)

In [124]:
# Drop the groups whose month is the "NA" placeholder.
na_delect = null_check.where(F.col("month") != F.lit("NA"))

In [125]:
# "month" and "day" share their column with the "NA" placeholder, so they are
# presumably stored as text; sorting them as-is would order months as
# 1, 10, 11, 12, 2, ... Sort on the integer value instead (the cast is a no-op
# if the columns are already integers).
ordered_counts = na_delect.orderBy(
    col("month").cast("int").asc(),
    col("day").cast("int").asc(),
)

# to_csv returns None when it is given a path, so data_saving is always None;
# the name is kept in case a later cell refers to it.
data_saving = ordered_counts.toPandas().to_csv("2019_null_checking.csv", index=False)

                                                                                

In [126]:
# Shut down the SparkSession; later cells cannot use `spark` after this point.
spark.stop()