In [1]:
import polars as pd
import time

In [2]:
def pd_read_csv(path):
    """
    Load the CSV file at `path` into a dataframe.

    Thin wrapper around `pd.read_csv` called with its default options.
    `pd` is the alias bound in the notebook's import cell, which imports
    polars under that name, so the result is a polars DataFrame.
    """
    return pd.read_csv(path)

def pd_read_parquet(path):
    """
    Load the parquet file at `path` into a dataframe.

    Thin wrapper around `pd.read_parquet` called with its default options.
    `pd` is the alias bound in the notebook's import cell, which imports
    polars under that name, so the result is a polars DataFrame.
    """
    return pd.read_parquet(path)

In [3]:
def mean_test_speed_pd(df_pd):
    """
    Compute the mean `trip_distance` for every `PULocationID`.

    Parameters
    ----------
    df_pd : polars DataFrame
        Must contain the columns `PULocationID` and `trip_distance`.

    Returns
    -------
    polars DataFrame
        One row per distinct `PULocationID` with the columns
        `PULocationID`, `trip_distance` (the group mean) and
        `PULocationID_column` (the group key cast to Int64).
    """
    trips = df_pd.select(['PULocationID', 'trip_distance'])

    # Newer polars releases expose `group_by`; older ones (such as 0.16.x)
    # only have `groupby`. Use whichever this installation provides.
    group_by = trips.group_by if hasattr(trips, 'group_by') else trips.groupby
    means = group_by('PULocationID').agg([pd.col('trip_distance').mean()])

    # Build the Int64 key column directly from the group key instead of
    # averaging a duplicated key column and casting the float back.
    return means.with_columns(
        [pd.col('PULocationID').cast(pd.Int64).alias('PULocationID_column')]
    )

def endwith_test_speed_pd(df_pd):
    """
    Keep only the rows whose `Zone` value ends with 'East'.

    Rows containing a null in any column are dropped before filtering.
    Uses the polars API (`drop_nulls`, `filter`, `str.ends_with`), matching
    the library the notebook binds to `pd`.

    Parameters
    ----------
    df_pd : polars DataFrame
        Must contain a string column named `Zone`.

    Returns
    -------
    polars DataFrame
        A new, filtered frame; the input frame is not modified.
    """
    without_nulls = df_pd.drop_nulls()
    return without_nulls.filter(pd.col('Zone').str.ends_with('East'))

In [4]:
def loading_into_parquet(df_pd):
    """
    Save the dataframe as a parquet file in the current working directory.

    The file name embeds the version of the library bound to `pd`:
    'yellow_tripdata_2021-01_<version>.parquet'.

    Parameters
    ----------
    df_pd : polars DataFrame
        Frame to persist.
    """
    # polars frames are written with `write_parquet`; `to_parquet` is the
    # pandas method name and is not available on a polars DataFrame.
    df_pd.write_parquet(f'yellow_tripdata_2021-01_{pd.__version__}.parquet')

def loading_into_csv(df_pd):
    """
    Save the dataframe as a CSV file.

    The frame is written to 'sample_polars.csv', relative to the current
    working directory, via its `write_csv` method.
    """
    target = 'sample_polars.csv'
    df_pd.write_csv(target)

In [5]:
def main(
    trips_path=r"D:\Twitter_n_Insta\pandas_polars_pandas2\yellow_tripdata_2021-01.csv",
    zones_path=r"D:\Twitter_n_Insta\pandas_polars_pandas2\taxi+_zone_lookup.csv",
):
    """
    Run the extract / transform / load benchmark and print stage timings.

    Extract reads the trips CSV and the zone lookup CSV. Transform averages
    `trip_distance` per pickup location, inner-joins the result with the
    zone lookup and keeps `Borough`, `Zone` and `trip_distance`. Load
    writes the result through `loading_into_csv`.

    Parameters
    ----------
    trips_path : str
        Location of the trips CSV. The default is the machine-specific
        path originally hard-coded in this notebook.
    zones_path : str
        Location of the zone lookup CSV. The default is the
        machine-specific path originally hard-coded in this notebook.
    """
    print(f'Starting ETL for Polars version {pd.__version__}')
    start_time = time.perf_counter()

    print('Extracting...')
    df_trips = pd_read_csv(trips_path)
    df_zone = pd_read_csv(zones_path)
    end_extract = time.perf_counter()
    time_extract = end_extract - start_time
    print(f'Extraction csv end in {round(time_extract,3)} seconds')

    print('Transforming...')
    df_trips = mean_test_speed_pd(df_trips)
    df = df_trips.join(
        df_zone,
        how="inner",
        left_on="PULocationID_column",
        right_on="LocationID",
    )
    df = df.select(["Borough", "Zone", "trip_distance"])
    # Optional extra filter, currently disabled:
    # df = endwith_test_speed_pd(df)
    end_transform = time.perf_counter()
    # Reuse the timestamp taken above so the transform and load intervals
    # share one boundary instead of reading the clock twice.
    time_transformation = end_transform - end_extract
    print(f'Transformation end in {round(time_transformation,3)} seconds')

    print('Loading...')
    loading_into_csv(df)
    time_loading = time.perf_counter() - end_transform
    print(f'Loading end in {round(time_loading,3)} seconds')

    # The closing banner names the same library as the opening banner.
    print(f'End ETL for Polars version {pd.__version__}')

In [6]:
# Execute the ETL pipeline; each stage prints its elapsed time as it runs.
main()

Starting ETL for Polars version 0.16.16
Extracting...
Extraction csv end in 0.215 seconds
Transforming...
Transformation end in 0.133 seconds
Loading...
Loading end in 0.002 seconds
End ETL for Pandas version 0.16.16
