In [1]:
# Imports for the notebook.
# NOTE(review): within the cells visible here, `pd`, `time` and `fn` are never
# referenced — confirm whether they are still needed.
import pandas as pd
import numpy as np
import time
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn
import findspark
# Point findspark at the Spark installation under /usr/local/spark.
# NOTE(review): this call runs after the pyspark imports above, so it cannot
# affect which pyspark package those imports resolved to. findspark's documented
# usage is to call init() before importing pyspark — confirm the ordering here
# is intentional.
findspark.init('/usr/local/spark')

In [2]:
# Create (or reuse) the SparkSession for this notebook: YARN as the master,
# with explicit driver/executor memory settings and 16 executor instances.
spark = (
    SparkSession.builder
    .master("yarn")
    .config("spark.driver.memory", "16g")
    .config("spark.driver.maxResultSize", "12g")
    .config("spark.executor.instances", "16")
    .config("spark.executor.memory", "9G")
    .appName("feature engineering")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-05-30 16:47:48,134 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Show the session object as this cell's output.
spark

In [4]:
# Enable Apache Arrow for columnar data transfers between Spark and pandas
# (this notebook later collects a frame to the driver with to_pandas()).
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [5]:
# pandas API on Spark, bound to the conventional alias `ps`.
from pyspark import pandas as ps

# Use the "distributed" default index. The index values shown in later outputs
# (e.g. 77309411328) are large, non-consecutive ids, not 0..n-1 row numbers.
ps.set_option("compute.default_index_type", "distributed")



In [6]:
# Load the raw parquet dataset from HDFS, discard its "month" column,
# and display the resulting (rows, columns).
raw_parquet_dir = 'hdfs://nncluster:/data/etag/m05/parquet/'
df = ps.read_parquet(raw_parquet_dir).drop(columns=["month"])
df.shape

                                                                                

(219775680, 6)

In [7]:
# Load the holiday lookup table (a CSV on HDFS) and display its dimensions.
holiday_csv_path = 'hdfs://nncluster:/data/etag/m05/2015_2021_holiday.csv'
df_holiday = ps.read_csv(holiday_csv_path)
df_holiday.shape

                                                                                

(166, 3)

In [8]:
# Date_Start feature: the date of each record rendered as a "YYYYMMDD" string.
# NOTE(review): the Hour cell slices Time with x[-5:-3], which suggests Time
# also carries a time of day, while the format below only describes the date
# part — confirm the parser in use accepts the trailing time component.
parsed_time = ps.to_datetime(df["Time"], format="%Y/%m/%d")
df["Date_Start"] = parsed_time.dt.strftime('%Y%m%d')

In [9]:
# Rename some stations: rows recorded under the old ids 01F0509S / 01F0509N are
# rewritten to 01F0511S / 01F0511N, in both the start and the end station column.
station_renames = {'01F0509S': '01F0511S', '01F0509N': '01F0511N'}
for station_col in ('S-Station', 'E-Station'):
    for old_id, new_id in station_renames.items():
        df.loc[(df[station_col] == old_id), station_col] = new_id

In [10]:
# Segment feature: "<start station>-<end station>".
start_station = df["S-Station"]
end_station = df["E-Station"]
df["Segment"] = start_station + "-" + end_station

In [11]:
# delete diff direction and road
def filters(x):
    """Return 1 if a segment label should be kept, 0 if it should be dropped.

    ``x`` is a segment label of the form ``"<start station>-<end station>"``.
    The check is positional and assumes 8-character station ids, as in labels
    such as ``"01F0005S-01F0017S"``:

    * characters 0-1 must equal characters 9-10 (the leading two characters
      of each station id), and
    * character 7 must equal the last character (the trailing letter of each
      station id, which is the letter the Direction feature is derived from).

    Missing labels (``None`` or any non-string) and labels too short to hold
    the compared positions return 0 instead of raising, so one malformed row
    cannot abort the whole distributed job.
    """
    # Guard: indexing below needs at least 11 characters. Every string shorter
    # than that already evaluated to 0 in the positional comparison, except
    # the empty string, which used to raise IndexError.
    if not isinstance(x, str) or len(x) < 11:
        return 0

    same_prefix = x[:2] == x[9:11]
    same_suffix = x[7] == x[-1]
    return 1 if same_prefix and same_suffix else 0
    
# Flag every row with filters() on its Segment label, then keep only the rows
# whose flag is 1.
segment_flags = df['Segment'].apply(lambda segment: filters(segment))
df['Keep'] = segment_flags
df = df[df['Keep'] == 1]

In [12]:
# Direction feature: 0 when the start station id ends with "S", otherwise 1.
df["Direction"] = df["S-Station"].apply(
    lambda station_id: 1 if station_id[-1] != "S" else 0
)

                                                                                

In [13]:
# Hour feature: the two characters at positions [-5:-3] of Time, as an int.
# presumably Time ends in "HH:MM" — verify against the raw data.
df["Hour"] = df["Time"].apply(
    lambda time_text: int(time_text[-5:-3])
)

                                                                                

In [14]:
# Drop rows whose Speed is 0: mark them as missing first, then remove every
# row with a missing Speed (this also removes rows that were already missing).
zero_speed = df["Speed"] == 0
df.loc[zero_speed, "Speed"] = np.nan
df = df.dropna(subset=["Speed"])

In [16]:
# Aggregate to one row per (hour, date, segment, direction, vehicle type):
# Count is summed and Speed is averaged within each group.
group_keys = ['Hour', 'Date_Start', 'Segment', 'Direction', 'V-Type']
aggregations = {"Count": "sum", "Speed": "mean"}
df = df.groupby(group_keys).agg(aggregations).reset_index()

In [17]:
# Preview the first rows of the aggregated frame.
df.head()

                                                                                

Unnamed: 0,Hour,Date_Start,Segment,Direction,V-Type,Count,Speed
77309411328,11,20200808,03F1992N-03F1941N,1,31,1705,107.0
77309411329,11,20200808,03F2535S-03F2614S,0,5,45,91.272727
77309411330,11,20200808,05F0528N-05F0438N,1,42,8,75.5
77309411331,11,20200808,01F2425S-01F2483S,0,41,61,96.0
77309411332,11,20200808,03F0116N-03F0054N,1,5,61,82.833333


In [18]:
# Calendar features derived from Date_Start ("YYYYMMDD").
# "%w" numbers weekdays with 0 = Sunday ... 6 = Saturday, which is the
# convention the holiday-fill cell relies on when it tests Wday in [0, 6].
date = ps.to_datetime(df["Date_Start"], format="%Y%m%d")

for feature_name, directive in (("Year", "%Y"), ("Month", "%m"), ("Wday", "%w")):
    df[feature_name] = date.dt.strftime(directive).astype(int)

In [19]:
# Segment category feature: an integer code per distinct Segment label.
df["Seg_cat"] = df["Segment"].astype("category").cat.codes.astype(int)

                                                                                

In [20]:
# Prepare the holiday table for the join: cast the date to a string, discard
# the "_c0" column, and rename "Date Start" to "Date_Start".
# NOTE: this cell consumes "_c0" and "Date Start", so it cannot be re-run
# without first re-running the cell that loads df_holiday.
holiday_dates = df_holiday["Date Start"].astype(str)
df_holiday["Date Start"] = holiday_dates
df_holiday = (
    df_holiday
    .drop(columns=["_c0"])
    .rename(columns={'Date Start': 'Date_Start'})
)
df_holiday.head()

Unnamed: 0,Date_Start,Holiday
0,20150101,2
1,20150102,2
2,20150103,2
3,20150104,2
4,20150218,2


In [21]:
# Left-join the holiday table onto df by date, so every row of df is kept and
# rows whose date is absent from df_holiday get a missing Holiday value.
# The key is named explicitly: without `on`, merge joins on every column the
# two frames share, so a column later added to both would silently change the
# join.
df = df.merge(df_holiday, how="left", on="Date_Start")

In [22]:
# Fill Holiday for dates missing from the holiday table:
# weekend rows (Wday 0 or 6) get 1, every remaining missing value gets 0.
# Values that came from the holiday table are left as they are.
missing_holiday = df["Holiday"].isnull()
on_weekend = df["Wday"].isin([0, 6])
con = missing_holiday & on_weekend
df.loc[con, "Holiday"] = 1

df["Holiday"] = df["Holiday"].fillna(0)

In [23]:
# Order the frame by date, then segment code, then hour, and preview it.
sort_keys = ["Date_Start", "Seg_cat", "Hour"]
df = df.sort_values(by=sort_keys)
df.head()

                                                                                

Unnamed: 0,Hour,Date_Start,Segment,Direction,V-Type,Count,Speed,Year,Month,Wday,Seg_cat,Holiday
42949952686,0,20200101,01F0005S-01F0017S,0,32,90,85.333333,2020,1,3,0,0
51540406592,0,20200101,01F0005S-01F0017S,0,31,404,86.916667,2020,1,3,0,0
60129800706,0,20200101,01F0005S-01F0017S,0,41,7,82.333333,2020,1,3,0,0
77309944167,0,20200101,01F0005S-01F0017S,0,42,7,73.571429,2020,1,3,0,0
8590460694,1,20200101,01F0005S-01F0017S,0,32,103,85.25,2020,1,3,0,0


In [None]:
# Persist the engineered features to HDFS as parquet.
features_parquet_dir = "hdfs://nncluster:/user/joyefan/2020_AFE"
df.to_parquet(path=features_parquet_dir)

In [None]:
# Save CSV: reload the parquet output, collect it to the driver as a plain
# pandas DataFrame, and write it to a local CSV file.
saved_features_dir = "hdfs://nncluster:/user/joyefan/2020_AFE"
df = ps.read_parquet(saved_features_dir)

# Collects the entire frame into driver memory.
pdf = df.to_pandas()

# NOTE(review): to_csv also writes the index as an unnamed first column; with
# the "distributed" default index those values are arbitrary ids. Consider
# index=False if no downstream reader depends on that column — confirm first.
local_csv_name = '2020_AFE.csv'
pdf.to_csv(local_csv_name)