In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, TimestampType, LongType
from pyspark.sql import Row
from pyspark.sql.types import *

In [7]:
from pyspark.sql.functions import to_timestamp

In [28]:
from pyspark.sql.functions import lit

In [2]:
spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()

In [3]:
spark.conf.set("mapreduce.fileoutputcommitter.algorithm.version", "2")

In [4]:
spark.version

'3.0.1'

In [5]:
df = spark.read.csv("cv19_data/202011-citibike-tripdata.csv",header=True)

# Get Date From Time Stamp Column

In [12]:
# test turning string into timestamp
df.limit(5).select(df.stoptime.cast(TimestampType()).alias('datetime')).collect()

[Row(datetime=datetime.datetime(2020, 11, 1, 0, 8, 48, 301000)),
 Row(datetime=datetime.datetime(2020, 11, 1, 0, 8, 23, 317000)),
 Row(datetime=datetime.datetime(2020, 11, 1, 0, 33, 14, 164000)),
 Row(datetime=datetime.datetime(2020, 11, 1, 0, 39, 57, 3000)),
 Row(datetime=datetime.datetime(2020, 11, 1, 0, 3, 2, 898000))]

## Get Stop Time

In [10]:
# turn stoptime column to timestamp from string
df = df.withColumn("stoptime_ts",to_timestamp(col("stoptime")))

## Get Start Time

In [14]:
# turn stoptime column to timestamp from string
df = df.withColumn("starttime_ts",to_timestamp(col("starttime")))

# Save To Correct Column Order

In [16]:
df.columns

['tripduration',
 'starttime',
 'stoptime',
 'start station id',
 'start station name',
 'start station latitude',
 'start station longitude',
 'end station id',
 'end station name',
 'end station latitude',
 'end station longitude',
 'bikeid',
 'usertype',
 'birth year',
 'gender',
 'stoptime_ts',
 'starttime_ts']

In [17]:
# select columns to drop
columns_to_drop = ['tripduration',
 'starttime',
 'stoptime',
 'start station id',
 'start station latitude',
 'start station longitude',
 'end station id',
 'end station latitude',
 'end station longitude']

In [23]:
df = df.drop(*columns_to_drop)

In [26]:
df

DataFrame[start station name: string, end station name: string, bikeid: string, usertype: string, birth year: string, gender: string, stoptime_ts: timestamp, starttime_ts: timestamp]

# Turn Stop Time To date

In [30]:
# # correct code
# df.select(date_format('stoptime_ts', 'MM/dd/yyy').alias('date')).collect()

In [31]:
# create date column
df = df.withColumn("stoptime_date",date_format('stoptime_ts','MM/dd/yyy'))

## Turn Stop Time to Time

In [33]:
# create time column
df = df.withColumn('stoptime_time', date_format('stoptime_ts', 'HH:mm:ss'))

# Group By Date, Hour, Station From, Station End, Bikeid Count

### Create Week Column

In [35]:
df.dtypes

[('start station name', 'string'),
 ('end station name', 'string'),
 ('bikeid', 'string'),
 ('usertype', 'string'),
 ('birth year', 'string'),
 ('gender', 'string'),
 ('stoptime_ts', 'timestamp'),
 ('starttime_ts', 'timestamp'),
 ('stoptime_date', 'string'),
 ('stoptime_time', 'string')]

In [47]:
from pyspark.sql.functions import hour, count, dayofyear

In [60]:
final_df = (df.groupBy("stoptime_date", hour("stoptime_ts").alias("hour"), "start station name", "end station name")
    .agg(count("bikeid").alias("no_of_trips")))

In [61]:
# check size of dataframe
print(final_df.count(), len(df.columns))

1560703 10


In [67]:
## Rename columns
final_df = final_df.withColumnRenamed("start station name", "start_station_name") \
    .withColumnRenamed("end station name", "end_station_name")

In [62]:
## Group By Station Info, Hour, and Dayofyear
# (df.groupBy("end station name", dayofyear("stoptime_ts").alias("date"), hour("stoptime_ts").alias("hour"))
#     .agg(count("bikeid").alias("count"))
#     .show())

# Test

In [68]:
final_df.limit(5).toPandas()

Unnamed: 0,stoptime_date,hour,start_station_name,end_station_name,no_of_trips
0,11/01/2020,0,E 4 St & 2 Ave,36 Ave & 31 St,1
1,11/01/2020,0,Washington Park,Central Ave & Starr Street,1
2,11/01/2020,0,Spruce St & Nassau St,E 7 St & Avenue A,1
3,11/01/2020,1,W 25 St & 6 Ave,Christopher St & Greenwich St,1
4,11/01/2020,1,1 Ave & E 16 St,10 Ave & W 28 St,1
