In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, TimestampType, LongType
from pyspark.sql import Row
from pyspark.sql.types import *

In [7]:
from pyspark.sql.functions import to_timestamp

In [2]:
# Create the SparkSession (or reuse one that already exists), asking Spark to
# pull in the hadoop-aws package at startup.
# NOTE(review): hadoop-aws normally has to match the Hadoop version bundled
# with the Spark build in use — confirm that 2.7.0 is the intended version.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

In [3]:
# Set the runtime conf key `mapreduce.fileoutputcommitter.algorithm.version` to "2".
# NOTE(review): presumably this selects version 2 of Hadoop's FileOutputCommitter
# algorithm to speed up committing output on later writes — confirm it is
# actually picked up by the writer used downstream.
spark.conf.set("mapreduce.fileoutputcommitter.algorithm.version", "2")

In [4]:
# Show the Spark version of the active session.
spark.version

'3.0.1'

In [5]:
# Load the 202011 Citi Bike trip-data CSV; the first row supplies the column
# names. No schema is given, so every column is read as a string.
df = spark.read.option("header", True).csv("cv19_data/202011-citibike-tripdata.csv")

# Convert Time Stamp String Columns To Timestamps

In [12]:
# Sanity check on a five-row sample: cast the string `stoptime` column to a
# timestamp and collect the resulting rows for inspection.
(
    df.limit(5)
    .select(F.col("stoptime").cast("timestamp").alias("datetime"))
    .collect()
)

[Row(datetime=datetime.datetime(2020, 11, 1, 0, 8, 48, 301000)),
 Row(datetime=datetime.datetime(2020, 11, 1, 0, 8, 23, 317000)),
 Row(datetime=datetime.datetime(2020, 11, 1, 0, 33, 14, 164000)),
 Row(datetime=datetime.datetime(2020, 11, 1, 0, 39, 57, 3000)),
 Row(datetime=datetime.datetime(2020, 11, 1, 0, 3, 2, 898000))]

## Get Stop Time

In [10]:
# Parse the string `stoptime` column into a new timestamp column, `stoptime_ts`.
df = df.withColumn(
    "stoptime_ts",
    F.to_timestamp(F.col("stoptime")),
)

## Get Start Time

In [14]:
# Parse the string `starttime` column into a new timestamp column, `starttime_ts`.
df = df.withColumn(
    "starttime_ts",
    F.to_timestamp(F.col("starttime")),
)

# Select Columns To Keep (Drop Unneeded Columns)

In [16]:
df.columns

['tripduration',
 'starttime',
 'stoptime',
 'start station id',
 'start station name',
 'start station latitude',
 'start station longitude',
 'end station id',
 'end station name',
 'end station latitude',
 'end station longitude',
 'bikeid',
 'usertype',
 'birth year',
 'gender',
 'stoptime_ts',
 'starttime_ts']

In [17]:
# Columns to exclude from the trimmed frame: the raw trip duration and the
# string timestamps, followed by the id / latitude / longitude of the start
# station and then of the end station.
columns_to_drop = ["tripduration", "starttime", "stoptime"] + [
    endpoint + " station " + attribute
    for endpoint in ("start", "end")
    for attribute in ("id", "latitude", "longitude")
]

In [20]:
# Build a trimmed copy of `df` without the columns listed in `columns_to_drop`;
# the result is bound to a new name, so `df` keeps all of its columns.
test_df = df.drop(*columns_to_drop)

In [22]:
# Preview five rows of the trimmed frame as a pandas DataFrame.
test_df.limit(5).toPandas()

Unnamed: 0,start station name,end station name,bikeid,usertype,birth year,gender,stoptime_ts,starttime_ts
0,W Broadway & Spring St,Clinton St & Grand St,40405,Subscriber,1989,1,2020-11-01 00:08:48.301,2020-11-01 00:00:07.015
1,40 Ave & 9 St,40 Ave & 9 St,46504,Subscriber,1970,2,2020-11-01 00:08:23.317,2020-11-01 00:00:10.808
2,Roebling St & N 4 St,Morgan Ave & Maspeth Ave,37452,Subscriber,1989,2,2020-11-01 00:33:14.164,2020-11-01 00:00:14.704
3,Cliff St & Fulton St,Amsterdam Ave & W 73 St,40417,Subscriber,1981,1,2020-11-01 00:39:57.003,2020-11-01 00:00:14.707
4,W 59 St & 10 Ave,W 70 St & Amsterdam Ave,35776,Subscriber,1990,1,2020-11-01 00:03:02.898,2020-11-01 00:00:15.969


# Test: Preview Full DataFrame With New Timestamp Columns

In [15]:
# Preview five rows of the full frame as a pandas DataFrame, so the derived
# `stoptime_ts` / `starttime_ts` columns can be compared with the raw strings.
df.limit(5).toPandas()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,stoptime_ts,starttime_ts
0,521,2020-11-01 00:00:07.0150,2020-11-01 00:08:48.3010,3467,W Broadway & Spring St,40.72494672359416,-74.00165855884552,350,Clinton St & Grand St,40.71559509,-73.9870295,40405,Subscriber,1989,1,2020-11-01 00:08:48.301,2020-11-01 00:00:07.015
1,492,2020-11-01 00:00:10.8080,2020-11-01 00:08:23.3170,3557,40 Ave & 9 St,40.75742,-73.945133,3557,40 Ave & 9 St,40.75742,-73.945133,46504,Subscriber,1970,2,2020-11-01 00:08:23.317,2020-11-01 00:00:10.808
2,1979,2020-11-01 00:00:14.7040,2020-11-01 00:33:14.1640,3085,Roebling St & N 4 St,40.71469,-73.95739,3854,Morgan Ave & Maspeth Ave,40.716657,-73.93637,37452,Subscriber,1989,2,2020-11-01 00:33:14.164,2020-11-01 00:00:14.704
3,2382,2020-11-01 00:00:14.7070,2020-11-01 00:39:57.0030,3783,Cliff St & Fulton St,40.70838,-74.00495,3167,Amsterdam Ave & W 73 St,40.77966809007312,-73.98093044757842,40417,Subscriber,1981,1,2020-11-01 00:39:57.003,2020-11-01 00:00:14.707
4,166,2020-11-01 00:00:15.9690,2020-11-01 00:03:02.8980,422,W 59 St & 10 Ave,40.770513,-73.988038,3175,W 70 St & Amsterdam Ave,40.77748046,-73.98288594,35776,Subscriber,1990,1,2020-11-01 00:03:02.898,2020-11-01 00:00:15.969
