# Import

In [49]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as F

from pyspark.sql.functions import udf, col, lit, to_timestamp
from pyspark.sql.functions import hour, count, dayofyear, year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, TimestampType, LongType

In [None]:
from pyspark.sql.functions import hour, count, dayofyear

# Create Spark Session

In [2]:
# create spark session
spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()

In [3]:
# make spark run faster
spark.conf.set("mapreduce.fileoutputcommitter.algorithm.version", "2")

In [4]:
# check version
spark.version

'3.0.1'

# Load Citi Data

In [6]:
# read csv and save to df
df = spark.read.csv("cv19_data/202011-citibike-tripdata.csv",header=True)

In [8]:
df.printSchema()

root
 |-- tripduration: string (nullable = true)
 |-- starttime: string (nullable = true)
 |-- stoptime: string (nullable = true)
 |-- start station id: string (nullable = true)
 |-- start station name: string (nullable = true)
 |-- start station latitude: string (nullable = true)
 |-- start station longitude: string (nullable = true)
 |-- end station id: string (nullable = true)
 |-- end station name: string (nullable = true)
 |-- end station latitude: string (nullable = true)
 |-- end station longitude: string (nullable = true)
 |-- bikeid: string (nullable = true)
 |-- usertype: string (nullable = true)
 |-- birth year: string (nullable = true)
 |-- gender: string (nullable = true)



In [9]:
# change to function
def turn_col_to_ts(df, target_col, new_col):
    """
    df: pass in dataframe
    col: name of df column to transform
    new_col: new name of column
    returns dataframe
    """
    return df.withColumn(new_col, to_timestamp(col(target_col)))

In [41]:
df = turn_col_to_ts(df, "stoptime", "stoptime_ts")

In [43]:
df = turn_col_to_ts(df, "starttime", "starttime_ts")

In [33]:
# function to change get date and time
def turn_ts_date_time(df, target_col, new_date_col, new_time_col):
    df = df.withColumn(new_date_col, date_format(target_col, 'MM/dd/yyy'))
    df = df.withColumn(new_time_col, date_format(target_col, 'HH:mm:ss'))
    return df                                    

In [37]:
df = turn_ts_date_time(df, "stoptime_ts", "stoptime_date", "stoptime_time")

In [45]:
df = turn_ts_date_time(df, "starttime_ts", "starttime_date", "starttime_time")

In [52]:
# create new dataframe grouping by time, location, and counting the number of bikes
citibike_df = (df.groupBy("stoptime_date", hour("stoptime_ts").alias("hour"), "start station name", "end station name")
    .agg(count("bikeid").alias("no_of_trips")))

In [54]:
citibike_df.limit(5).toPandas()

Unnamed: 0,stoptime_date,hour,start station name,end station name,no_of_trips
0,11/01/2020,0,E 4 St & 2 Ave,36 Ave & 31 St,1
1,11/01/2020,0,Washington Park,Central Ave & Starr Street,1
2,11/01/2020,0,Spruce St & Nassau St,E 7 St & Avenue A,1
3,11/01/2020,1,W 25 St & 6 Ave,Christopher St & Greenwich St,1
4,11/01/2020,1,1 Ave & E 16 St,10 Ave & W 28 St,1


In [67]:
print(f"Rows: {citibike_df.count()} Columns: {len(citibike_df.columns)}")

Rows: 1560703 Columns: 5


In [69]:
citibike_df.printSchema()

root
 |-- stoptime_date: string (nullable = true)
 |-- hour: integer (nullable = true)
 |-- start station name: string (nullable = true)
 |-- end station name: string (nullable = true)
 |-- no_of_trips: long (nullable = false)



# Load Weather Data

In [74]:
weather_df = pd.read_csv("cv19_data/nyc_daily_weather.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [83]:
cols_keep_list = ['DATE',
 'SOURCE',
 'REPORT_TYPE',
 'HourlyDryBulbTemperature',
 'HourlyRelativeHumidity',
 'HourlyPrecipitation',
 'HourlyWindSpeed']

In [85]:
weather_df = weather_df[cols_keep_list]

In [89]:
weather_df = weather_df[weather_df['SOURCE']==7]

In [94]:
weather_df["ts"] = 0

In [95]:
weather_df["ts"] = pd.to_datetime(weather_df["DATE"])

In [99]:
weather_df["date"] = weather_df["ts"].dt.date

In [102]:
weather_df["week_ending"] = pd.to_datetime(weather_df["date"]+ Week(weekday=5))

In [107]:
weather_df["hour"] = weather_df["ts"].dt.hour

## Delete Rows With Empty Values

In [110]:
weather_df.isna().sum()

DATE                           0
SOURCE                         0
REPORT_TYPE                    0
HourlyDryBulbTemperature      22
HourlyRelativeHumidity        24
HourlyPrecipitation          937
HourlyWindSpeed             1551
ts                             0
date                           0
week_ending                    0
hour                           0
dtype: int64

In [119]:
# drop rows where HourlyDryBulbTemperature is missing
weather_df.dropna(subset=["HourlyDryBulbTemperature"], inplace=True)

In [120]:
# sort by time
weather_df.sort_values(by="ts", inplace=True)

In [122]:
weather_df.reset_index(inplace=True)

In [123]:
weather_df.fillna(method="ffill",inplace=True)

In [125]:
weather_df = weather_df[['week_ending', 'date', 'hour', 'HourlyDryBulbTemperature',
   'HourlyRelativeHumidity', 'HourlyPrecipitation', 'HourlyWindSpeed']]

# Load Covid Data

In [128]:
import requests

In [129]:
# raw data from nychealth
url = 'https://raw.githubusercontent.com/nychealth/coronavirus-data/master/trends/caserate-by-modzcta.csv'

In [130]:
# get webpage
res = requests.get(url, allow_redirects=True)

In [132]:
# save page info to file
with open('cv19_data/raw_covid_data.csv','wb') as file:
    file.write(res.content)

In [133]:
# read file
covid_df = pd.read_csv('cv19_data/raw_covid_data.csv')

In [135]:
# melt data with rows containing week_ending(date), zip_zode, and case_rate
covid_df = covid_df.melt(id_vars=["week_ending"],
       var_name="zip_code",
       value_name="case_rate")

In [138]:
covid_df["zip_code"] = covid_df["zip_code"].str[9:]

In [None]:
# df = df[~df['date'].isin(a)]

In [155]:
not_zipcodes = ["CITY", "BX", "BK", "MN", "QN", "SI"]

In [159]:
covid_df = covid_df[~covid_df["zip_code"].isin(not_zipcodes)]

In [162]:
covid_df.sort_values(by="week_ending", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covid_df.sort_values(by="week_ending", inplace=True)


In [165]:
covid_df.reset_index(inplace=True)

In [168]:
covid_df.drop(columns="index", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [172]:
covid_df.head()

Unnamed: 0,week_ending,zip_code,case_rate
0,08/08/2020,10001,7.24
1,08/08/2020,10451,44.33
2,08/08/2020,11230,19.12
3,08/08/2020,10314,17.29
4,08/08/2020,11231,16.05


# Get End of week

In [72]:
import pandas as pd

In [70]:
from pandas.tseries.offsets import *

In [73]:
pd.to_datetime(citibike_df["stoptime_date"]+ Week(weekday=5))

AttributeError: 'pandas._libs.tslibs.offsets.Week' object has no attribute '_get_object_id'

In [47]:
df.limit(10).toPandas()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,...,bikeid,usertype,birth year,gender,stoptime_ts,stoptime_date,stoptime_time,starttime_ts,starttime_date,starttime_time
0,521,2020-11-01 00:00:07.0150,2020-11-01 00:08:48.3010,3467,W Broadway & Spring St,40.72494672359416,-74.00165855884552,350,Clinton St & Grand St,40.71559509,...,40405,Subscriber,1989,1,2020-11-01 00:08:48.301,11/01/2020,00:08:48,2020-11-01 00:00:07.015,11/01/2020,00:00:07
1,492,2020-11-01 00:00:10.8080,2020-11-01 00:08:23.3170,3557,40 Ave & 9 St,40.75742,-73.945133,3557,40 Ave & 9 St,40.75742,...,46504,Subscriber,1970,2,2020-11-01 00:08:23.317,11/01/2020,00:08:23,2020-11-01 00:00:10.808,11/01/2020,00:00:10
2,1979,2020-11-01 00:00:14.7040,2020-11-01 00:33:14.1640,3085,Roebling St & N 4 St,40.71469,-73.95739,3854,Morgan Ave & Maspeth Ave,40.716657,...,37452,Subscriber,1989,2,2020-11-01 00:33:14.164,11/01/2020,00:33:14,2020-11-01 00:00:14.704,11/01/2020,00:00:14
3,2382,2020-11-01 00:00:14.7070,2020-11-01 00:39:57.0030,3783,Cliff St & Fulton St,40.70838,-74.00495,3167,Amsterdam Ave & W 73 St,40.77966809007312,...,40417,Subscriber,1981,1,2020-11-01 00:39:57.003,11/01/2020,00:39:57,2020-11-01 00:00:14.707,11/01/2020,00:00:14
4,166,2020-11-01 00:00:15.9690,2020-11-01 00:03:02.8980,422,W 59 St & 10 Ave,40.770513,-73.988038,3175,W 70 St & Amsterdam Ave,40.77748046,...,35776,Subscriber,1990,1,2020-11-01 00:03:02.898,11/01/2020,00:03:02,2020-11-01 00:00:15.969,11/01/2020,00:00:15
5,658,2020-11-01 00:00:21.9490,2020-11-01 00:11:20.5820,3323,W 106 St & Central Park West,40.7981856,-73.9605909006,3226,W 82 St & Central Park West,40.78275,...,45567,Subscriber,1994,1,2020-11-01 00:11:20.582,11/01/2020,00:11:20,2020-11-01 00:00:21.949,11/01/2020,00:00:21
6,605,2020-11-01 00:00:22.9320,2020-11-01 00:10:28.6980,2021,W 45 St & 8 Ave,40.75929124,-73.98859651,4121,1 Ave & E 39 St,40.74714,...,37154,Subscriber,1994,1,2020-11-01 00:10:28.698,11/01/2020,00:10:28,2020-11-01 00:00:22.932,11/01/2020,00:00:22
7,813,2020-11-01 00:00:24.5440,2020-11-01 00:13:58.1290,3761,Cedar St & Myrtle Ave,40.697842,-73.926241,471,Grand St & Havemeyer St,40.71286844,...,37176,Subscriber,1973,1,2020-11-01 00:13:58.129,11/01/2020,00:13:58,2020-11-01 00:00:24.544,11/01/2020,00:00:24
8,152,2020-11-01 00:00:30.1380,2020-11-01 00:03:03.1020,347,Greenwich St & W Houston St,40.728846,-74.008591,3256,Pier 40 - Hudson River Park,40.7277140777778,...,20878,Subscriber,1964,0,2020-11-01 00:03:03.102,11/01/2020,00:03:03,2020-11-01 00:00:30.138,11/01/2020,00:00:30
9,895,2020-11-01 00:00:32.0070,2020-11-01 00:15:27.0420,3075,Division Ave & Marcy Ave,40.70708701,-73.95796783,398,Atlantic Ave & Furman St,40.69165183,...,37424,Customer,1995,1,2020-11-01 00:15:27.042,11/01/2020,00:15:27,2020-11-01 00:00:32.007,11/01/2020,00:00:32
