# EDA on time-based features

In [0]:
# S3 locations; judging by the file names, the CSV play log and its
# processed Parquet counterpart.
filepath = "s3://full-stack-bigdata-datasets/Big_Data/youtube_playlog.csv"
INPUT_FILENAME = 's3://full-stack-bigdata-datasets/Big_Data/playlog_processed.parquet'

# Placeholders to be filled in before running the notebook.
ACCESS_KEY_ID = " "  # access key of the student account
SECRET_ACCESS_KEY = " "  # secret key of the student account

# Pass the credentials and the S3A filesystem class to the Hadoop
# configuration of the running Spark session.
hadoop_conf = spark._jsc.hadoopConfiguration()
s3a_settings = {
    "fs.s3a.access.key": ACCESS_KEY_ID,
    "fs.s3a.secret.key": SECRET_ACCESS_KEY,
    "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
}
for setting_name, setting_value in s3a_settings.items():
    hadoop_conf.set(setting_name, setting_value)

In [0]:
# load data
### BEGIN STRIP ###
# No explicit format is given, so the reader falls back to Spark's default
# data source (Parquet unless the session was configured otherwise).
df_raw = spark.read.load(INPUT_FILENAME)
# Last expression of the cell: the number of rows is shown as the cell output.
df_raw.count()
### END STRIP ###

In [0]:
# As a refresher: take 5 rows and convert only those to a pandas DataFrame,
# which is then shown as the cell output.
df_raw.limit(5).toPandas()

Unnamed: 0,timestamp,user,song
0,1392388988,16,ba_ztQx87Ew
1,1392389603,101,KsMEBEcxzYA
2,1392393824,180,uyrtLz_ShL4
3,1392394627,193,rnQG04SkWD4
4,1392387742,15,e7jMGpzDBKI


In [0]:
# compute a new column `datetime`
#       that converts the timestamp to a datetime
#       drop the `timestamp` column
#       and order by `datetime`
#       save this as a new DataFrame `df`
#       show the first 5 rows of `df`
### BEGIN STRIP ###
from pyspark.sql import functions as F

# `from_unixtime` only renders the epoch seconds as a 'yyyy-MM-dd HH:mm:ss'
# string; the cast turns that string into a real timestamp column, so the
# schema matches what the column name promises.
df = (
    df_raw
    .withColumn('datetime', F.from_unixtime('timestamp').cast('timestamp'))
    .drop('timestamp')
    .orderBy('datetime')
    # Keep only the columns used in the rest of the analysis, in this order.
    .select('datetime', 'user', 'song')
)
df.show(5)

### END STRIP ###

Now that we have a datetime column, we can compute new columns, namely:
- [year](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=date#pyspark.sql.functions.year)
- [month](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=date#pyspark.sql.functions.month)
- [dayofmonth](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=date#pyspark.sql.functions.dayofmonth)
- [dayofweek](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=date#pyspark.sql.functions.dayofweek)
- [dayofyear](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=date#pyspark.sql.functions.dayofyear)
- [weekofyear](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=date#pyspark.sql.functions.weekofyear)

We will put the resulting DataFrame in a variable called `df_enriched`.

In [0]:
### BEGIN STRIP ###
from functools import reduce

funcs = [F.year, F.month, F.dayofmonth, F.dayofweek, F.dayofyear, F.weekofyear]


def _with_date_part(frame, func):
    """Return `frame` with one more column, named after `func` and computed from `datetime`."""
    return frame.withColumn(func.__name__, func('datetime'))


# Fold the function list over `df`: every step adds one column to the
# DataFrame produced by the previous step.
df_enriched = reduce(_with_date_part, funcs, df)

# alternatively
# df_enriched = df.select('*', *(f('datetime').alias(f.__name__) for f in funcs))

df_enriched.show()
### END STRIP ###

In [0]:
# print out the schema of the new dataframe
### BEGIN STRIP ###
# Prints each column of `df_enriched` with its data type and nullability.
df_enriched.printSchema()
### END STRIP ###

In [0]:
# Plot the total count of plays per month
### BEGIN STRIP ###
# One row per month with its number of rows, sorted by month.
monthly_counts = df_enriched.groupBy('month').count()
display(monthly_counts.orderBy('month'))
### END STRIP ###

month,count
1,2003539
2,2147710
3,2518750
4,2265007
5,2366518
6,2086337
7,1953630
8,1760367
9,1966203
10,2386054


In [0]:
# clean up: wrap the group / count / order pattern in a reusable helper
### BEGIN STRIP ###
def count_by_period(col_name, df):
    """Count the rows of `df` for each value of `col_name`.

    Args:
        col_name: name of the column to group by.
        df: object exposing `groupBy`, e.g. a Spark DataFrame.

    Returns:
        The grouped counts, ordered by `col_name`.
    """
    counts = df.groupBy(col_name).count()
    return counts.orderBy(col_name)

from functools import partial
### END STRIP ###

In [0]:
# bar plot by year
### BEGIN STRIP ###
# Bind the grouping column first: `transform` calls the function with the
# DataFrame as its only argument.
count_by_year = partial(count_by_period, 'year')
display(df_enriched.transform(count_by_year))
### END STRIP ###

year,count
2014,8041498
2015,8143052
2016,6017753
2017,2468620
2018,819927
2019,125035


In [0]:
# bar plot by month
### BEGIN STRIP ###
# Bind the grouping column first: `transform` calls the function with the
# DataFrame as its only argument.
count_by_month = partial(count_by_period, 'month')
display(df_enriched.transform(count_by_month))
### END STRIP ###

month,count
1,2003539
2,2147710
3,2518750
4,2265007
5,2366518
6,2086337
7,1953630
8,1760367
9,1966203
10,2386054


In [0]:
# bar plot by weekofyear
### BEGIN STRIP ###
# Bind the grouping column first: `transform` calls the function with the
# DataFrame as its only argument.
count_by_week_of_year = partial(count_by_period, 'weekofyear')
display(df_enriched.transform(count_by_week_of_year))
### END STRIP ###

weekofyear,count
1,434786
2,424200
3,466600
4,496326
5,473586
6,477376
7,505821
8,581735
9,600993
10,568131


In [0]:
# bar plot by dayofmonth
### BEGIN STRIP ###
# Bind the grouping column first: `transform` calls the function with the
# DataFrame as its only argument.
count_by_day_of_month = partial(count_by_period, 'dayofmonth')
display(df_enriched.transform(count_by_day_of_month))
### END STRIP ###

dayofmonth,count
1,830349
2,858332
3,820602
4,848713
5,850318
6,871429
7,835162
8,832550
9,842589
10,845356


In [0]:
# bar plot by dayofyear
### BEGIN STRIP ###
# Bind the grouping column first: `transform` calls the function with the
# DataFrame as its only argument.
count_by_day_of_year = partial(count_by_period, 'dayofyear')
display(df_enriched.transform(count_by_day_of_year))
### END STRIP ###

dayofyear,count
1,41434
2,48587
3,45183
4,60572
5,53649
6,61874
7,60580
8,62790
9,79198
10,55000
