# Users analysis

In [0]:
# load the data into a PySpark DataFrame: `playlog`
# NOTE: perform the usual checks
### BEGIN STRIP ###
ACCESS_KEY_ID = " " # access key of the student account
SECRET_ACCESS_KEY = " " # secret key of the student account

# Register the S3 credentials and the S3A filesystem implementation
# on the Hadoop configuration used by the Spark session.
hadoop_conf = spark._jsc.hadoopConfiguration()
for conf_key, conf_value in (
    ("fs.s3a.access.key", ACCESS_KEY_ID),
    ("fs.s3a.secret.key", SECRET_ACCESS_KEY),
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
):
    hadoop_conf.set(conf_key, conf_value)

# Read the CSV with its header row and let Spark infer the column types.
playlog = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("s3://full-stack-bigdata-datasets/Big_Data/youtube_playlog.csv")
)
playlog.printSchema()

### END STRIP ###

In [0]:
from pyspark.sql.functions import unix_timestamp, from_unixtime
import datetime
from pyspark.sql.functions import year, month, dayofmonth, dayofweek, dayofyear, weekofyear

# Replace the raw `timestamp` column with a `datetime` column and sort chronologically.
# The guard keeps this cell re-runnable: once `timestamp` has been dropped, running the
# conversion a second time would fail because the column no longer exists.
if 'timestamp' in playlog.columns:
  playlog = playlog \
    .withColumn('datetime', from_unixtime('timestamp')) \
    .drop('timestamp') \
    .orderBy('datetime')

# Derive calendar features from `datetime`.
# withColumn replaces a same-named column, so re-running this step is harmless.
playlog = playlog \
  .withColumn('year', year('datetime')) \
  .withColumn('month', month('datetime')) \
  .withColumn('dayofmonth', dayofmonth('datetime')) \
  .withColumn('dayofyear', dayofyear('datetime')) \
  .withColumn('weekofyear', weekofyear('datetime'))

playlog.printSchema()
# Only the last expression of a cell is displayed, so the (rows, columns) shape
# has to be printed explicitly; otherwise the count job runs and its result is lost.
print((playlog.count(), len(playlog.columns)))
playlog.limit(5).toPandas()

Unnamed: 0,user,song,datetime,year,month,dayofmonth,dayofyear,weekofyear
0,4,nRa-eGzpT6o,1965-07-26 03:21:43,1965,7,26,207,30
1,0,t1l8Z6gLPzo,2014-02-14 14:18:53,2014,2,14,45,7
2,22,Q24VZL8wpOM,2014-02-14 14:18:57,2014,2,14,45,7
3,70,VJ6ofd0pB_c,2014-02-14 14:18:57,2014,2,14,45,7
4,1,t1l8Z6gLPzo,2014-02-14 14:18:58,2014,2,14,45,7


### Aggregates

#### `firstPlay`, `lastPlay`, `playCount`, `uniquePlayCount`
For each user, we will compute these metrics:
- `firstPlay`: datetime of the first listening
- `lastPlay`: datetime of the last listening
- `playCount`: total play counts
- `uniquePlayCount`: unique play counts

We'll save all these in a new DataFrame: `users`.  
When you're done, print out the first 5 rows of `users` ordered by descending `playCount`.

In [0]:
# compute, for each user
#       - firstPlay
#       - lastPlay
#       - playCount
#       - uniquePlayCount
# Save the results in a DataFrame with name `users`
### BEGIN STRIP ###
from pyspark.sql import functions as F

def compute_aggregates(df):
  """Aggregate a play log into one row of listening metrics per user.

  Groups `df` by `user` and computes, from the `datetime` and `song` columns:
    - firstPlay: smallest `datetime`
    - lastPlay: largest `datetime`
    - playCount: number of non-null `song` values
    - uniquePlayCount: number of distinct `song` values

  Returns a new DataFrame; `df` itself is not modified.
  """
  per_user = df.groupBy('user')
  return per_user.agg(
    F.min('datetime').alias('firstPlay'),
    F.max('datetime').alias('lastPlay'),
    F.count('song').alias('playCount'),
    F.countDistinct('song').alias('uniquePlayCount'),
  )

users = compute_aggregates(playlog)

# Equivalent, using DataFrame.transform:
# users = playlog.transform(compute_aggregates)

# First 5 users by descending total play count
users.orderBy(F.col('playCount').desc()).limit(5).toPandas()
### END STRIP ###

Unnamed: 0,user,firstPlay,lastPlay,playCount,uniquePlayCount
0,213,2014-02-14 15:34:17,2019-04-02 06:04:08,278749,161406
1,7290,2014-04-30 20:12:41,2019-04-03 06:50:05,151513,83831
2,435,2014-02-14 19:51:09,2019-04-03 19:36:28,144711,20055
3,21950,2014-10-23 09:09:36,2019-02-06 00:54:54,126285,15075
4,6270,2014-04-13 18:45:54,2018-08-11 20:46:08,125056,9247


In [0]:
# Sanity check that no firstPlay is later than lastPlay (the count should be 0)
### BEGIN STRIP ###
users.where(users['firstPlay'] > users['lastPlay']).count()
### END STRIP ###

In [0]:
# Another sanity check, we grouped on user, so these should be unique
# TODO: make sure all users are unique in the DataFrame
### BEGIN STRIP ###
total_users = users.count()
distinct_users = users.select('user').distinct().count()
print(f"Total users: {total_users}")
print(f"Distinct users: {distinct_users}")
# Enforce the invariant instead of relying on a visual comparison of the two numbers.
assert total_users == distinct_users, "duplicate users found after grouping by 'user'"
### END STRIP ###

### `timespan`
We will compute `timespan`: the overall span of a user's activity in whole days, rounded down. For example:
- if a user was active 23 hours on the service, we will say he was active 0 days
- for 53 hours, that would be 2 days of activity

We **will not** transform the `users` DataFrame in place, but instead save the result as a new DataFrame: `users_with_timespan`.

In [0]:
# Compute timespan and save the result as a new DataFrame: `users_with_timespan`
### BEGIN STRIP ###
from pyspark.sql.types import IntegerType

def compute_timespan(df):
  """Add a `timespan` column: whole days elapsed between `firstPlay` and `lastPlay`.

  The difference between the two Unix timestamps (in seconds) is divided by
  the number of seconds in a day and cast to an integer, which drops the
  fractional part.

  Returns a new DataFrame; `df` itself is not modified.
  """
  seconds_per_day = 60**2 * 24
  elapsed_seconds = F.unix_timestamp('lastPlay') - F.unix_timestamp('firstPlay')
  whole_days = (elapsed_seconds / seconds_per_day).cast(IntegerType())
  return df.withColumn('timespan', whole_days)

# Build a new DataFrame rather than overwriting `users`.
users_with_timespan = compute_timespan(users)
users_with_timespan.limit(5).toPandas()
### END STRIP ###

Unnamed: 0,user,firstPlay,lastPlay,playCount,uniquePlayCount,timespan
0,31,2014-02-14 14:21:15,2018-01-05 08:30:40,43879,20293,1420
1,34,2014-02-14 14:21:28,2016-08-10 17:16:07,411,262,908
2,53,2014-02-14 14:21:29,2018-07-05 15:53:07,4974,3638,1602
3,65,2014-02-14 14:23:55,2019-03-28 19:59:17,17830,3377,1868
4,78,2014-02-14 14:37:16,2014-05-24 09:48:59,14,11,98


Let's see what this looks like: we will use Databricks' `display` to plot a histogram of `timespan`.

In [0]:
# Plot a histogram of `timespan`
### BEGIN STRIP ###
display(users_with_timespan.select(F.col('timespan')))
### END STRIP ###

timespan
1837
999
1394
1066
19583
1853
1857
968
1290
202


This looks like a power law, so let's try a log transform.

In [0]:
# Use describe on the `timespan` column
### BEGIN STRIP ###
# Summary statistics for `timespan`, indexed by statistic name.
users_with_timespan.describe('timespan').toPandas().set_index('summary')
### END STRIP ###

Unnamed: 0_level_0,timespan
summary,Unnamed: 1_level_1
count,45904.0
mean,127.10855263157896
stddev,320.85394251182464
min,0.0
max,19583.0


In [0]:
# Plot a histogram of log transformed `timespan`
### BEGIN STRIP ###
# log1p (log(1 + x)) is used so that a timespan of 0 days stays defined.
display(users_with_timespan.select(F.log1p(F.col('timespan'))))
### END STRIP ###

LOG1P(timespan)
7.516433302915632
6.907755278982137
7.240649694255466
6.9726062513017535
9.882468185312051
7.5251007461258
7.527255919373784
6.876264611890766
7.1631723908466425
5.313205979041787


In [0]:
# Plot a QQ-Plot of log transformed `timespan`
### BEGIN STRIP ###
# Same data as the histogram cell; presumably the chart type is chosen
# in the `display` output widget rather than in code.
display(users_with_timespan.select(F.log1p(users_with_timespan['timespan'])))
### END STRIP ###

LOG1P(timespan)
7.516433302915632
6.907755278982137
7.240649694255466
6.9726062513017535
9.882468185312051
7.5251007461258
7.527255919373784
6.876264611890766
7.1631723908466425
5.313205979041787


We'll filter out users who stayed for less than a day and plot a histogram of this filtered data.

In [0]:
# Plot a histogram of log transformed `timespan` of users who stayed at least one full day
### BEGIN STRIP ###
# Rows with timespan == 0 are dropped first, so the plain log is defined for every remaining row.
display(users_with_timespan.filter(F.col('timespan') != 0).select(F.log(F.col('timespan'))))
### END STRIP ###

ln(timespan)
6.906754778648554
6.97166860472579
7.52456122628536
7.526717561352706
6.78332520060396
6.680854678790215
5.680172609017068
4.477336814478207
7.535830462798367
6.376726947898627


### `isSingleDayUser`
What percentage of users used the service for less than one day?

In [0]:
# Compute the percentage of users who used the service for less than a day
### BEGIN STRIP ###
# Cast the boolean condition to 0/1 so that summing it counts the matching users.
is_short_lived = (F.col('timespan') < 1).cast(IntegerType())
short_lived_count = users_with_timespan.agg(F.sum(is_short_lived)).first()[0]
short_lived_count / users.count() * 100
### END STRIP ###

Wow, that's a lot! We will flag this as its own column.  
That means we will create a new Boolean column `isSingleDayUser` that is `True` if the user used the service for less than a day and `False` otherwise.

In [0]:
# Create a new column (isSingleDayUser) to flag if a user used the service for less than a day
### BEGIN STRIP ###
is_single_day = F.col('timespan') < 1
users_with_single_day = users_with_timespan.withColumn('isSingleDayUser', is_single_day)
users_with_single_day.limit(5).toPandas()
### END STRIP ###

Unnamed: 0,user,firstPlay,lastPlay,playCount,uniquePlayCount,timespan,isSingleDayUser
0,31,2014-02-14 14:21:15,2018-01-05 08:30:40,43879,20293,1420,False
1,34,2014-02-14 14:21:28,2016-08-10 17:16:07,411,262,908,False
2,53,2014-02-14 14:21:29,2018-07-05 15:53:07,4974,3638,1602,False
3,65,2014-02-14 14:23:55,2019-03-28 19:59:17,17830,3377,1868,False
4,78,2014-02-14 14:37:16,2014-05-24 09:48:59,14,11,98,False


### Measure of activity: `activeDaysCount` and `dailyAvgPlayCount`
This one is a bit harder, we want to compute:
- the number of active days for each user (not the `timespan`)
- the average play count on these active days for each user

In [0]:
# create 2 new columns
#       - activeDaysCount: the count of days each user was active
#       - dailyAvgPlayCount: the daily average playcount per user (active days only)
# (an active day is a day on which the user played at least one song)
### BEGIN STRIP ###
def computeDailyStats(df):
  """Compute per-user activity statistics over the days that have plays.

  Rows of `df` are first counted per (user, year, dayofyear); those daily
  counts are then reduced per user into:
    - dailyAvgPlayCount: mean of the daily play counts
    - activeDaysCount: number of (year, dayofyear) groups for that user

  Returns a new DataFrame with one row per user.
  """
  plays_per_day = df.groupBy('user', 'year', 'dayofyear').count()
  return plays_per_day.groupBy('user').agg(
    F.mean('count').alias('dailyAvgPlayCount'),
    F.count('count').alias('activeDaysCount'),
  )

# Attach the per-user daily statistics to the existing user metrics.
daily_stats = computeDailyStats(playlog)
users_with_avg = users_with_single_day.join(daily_stats, on='user')
users_with_avg.limit(5).toPandas()
### END STRIP ###

Unnamed: 0,user,firstPlay,lastPlay,playCount,uniquePlayCount,timespan,isSingleDayUser,dailyAvgPlayCount,activeDaysCount
0,31,2014-02-14 14:21:15,2018-01-05 08:30:40,43879,20293,1420,False,69.871019,628
1,34,2014-02-14 14:21:28,2016-08-10 17:16:07,411,262,908,False,15.222222,27
2,53,2014-02-14 14:21:29,2018-07-05 15:53:07,4974,3638,1602,False,29.607143,168
3,65,2014-02-14 14:23:55,2019-03-28 19:59:17,17830,3377,1868,False,40.522727,440
4,78,2014-02-14 14:37:16,2014-05-24 09:48:59,14,11,98,False,3.5,4


In [0]:
# Plot a histogram of log of `activeDaysCount`
### BEGIN STRIP ###
display(users_with_avg.select(F.log(F.col('activeDaysCount'))))
### END STRIP ###

ln(activeDaysCount)
4.605170185988092
1.791759469228055
5.831882477283517
4.382026634673881
0.0
4.204692619390966
6.825460036255307
1.791759469228055
1.6094379124341005
6.70073110954781


In [0]:
# Plot a histogram of log of `dailyAvgPlayCount`
### BEGIN STRIP ###
display(users_with_avg.select(F.log(F.col('dailyAvgPlayCount'))))
### END STRIP ###

ln(dailyAvgPlayCount)
2.4096441652874536
2.908720896564361
3.80385634050828
1.929708174479033
2.6390573296152584
3.4135584784857294
3.999902626448877
5.883786533309793
1.6094379124341005
3.89079057416144


## Going further
What else do you think would be interesting to compute?
What about the ratio of activity, e.g. the ratio between `timespan` and `activeDaysCount`?