# Sparkify Capstone Project workspace for the full data set (12 GB)

Import libraries

In [1]:
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, concat, desc, explode, lit, min, max, split, udf, isnull
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.ml.feature import CountVectorizer, IDF, Normalizer, PCA, RegexTokenizer, StandardScaler, StopWordsRemover, StringIndexer, VectorAssembler
import datetime

from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, DecisionTreeClassifier, NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## ETL 

### Extract data
load data

In [2]:
# Obtain the Spark session used by the rest of the notebook.
session_builder = SparkSession.builder.appName("Sparkify")
spark = session_builder.getOrCreate()

# Input file. The local mini sample is active; the alternative source is kept
# below as a comment:
#   event_data = "s3n://udacity-dsnd/sparkify/sparkify_event_data.json"
path = "mini_sparkify_event_data.json"

In [3]:
# Read the JSON event log found at `path` into a DataFrame. The commented-out
# line is the same read against `event_data` instead.
#spark_df = spark.read.json(event_data)
spark_df = spark.read.json(path)

In [4]:
# Number of rows in the raw event log, before any cleaning.
spark_df.count()

286500

### clean null and empty data

In [5]:
# Keep only events that belong to an identified user: first discard rows whose
# userId is the empty string, then rows where userId or sessionId is null.
has_user_id = spark_df["userId"] != ""
spark_df_clean = (
    spark_df
    .filter(has_user_id)
    .dropna(how="any", subset=["userId", "sessionId"])
)

In [6]:
# Number of rows that remain after the cleaning step.
spark_df_clean.count()

278154

### Transformation

#### convert ts to real time

In [7]:
def _format_ts(ts_ms):
    """Format an epoch-millisecond value as 'YYYY-MM-DD HH:MM:SS' (local time)."""
    moment = datetime.datetime.fromtimestamp(ts_ms / 1000.0)
    return moment.strftime("%Y-%m-%d %H:%M:%S")


# Add a human-readable "time" column computed from the raw "ts" column.
gen_time = udf(_format_ts)
spark_df_clean = spark_df_clean.withColumn("time", gen_time(spark_df_clean["ts"]))

Get hour, weekday and day

In [8]:
def _hour_of(ts_ms):
    """Hour of day (0-23) of an epoch-millisecond value, in local time."""
    return datetime.datetime.fromtimestamp(ts_ms / 1000.0).hour


def _weekday_of(ts_ms):
    """Weekday of an epoch-millisecond value as the '%w' string ('0' = Sunday)."""
    return datetime.datetime.fromtimestamp(ts_ms / 1000.0).strftime("%w")


def _day_of(ts_ms):
    """Day of month of an epoch-millisecond value, in local time."""
    return datetime.datetime.fromtimestamp(ts_ms / 1000.0).day


# The UDFs are registered without an explicit return type, as before.
gen_hour = udf(_hour_of)
gen_weekday = udf(_weekday_of)
gen_day = udf(_day_of)

# Derive the three calendar columns from "ts".
ts_column = spark_df_clean["ts"]
spark_df_clean = (
    spark_df_clean
    .withColumn("hour", gen_hour(ts_column))
    .withColumn("weekday", gen_weekday(ts_column))
    .withColumn("day", gen_day(ts_column))
)

#### convert location to state

In [9]:
def _state_from_location(location):
    """Return the last two characters of *location*, or None when it is null.

    Presumably the location string ends with a two-letter state code; for a
    value listing several states only the last one is kept -- TODO confirm
    that this is the intended behaviour.
    """
    # A null location used to raise TypeError inside the UDF and abort the job.
    if location is None:
        return None
    return location[-2:]


get_state = udf(_state_from_location)
spark_df_clean = spark_df_clean.withColumn(
    "location_state", get_state(spark_df_clean["location"])
)

#### simplify userAgent

In [10]:
def _platform_from_user_agent(user_agent):
    """Return the text between the first '(' and the first ')' of *user_agent*.

    Returns None when the value is null or lacks either parenthesis; previously
    those inputs raised inside the UDF and failed the whole Spark job.
    """
    if user_agent is None:
        return None
    start = user_agent.find("(")
    end = user_agent.find(")")
    if start == -1 or end == -1:
        return None
    # Slicing already yields a str, so the former "".join(...) was a no-op.
    return user_agent[start + 1:end]


simp_useragent = udf(_platform_from_user_agent)
spark_df_clean = spark_df_clean.withColumn(
    "sim_user_agent", simp_useragent(spark_df_clean["userAgent"])
)

## Feature engineering

In [11]:
def features_merge(df1, df2):
    """
    Left-join the columns of df2 onto df1 using the "userId" column.
    input: df1 - base data frame; df2 - data frame whose features are added
    output: df1 with the non-key columns of df2 appended
    """
    # Rename the key on the right side so the join result does not carry two
    # columns called "userId"; the temporary key is dropped afterwards.
    right = df2.withColumnRenamed("userId", "userIdTemp")
    same_user = df1["userId"] == right["userIdTemp"]
    merged = df1.join(right, same_user, "left")
    return merged.drop("userIdTemp")

In [12]:
# Seed of the feature table: one row per distinct userId, ordered by userId.
distinct_user_ids = spark_df_clean.select("userId").dropDuplicates()
df_feature = distinct_user_ids.sort("userId")

In [13]:
# Distinct userIds collected to the driver as a sorted Python list.
user_rows = spark_df_clean.select("userId").dropDuplicates().sort("userId").collect()
user_list = [row["userId"] for row in user_rows]

In [14]:
# gender
# Distinct (userId, gender) pairs, attached to the feature table.
gender_pairs = spark_df_clean.select("userId", "gender").dropDuplicates()
gender_df = gender_pairs.sort("userId")
df_feature = features_merge(df_feature, gender_df)

In [15]:
# level
# Distinct (userId, level) pairs, attached to the feature table.
# NOTE(review): a user who appears with more than one level contributes one
# row per level, so the join multiplies that user's rows -- confirm intended.
level_pairs = spark_df_clean.select("userId", "level").dropDuplicates()
level_df = level_pairs.sort("userId")
df_feature = features_merge(df_feature, level_df)

In [16]:
# method
# Distinct (userId, method) pairs, attached to the feature table.
# NOTE(review): a user who appears with more than one method contributes one
# row per method, so the join multiplies that user's rows -- confirm intended.
method_pairs = spark_df_clean.select("userId", "method").dropDuplicates()
method_df = method_pairs.sort("userId")
df_feature = features_merge(df_feature, method_df)

In [17]:
# location_state
# Distinct (userId, location_state) pairs, attached to the feature table.
state_pairs = spark_df_clean.select("userId", "location_state").dropDuplicates()
location_state_df = state_pairs.sort("userId")
df_feature = features_merge(df_feature, location_state_df)

In [18]:
# user agent
# Distinct (userId, sim_user_agent) pairs, attached to the feature table.
agent_pairs = spark_df_clean.select("userId", "sim_user_agent").dropDuplicates()
agent_df = agent_pairs.sort("userId")
df_feature = features_merge(df_feature, agent_df)

In [19]:
# page

In [20]:
# Distinct page names (collected to the driver) and the total number of
# events per user, which is later used as the denominator for percentages.
page_rows = spark_df_clean.select("page").dropDuplicates().collect()
page_list = [row["page"] for row in page_rows]
events_per_user = spark_df_clean.groupby("userId").count()
page_view_total_count = events_per_user.sort("userId")

In [23]:
# Per-user usage of every page type. For each page two columns are added:
#   count_<Page>   - number of events the user generated on that page
#   totalts_<Page> - minutes between the user's first and last event on it
# Users who never visited a page get nulls in both columns (left join).
from pyspark.sql import functions as F

temp_df_feature = spark_df_clean.select("userId").dropDuplicates().sort("userId")
for page in page_list:
    suffix = page.replace(" ", "")
    col_name = "count_" + suffix
    col_name_2 = "totalts_" + suffix

    # One aggregation yields both statistics. The previous version scanned the
    # data three times per page and computed the time span with an untyped
    # Python UDF, which turned totalts_<Page> into a string column.
    page_stats = (
        spark_df_clean
        .filter(spark_df_clean["page"] == page)
        .groupby("userId")
        .agg(
            F.count(F.lit(1)).alias(col_name),
            ((F.max("ts") - F.min("ts")) / 1000 / 60).alias(col_name_2),
        )
    )
    temp_df_feature = features_merge(temp_df_feature, page_stats)

In [24]:
# Turn the raw per-page counts into ratios:
#   percentage_<Page> = count_<Page> / total events of the user
#   freq_<Page>       = count_<Page> / totalts_<Page>
# The raw count_/totalts_ columns and the total "count" are dropped afterwards.
from pyspark.sql import functions as F

temp_df_feature = features_merge(temp_df_feature, page_view_total_count)


def get_div(x, y):
    """Return the column expression x / y, or 0 where y is null or not > 0.

    Both operands are cast to double so that string-typed inputs still divide
    numerically. A null numerator (user never visited the page) counts as 0.
    """
    numerator = F.coalesce(x.cast("double"), F.lit(0.0))
    denominator = y.cast("double")
    return F.when(denominator > 0, numerator / denominator).otherwise(F.lit(0.0))


for page in page_list:
    suffix = page.replace(" ", "")
    col_name = "count_" + suffix
    col_name_2 = "totalts_" + suffix
    col_name_3 = "percentage_" + suffix
    col_name_4 = "freq_" + suffix
    current = temp_df_feature
    # withColumn returns a new DataFrame, so the result has to be assigned.
    # Previously it was discarded and the ratio columns were never created.
    temp_df_feature = (
        current
        .withColumn(col_name_3, get_div(current[col_name], current["count"]))
        .withColumn(col_name_4, get_div(current[col_name], current[col_name_2]))
        .drop(col_name, col_name_2)
    )

temp_df_feature = temp_df_feature.drop("count")
    
    

In [None]:
# Preview the per-page feature table.
temp_df_feature.show()

In [None]:
# Attach the per-page features to the main feature table; features_merge does
# a left join on userId.
df_feature=features_merge(df_feature,temp_df_feature)

In [None]:
# number of days after registration
# For every user: (timestamp of the latest event - registration timestamp),
# converted from milliseconds to days and stored as "number_of_days".
user_max_time = spark_df_clean.groupby("userId").max("ts").sort("userId")
user_reg_time = spark_df_clean.select("userId", "registration").dropDuplicates().sort("userId")
user_max_time = features_merge(user_max_time, user_reg_time)

# withColumn returns a new DataFrame. The result used to be discarded, so the
# column never existed and only userId reached the feature table. Plain column
# arithmetic replaces the untyped Python UDF and yields a numeric column.
elapsed_ms = user_max_time["max(ts)"] - user_max_time["registration"]
user_max_time = user_max_time.withColumn(
    "number_of_days", elapsed_ms / 1000 / 60 / 60 / 24
)
user_max_time = user_max_time.drop("max(ts)", "registration")
df_feature = features_merge(df_feature, user_max_time)


In [None]:
# session
# Length of every (userId, sessionId) session in minutes, computed as
# (latest ts - earliest ts) / 1000 / 60. `max` and `min` here are the
# pyspark.sql.functions aggregates imported at the top of the notebook.
session_minutes = (max(spark_df_clean["ts"]) - min(spark_df_clean["ts"])) / 1000 / 60
sessiontime_sparkdf = (
    spark_df_clean
    .groupby("userId", "sessionId")
    .agg(session_minutes.alias("sessiontime"))
    .sort("userId", "sessionId")
)

sessions_by_user = sessiontime_sparkdf.groupby("userId")

# Number of sessions per user.
session_count = (
    sessions_by_user.count()
    .sort("userId")
    .withColumnRenamed("count", "session_count")
)
# Longest, shortest and average session length per user.
max_session_t = sessions_by_user.max("sessiontime").sort("userId")
min_session_t = sessions_by_user.min("sessiontime").sort("userId")
avg_session_t = sessions_by_user.avg("sessiontime").sort("userId")

# Attach the four session features in the same order as before.
for session_feature in (session_count, max_session_t, min_session_t, avg_session_t):
    df_feature = features_merge(df_feature, session_feature)

In [None]:
# song
# Number of "NextSong" events per (userId, sessionId), i.e. songs played in
# each session.
song_events = spark_df_clean.filter(spark_df_clean["page"] == "NextSong")
songs_per_session = song_events.groupby("userId", "sessionId").count()
song_sparkdf = songs_per_session.sort("userId", "sessionId")

In [None]:
# Largest per-session song count of each user.
max_song_session = (
    song_sparkdf
    .groupby("userId")
    .max("count")
    .sort("userId")
    .withColumnRenamed("max(count)", "session_max_num_songs")
)

In [None]:
# Smallest per-session song count of each user.
min_song_session = (
    song_sparkdf
    .groupby("userId")
    .min("count")
    .sort("userId")
    .withColumnRenamed("min(count)", "session_min_num_songs")
)

In [None]:
# Average per-session song count of each user.
avg_song_session = (
    song_sparkdf
    .groupby("userId")
    .avg("count")
    .sort("userId")
    .withColumnRenamed("avg(count)", "session_avg_num_songs")
)

In [None]:
# Play count of every (userId, song) pair, taken from "NextSong" events.
song_featuredf2 = (
    spark_df_clean
    .filter(spark_df_clean["page"] == "NextSong")
    .groupby("userId", "song")
    .count()
    .sort("userId")
)

# Number of different songs each user played.
diff_song_num = (
    song_featuredf2
    .groupby("userId")
    .count()
    .sort("userId")
    .withColumnRenamed("count", "diff_song_num")
)

# Play count of each user's most played song. The aggregation has to start
# from song_featuredf2 (a bare `groupby` call was a NameError), and the
# aggregated column is named "max(count)", so that is the name to rename.
most_freq_song_num = (
    song_featuredf2
    .groupby("userId")
    .max("count")
    .sort("userId")
    .withColumnRenamed("max(count)", "most_freq_song_num")
)

In [None]:
# Attach the song-based features to the feature table, one left join each,
# in the same order as before.
song_features = (
    max_song_session,
    min_song_session,
    avg_song_session,
    diff_song_num,
    most_freq_song_num,
)
for song_feature in song_features:
    df_feature = features_merge(df_feature, song_feature)