# Sparkify Capstone Project workspace for the full data set (12GB)

Import libraries

In [1]:
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, concat, desc, explode, lit, min, max, split, udf, isnull
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.ml.feature import CountVectorizer, IDF, Normalizer, PCA, RegexTokenizer, StandardScaler, StopWordsRemover, StringIndexer, VectorAssembler
import datetime

from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, DecisionTreeClassifier, NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
2,application_1555988898012_0003,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


## ETL 

### Extract data
load data

In [2]:
# Obtain the SparkSession for the "Sparkify" application.
spark = (
    SparkSession.builder
    .appName("Sparkify")
    .getOrCreate()
)

# S3 location of the full Sparkify event log (JSON).
event_data = "s3n://udacity-dsnd/sparkify/sparkify_event_data.json"


VBox()

In [3]:
# Load the JSON event log at `event_data` into a Spark DataFrame.
spark_df = spark.read.json(event_data)


VBox()

In [None]:
# Number of rows in the raw (uncleaned) event data.
spark_df.count()

### clean null and empty data

In [5]:
# Keep only events attributable to a user: drop rows whose userId is the
# empty string, then rows where userId or sessionId is null.
has_user_id = spark_df["userId"] != ""
spark_df_clean = (
    spark_df
    .filter(has_user_id)
    .dropna(how="any", subset=["userId", "sessionId"])
)

VBox()

In [None]:
# Number of rows remaining after the userId/sessionId cleaning step.
spark_df_clean.count()

### Transformation

#### convert ts to real time

In [6]:
# Add a "time" column holding `ts` formatted as "YYYY-MM-DD HH:MM:SS".
# `ts` is divided by 1000.0 before conversion, i.e. it is treated as epoch
# milliseconds.
# NOTE(review): fromtimestamp() is called without a tz argument, so the
# result is in the local time zone of the Python worker - confirm that is
# the intended zone.
gen_time = udf(
    lambda ts_ms: datetime.datetime.fromtimestamp(ts_ms / 1000.0).strftime(
        "%Y-%m-%d %H:%M:%S"
    )
)
spark_df_clean = spark_df_clean.withColumn("time", gen_time(spark_df_clean["ts"]))

VBox()

get hour, weekday and day

In [7]:
# UDFs that pull calendar parts out of the epoch-millisecond `ts` column.
# NOTE(review): none of these UDFs declares a returnType, so the resulting
# columns use PySpark's default UDF return type (StringType per the PySpark
# docs) even though .hour and .day are Python ints - confirm downstream code
# expects strings.
gen_hour = udf(lambda ts_ms: datetime.datetime.fromtimestamp(ts_ms / 1000.0).hour)
# "%w" formats the weekday as a decimal number, 0 being Sunday.
gen_weekday = udf(
    lambda ts_ms: datetime.datetime.fromtimestamp(ts_ms / 1000.0).strftime("%w")
)
gen_day = udf(lambda ts_ms: datetime.datetime.fromtimestamp(ts_ms / 1000.0).day)

# Append the three derived columns in the same order as before.
ts_column = spark_df_clean["ts"]
spark_df_clean = (
    spark_df_clean
    .withColumn("hour", gen_hour(ts_column))
    .withColumn("weekday", gen_weekday(ts_column))
    .withColumn("day", gen_day(ts_column))
)

VBox()

#### convert location to state

In [8]:
def _state_from_location(location):
    """Return the last two characters of *location*, or None for a null value.

    The null guard matters because slicing None raises TypeError inside the
    Python worker, and a single such row would fail the whole Spark job.

    NOTE(review): presumably `location` ends with a two-letter state code
    (e.g. "..., CA") - confirm against the data.
    """
    if location is None:
        return None
    return location[-2:]


get_state = udf(_state_from_location)
spark_df_clean = spark_df_clean.withColumn(
    "location_state", get_state(spark_df_clean["location"])
)

VBox()

#### simplify userAgent

In [9]:
def _platform_from_user_agent(user_agent):
    """Return the text between the first "(" and the first ")" of *user_agent*.

    Returns None when *user_agent* is null or lacks either parenthesis, so a
    single malformed row cannot fail the whole Spark job with a TypeError or
    ValueError raised inside the UDF. For strings that contain both
    characters the result is identical to the previous
    ``x[x.index('(') + 1:x.index(')')]`` slice.
    """
    if user_agent is None:
        return None
    start = user_agent.find("(")
    end = user_agent.find(")")
    if start == -1 or end == -1:
        return None
    return user_agent[start + 1:end]


simp_useragent = udf(_platform_from_user_agent)
spark_df_clean = spark_df_clean.withColumn(
    "sim_user_agent", simp_useragent(spark_df_clean["userAgent"])
)

VBox()

## Feature engineering

In [11]:
def features_merge(df1, df2):
    """
    Left-join the feature columns of df2 onto df1 by userId.

    input:
        df1: data frame whose rows are all kept (left side of the join)
        df2: data frame with a userId column plus the feature column(s) to add
    output: df1 with df2's non-key columns appended; df2's copy of the key
            column is not included in the result
    """
    # Rename df2's key so the two sides of the join condition have distinct
    # column names; the temporary key is dropped again after the join.
    right = df2.withColumnRenamed("userId", "userIdTemp")
    same_user = df1.userId == right.userIdTemp
    joined = df1.join(right, same_user, "left")
    return joined.drop("userIdTemp")

VBox()

In [12]:
# Base feature table: one row per distinct userId, ordered by userId.
df_feature = (
    spark_df_clean
    .select("userId")
    .dropDuplicates()
    .sort("userId")
)

VBox()

In [21]:
# Driver-side Python list of every distinct userId, in sorted order.
sorted_user_rows = (
    spark_df_clean.select("userId").dropDuplicates().sort("userId").collect()
)
user_list = [row["userId"] for row in sorted_user_rows]

VBox()

In [13]:
# gender: distinct (userId, gender) pairs, left-joined onto the feature table.
gender_df = (
    spark_df_clean
    .select("userId", "gender")
    .dropDuplicates()
    .sort("userId")
)
df_feature = features_merge(df_feature, gender_df)

VBox()

In [14]:
# level: distinct (userId, level) pairs, left-joined onto the feature table.
# NOTE(review): if a user appears with more than one distinct level, this
# keeps one row per (userId, level) pair and the left join then yields
# several rows for that user - confirm that is intended.
level_df = (
    spark_df_clean
    .select("userId", "level")
    .dropDuplicates()
    .sort("userId")
)
df_feature = features_merge(df_feature, level_df)

VBox()

In [15]:
# method: distinct (userId, method) pairs, left-joined onto the feature table.
# NOTE(review): if a user appears with more than one distinct method, the
# left join yields several rows for that user - confirm that is intended.
method_df = (
    spark_df_clean
    .select("userId", "method")
    .dropDuplicates()
    .sort("userId")
)
df_feature = features_merge(df_feature, method_df)

VBox()

In [16]:
# location_state: distinct (userId, location_state) pairs, left-joined onto
# the feature table.
location_state_df = (
    spark_df_clean
    .select("userId", "location_state")
    .dropDuplicates()
    .sort("userId")
)
df_feature = features_merge(df_feature, location_state_df)

VBox()

In [17]:
# user agent: distinct (userId, sim_user_agent) pairs, left-joined onto the
# feature table.
agent_df = (
    spark_df_clean
    .select("userId", "sim_user_agent")
    .dropDuplicates()
    .sort("userId")
)
df_feature = features_merge(df_feature, agent_df)

VBox()

In [None]:
# Collect the distinct page types to the driver; each one becomes a
# per-user count feature.
distinct_page_rows = spark_df_clean.select("page").dropDuplicates().collect()
page_list = [row["page"] for row in distinct_page_rows]

VBox()

In [None]:
# Start from one row per distinct userId and add one "count<Page>" column per
# page type (spaces removed from the page name). Because features_merge is a
# left join, users with no events for a page get null in that column.
temp_df_feature = spark_df_clean.select("userId").dropDuplicates().sort("userId")
for page in page_list:
    col_name = "count{}".format(page.replace(" ", ""))
    events_for_page = spark_df_clean.filter(spark_df_clean["page"] == page)
    # Events of this page type per user, with the count column named after
    # the page.
    temp_page_count = (
        events_for_page
        .groupby("userId")
        .count()
        .sort("userId")
        .withColumnRenamed("count", col_name)
    )
    temp_df_feature = features_merge(temp_df_feature, temp_page_count)

In [None]:
# Total number of events per user across all page types, ordered by userId.
page_view_total_count=spark_df_clean.groupby("userId").count().sort('userId')

# NOTE(review): this line looks like an unfinished placeholder. `df_b` is not
# defined anywhere in the visible part of this file, and none of the visible
# code creates an `id` column on temp_df_feature (its columns are userId plus
# the count<Page> columns), so as written it is expected to fail. Presumably
# the intent was to turn the per-page counts into per-user frequencies using
# page_view_total_count - TODO confirm and replace.
frequency_df_feature=temp_df_feature.select('id').subtract(df_b.select('id'))