# Jason Yu - Decision Tree Model  
Analytical Goal: Predict song rating based on genre_id, album_id, artist_id, and genre_level

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import pyspark.sql.functions
from pyspark.sql.functions import rand
from pyspark.ml.classification import DecisionTreeClassifier
from time import time

In [0]:
# Capture the wall-clock start time; the last cell subtracts it to report total runtime.
start = time()

In [0]:
# Connection settings for the MongoDB collection that holds the music data.
# NOTE(review): the credentials are hard-coded in the notebook; consider moving
# them to a secret store or environment variables before sharing it.
database = 'msds697'
collection = 'music'
user_name = 'msds697_project'
password = 'msds697'
address = 'cluster1.ippkl.mongodb.net'
connection_string = f"mongodb+srv://{user_name}:{password}@{address}/{database}.{collection}"
# Reuse the assembled string so the two values cannot drift apart.
uri = connection_string
# Print a redacted form so the password does not end up in the cell output.
# It is built separately (not via str.replace) because the password text also
# occurs inside the user name and the database name.
safe_connection_string = f"mongodb+srv://{user_name}:***@{address}/{database}.{collection}"
print(safe_connection_string)

In [0]:
# Build (or reuse) the Spark session with the MongoDB connector on the classpath.
# Any additional Maven coordinates (e.g. hadoop-aws for S3 access) belong in the
# comma-separated "spark.jars.packages" value; passing a bare coordinate as the
# only argument to .config() just registers it as a config key with no value.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.sql.parquet.binaryAsString", "true")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    # Read and write against the same collection; `uri` is defined in the
    # connection-settings cell.
    .config("spark.mongodb.input.uri", uri)
    .config("spark.mongodb.output.uri", uri)
    .config("spark.network.timeout", "3600s")
    .getOrCreate()
)

In [0]:
# Load the music collection from MongoDB into a DataFrame.
mongo_reader = spark.read.format("mongo")
mongo_reader = mongo_reader.option(
    'uri',
    'mongodb+srv://msds697_project:msds697@cluster1.ippkl.mongodb.net/msds697.music',
)
df = mongo_reader.load(inferSchema='true')

In [0]:
# Keep only the four model inputs and expose the integer-cast rating as `label`.
feature_columns = ['genre_id', 'album_id', 'artist_id', 'genre_level']
label_column = df['rating'].cast(IntegerType()).alias('label')
df = df.select(*feature_columns, label_column).cache()

In [0]:
# Subset for efficiency: keep only the small (.001-weight) side of a seeded
# random split and cache it for the cells that follow.
subset_parts = df.randomSplit([.999, .001], 200)
df = subset_parts[1].cache()

In [0]:
# Print the column names and types of the sampled DataFrame.
df.printSchema()

In [0]:
# Display a single row as a quick sanity check of the selected columns.
df.show(1)

In [0]:
# Split the sampled rows into training (0.8 weight) and validation (0.2 weight)
# sets with a fixed seed, caching each part.
df_sets = df.randomSplit([0.8, 0.2], 200)
df_train, df_valid = (split.cache() for split in df_sets)

In [0]:
# Display one training row to confirm the split produced data.
df_train.show(1)

In [0]:
def oneHotEncodeColumns(train_df, test_df, cols):
    """One-hot encode ``cols`` in both DataFrames with encoders fitted on ``train_df``.

    Each column in ``cols`` is replaced, under its original name, by its
    one-hot vector. The encoder for a column is fitted on the training
    DataFrame only and then applied to both DataFrames, so held-out rows
    neither influence the encoding nor end up in the returned training data.

    Args:
        train_df: DataFrame used to fit every encoder.
        test_df: DataFrame transformed with the encoders fitted on ``train_df``.
        cols: Names of the columns to encode.

    Returns:
        A ``(encoded_train, encoded_test)`` tuple. With an empty ``cols`` the
        two inputs are returned unchanged.
    """
    encoded_train = train_df
    encoded_test = test_df
    for c in cols:
        onehot_col = c + "-onehot"
        ohe = OneHotEncoder(
            inputCol=c,
            outputCol=onehot_col,
            dropLast=False,
            # Values absent from train_df are mapped to an extra slot instead
            # of raising at transform time.
            handleInvalid="keep",
        )
        ohe_model = ohe.fit(encoded_train)

        # Swap the raw column for its encoded version, keeping the column name.
        encoded_train = ohe_model.transform(encoded_train).drop(c).withColumnRenamed(onehot_col, c)
        encoded_test = ohe_model.transform(encoded_test).drop(c).withColumnRenamed(onehot_col, c)

    return encoded_train, encoded_test

# Encode the four categorical inputs in both splits and cache the results.
categorical_col = ['genre_id', 'album_id', 'artist_id', 'genre_level']
dfs = oneHotEncodeColumns(df_train, df_valid, categorical_col)
df_train_hot, df_test_hot = (encoded.cache() for encoded in dfs)

In [0]:
# Display one encoded training row to inspect the one-hot columns.
df_train_hot.show(1)

In [0]:
# Display one encoded validation row to inspect the one-hot columns.
df_test_hot.show(1)

In [0]:
# Pack the encoded columns listed in `categorical_col` into a single `features`
# vector and keep only the two columns the classifier consumes.
va = VectorAssembler(inputCols=categorical_col, outputCol="features")
model_columns = ("features", "label")
df_train_points = va.transform(df_train_hot).select(*model_columns)
df_test_points = va.transform(df_test_hot).select(*model_columns)

In [0]:
# Display one assembled (features, label) row before training.
df_train_points.show(1)

In [0]:
# dt = DecisionTreeClassifier()
# evaluator = MulticlassClassificationEvaluator()
# #ParamGridBuilder() – combinations of parameters and their values.
# paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [5,10,15,20,25,30]).build()
# cv = CrossValidator(estimator=dt, 
#                     evaluator=evaluator, 
#                     numFolds=5, 
#                     estimatorParamMaps=paramGrid)

In [0]:
# Fit a single decision tree on the training points with explicitly chosen
# hyperparameters (the grid search cell is currently disabled).
tree_params = dict(maxDepth=10, maxBins=32, minInstancesPerNode=1, minInfoGain=0)
dt = DecisionTreeClassifier(**tree_params)
dtmodel = dt.fit(df_train_points)

In [0]:
# Score both the training and the validation points with the fitted tree.
train_pred, test_pred = (
    dtmodel.transform(points) for points in (df_train_points, df_test_points)
)


In [0]:
# Compute accuracy and F1 for both prediction sets, overriding the evaluator's
# metric name per call.
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction')
accuracy_metric = {evaluator.metricName: "accuracy"}
f1_metric = {evaluator.metricName: "f1"}

train_acc = evaluator.evaluate(train_pred, accuracy_metric)
test_acc = evaluator.evaluate(test_pred, accuracy_metric)

train_f1 = evaluator.evaluate(train_pred, f1_metric)
test_f1 = evaluator.evaluate(test_pred, f1_metric)


In [0]:
# Report the metrics; the `test_*` variables hold the scores for the validation split.
print(f"Train set accuracy: {train_acc}. F1 score: {train_f1}")
print(f"Validation set accuracy: {test_acc}. F1 score: {test_f1}")

In [0]:
# Stop the clock and print the elapsed wall time as HH:MM:SS.ss.
end = time()
elapsed = end - start
hours, rem = divmod(elapsed, 3600)
minutes, seconds = divmod(rem, 60)
elapsed_text = "{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)
print(elapsed_text)