# Jaysen Shi - Logistic Regression

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# MongoDB Atlas connection settings for the collection used in this notebook.
# NOTE(review): the credentials are hard-coded in the notebook source; consider
# moving them to a secret store or environment variables.
database = 'msds697'
collection = 'music'
user_name = 'msds697_project'
password = 'msds697'
address = 'cluster1.ippkl.mongodb.net'
connection_string = f"mongodb+srv://{user_name}:{password}@{address}/{database}.{collection}"
# Single source of truth: `uri` is the assembled string rather than a second
# hand-typed literal that could silently drift from the parts above.
uri = connection_string
# Echo the target for a quick sanity check, but keep the password out of the
# saved notebook output.
print(f"mongodb+srv://{user_name}:****@{address}/{database}.{collection}")

In [0]:
# Build (or reuse) the Spark session used by the rest of the notebook.
#
# `spark.jars.packages` takes a comma-separated list of Maven coordinates.
# The hadoop-aws coordinate used to be passed on its own as a config *key*
# with no value, which configures nothing; it is listed here so the package
# is actually requested.
spark_packages = ",".join([
    "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1",
    "org.apache.hadoop:hadoop-aws:3.3.1",
])

# The input and output URIs both point at the connection string in `uri`
# (defined in the connection-settings cell) instead of repeating the literal.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.sql.parquet.binaryAsString", "true")
    .config("spark.jars.packages", spark_packages)
    .config("spark.mongodb.input.uri", uri)
    .config("spark.mongodb.output.uri", uri)
    .config("spark.network.timeout", "3600s")
    .getOrCreate()
)

In [0]:
# Load the collection through the Mongo Spark connector. The connection
# string comes from `uri` (defined in the connection-settings cell) so the
# credentials are not repeated in yet another literal.
df = spark.read.format("mongo").option("uri", uri).load()

In [0]:
# Print the schema of the loaded DataFrame to check the available columns.
df.printSchema()

In [0]:
# Work on a small slice of the data for speed: split with weights
# 0.999 / 0.001 (fixed seed) and keep only the second, smaller part.
df = df.randomSplit([.999, .001], seed=200)[1]
# Cache the slice because the following cells reuse it.
df = df.cache()

In [0]:
# Columns that get one-hot encoded further down.
features = [
    'genre_id',
    'genre_level',
    'artist_id',
    'album_id',
]

In [0]:
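# Optional sanity check (left disabled): number of distinct values per feature column.
# The loop variable is `c` rather than `col` so that enabling it does not
# shadow `col` from the `pyspark.sql.functions` star import.
#for c in features:
#    print(df.select(c).distinct().count())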

In [0]:
# One-hot encode every feature column with a single multi-column encoder,
# so the data is scanned once for fitting instead of once per column.
# NOTE(review): OneHotEncoder treats its input values as category indices;
# this assumes the id/level columns are already non-negative numerics -- confirm.
onehot_cols = [name + "-onehot" for name in features]
ohe = OneHotEncoder(inputCols=features, outputCols=onehot_cols, dropLast=False)
ohe_model = ohe.fit(df)

# Replace each raw column with its encoded vector under the original name.
df = ohe_model.transform(df).drop(*features)
for c, encoded in zip(features, onehot_cols):
    df = df.withColumnRenamed(encoded, c)

In [0]:
# Keep the encoded feature columns plus the target. The column names come
# from `features` (defined earlier) so this cell cannot drift from the
# encoding cell; `label` stays last, which the assembler cell relies on.
# NOTE(review): `rating` is cast to integer to serve as the class label;
# this assumes ratings are integral -- confirm.
label_col = df['rating'].cast(IntegerType()).alias('label')
df = df.select(*features, label_col).cache()
df.show(5)

In [0]:
# Every column except the last one (the label) is a model input.
input_cols = df.columns[:-1]
va = VectorAssembler(inputCols=input_cols, outputCol="features")

# Assemble the inputs into one vector column and keep only what the
# estimator needs.
assembled = va.transform(df)
feature_df = assembled.select("features", "label")
feature_df.show(5)

In [0]:
# 80/20 train/test split with a fixed seed.
train, test = feature_df.randomSplit(weights=[0.8, 0.2], seed=200)
# Mark both parts for caching; they are reused by every fit/evaluate cell.
for split in (train, test):
    split.cache()

In [0]:
# Estimator whose regularisation strength is tuned below.
lr = LogisticRegression()

# Accuracy evaluator over the `label` / `prediction` columns; later cells
# reuse it (and override the metric for f1).
metric_name = "accuracy"
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName=metric_name,
)

# Candidate regParam values. They are written as floats throughout because
# regParam is a double-valued param; a bare int `0` in the grid can fail to
# convert on the JVM side in some PySpark versions.
paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.0, 0.01, 0.1]).build()

# 5-fold cross-validation over the grid. The seed matches the one used for
# the data splits so that fold assignment is reproducible between runs.
cv = CrossValidator(
    estimator=lr,
    evaluator=evaluator,
    numFolds=5,
    estimatorParamMaps=paramGrid,
    seed=200,
)

In [0]:
# Run the cross-validated grid search on the training split.
# (A plain single-model fit, `LogisticRegression().fit(train)`, was the
# earlier alternative here.)
cvmodel = cv.fit(train)

In [0]:
# Regularisation parameter of the model that cross-validation picked as best.
cvmodel.bestModel.getRegParam()

In [0]:
# Score both splits with the model chosen by cross-validation.
best_model = cvmodel.bestModel
train_pred = best_model.transform(train)
test_pred = best_model.transform(test)

# Evaluate with the shared evaluator, then report.
train_score = evaluator.evaluate(train_pred)
test_score = evaluator.evaluate(test_pred)
print("Train accuracy: %s" % train_score)
print("Test accuracy: %s" % test_score)

In [0]:
# Reference run: logistic regression with default hyper-parameters,
# fitted directly on the training split (no cross-validation).
lr = LogisticRegression()
lr_model = lr.fit(train)
train_pred, test_pred = lr_model.transform(train), lr_model.transform(test)

train_score = evaluator.evaluate(train_pred)
test_score = evaluator.evaluate(test_pred)
print("Train accuracy: %s" % train_score)
print("Test accuracy: %s" % test_score)

In [0]:
# Single fit with regParam=0.01, scored on both splits.
lr = LogisticRegression(regParam=0.01)
lr_model = lr.fit(train)
train_pred, test_pred = lr_model.transform(train), lr_model.transform(test)

train_score = evaluator.evaluate(train_pred)
test_score = evaluator.evaluate(test_pred)
print("Train accuracy: %s" % train_score)
print("Test accuracy: %s" % test_score)

In [0]:
# Single fit with regParam=0.1, scored on both splits.
lr = LogisticRegression(regParam=0.1)
lr_model = lr.fit(train)
train_pred, test_pred = lr_model.transform(train), lr_model.transform(test)

train_score = evaluator.evaluate(train_pred)
test_score = evaluator.evaluate(test_pred)
print("Train accuracy: %s" % train_score)
print("Test accuracy: %s" % test_score)

In [0]:
# Report f1 by overriding the evaluator's metric for these calls only.
# NOTE(review): `train_pred` / `test_pred` hold whatever the most recently
# run cell produced (in file order, the regParam=0.1 fit), which is not
# necessarily `cvmodel.bestModel` -- confirm this is the intended model.
f1_override = {evaluator.metricName: "f1"}
train_f1 = evaluator.evaluate(train_pred, f1_override)
test_f1 = evaluator.evaluate(test_pred, f1_override)
print("Train f1: %s" % train_f1)
print("Test f1: %s" % test_f1)