In [6]:
import numpy as np
import csv
from collections import defaultdict

# Nested lookup tables of the form table[userID][itemID] -> rating.
# truth_data is only created here; nothing in this cell populates it.
truth_data = defaultdict(dict)
train_data = defaultdict(dict)


# Load the pipe-delimited training file (userID|itemID|rating) into train_data.
# IDs are kept as the strings read from the file; ratings are converted to int.
with open("../data/trainIdx2_matrix.txt", 'r') as fTrain:
    # Skip the header row; the default keeps an empty file from raising StopIteration.
    next(fTrain, None)
    for line in fTrain:
        line = line.strip()
        if not line:
            # Blank (e.g. trailing) lines carry no record and would break the unpack below.
            continue
        trainUserID, trainItemID, trainRating = line.split('|')
        train_data[trainUserID][trainItemID] = int(trainRating)

# Invert train_data (user -> item -> rating) into item -> list of every rating
# that item received across all users.
track_rating_data = defaultdict(list)
for ratings_by_item in train_data.values():
    for item_id, score in ratings_by_item.items():
        track_rating_data[item_id].append(score)

# Aggregations applied to each item's ratings, in the order the keys are stored.
_STAT_FUNCS = (
    ('min', np.min),
    ('max', np.max),
    ('mean', np.mean),
    ('variance', np.var),   # np.var default, i.e. population variance (ddof=0)
    ('median', np.median),
)

# track_stats[itemID] -> {'min', 'max', 'mean', 'variance', 'median'} of its ratings.
track_stats = {}
for item_id, scores in track_rating_data.items():
    scores_arr = np.array(scores)
    track_stats[item_id] = {label: func(scores_arr) for label, func in _STAT_FUNCS}

# Build ../data/proccessed/test.csv: one feature row per (user, track) line of the
# test hierarchy file. Rows are buffered per user and written once a full group
# has been collected; a user whose run ends before the group is full is not written.
#
# Columns written per row:
#   AlbumRating / ArtistRating - the user's training rating stored under the row's
#                                album / artist ID, or 0 when the user has none
#   TotalScore                 - AlbumRating + ArtistRating
#   Min/Max/Mean/Variance/Median - track_stats entry for the track ID, 0 when absent
ROWS_PER_USER = 6  # number of buffered rows that triggers a flush

with open("../data/testTrack_hierarchy.txt", 'r') as fTest, open("../data/proccessed/test.csv", 'w', newline='') as fOut:
    csv_writer = csv.writer(fOut)
    csv_writer.writerow(["UserId", "TrackId", "AlbumId", "ArtistId", "AlbumRating", "ArtistRating", "TotalScore",
                         "MinRating", "MaxRating", "MeanRating", "Variance", "MedianRating"])

    trackID_vec = [0] * ROWS_PER_USER
    albumID_vec = [0] * ROWS_PER_USER
    artistID_vec = [0] * ROWS_PER_USER
    lastUserID = -1  # never equals a (string) user ID, so the first row starts a new group
    total_sum = 0
    total_count = 0

    for line in fTest:
        line = line.strip()
        if not line:
            # Blank lines carry no record and would break the unpack below.
            continue
        # Only the first four fields are used; any further fields are ignored.
        userID, trackID, albumID, artistID, *_ = line.split('|')

        if userID != lastUserID:
            # New user: restart the buffer position and the rating scratch array.
            ii = 0
            user_rating_inTrain = np.zeros(shape=(ROWS_PER_USER, 2))

        trackID_vec[ii] = trackID
        albumID_vec[ii] = albumID
        artistID_vec[ii] = artistID
        ii += 1
        lastUserID = userID

        if ii == ROWS_PER_USER:
            # .get() instead of train_data[userID]: indexing the defaultdict would
            # insert an empty entry for every test user missing from the training data.
            user_train_data = train_data.get(userID, {})

            for nn in range(ROWS_PER_USER):
                user_rating_inTrain[nn] = [
                    user_train_data.get(albumID_vec[nn], 0),
                    user_train_data.get(artistID_vec[nn], 0)]

            for nn in range(ROWS_PER_USER):
                total_score = sum(user_rating_inTrain[nn])
                total_sum += total_score
                total_count += 1
                track_stat = track_stats.get(trackID_vec[nn], {})

                csv_writer.writerow([userID, trackID_vec[nn], albumID_vec[nn],
                                     artistID_vec[nn],
                                     *user_rating_inTrain[nn], total_score,
                                     track_stat.get('min', 0), track_stat.get('max', 0),
                                     track_stat.get('mean', 0), track_stat.get('variance', 0),
                                     track_stat.get('median', 0)])

            # Restart the buffer so further rows for the same user cannot index
            # past the end of the fixed-size vectors.
            ii = 0

print("Total:" + str(total_sum))
print("Count: " + str(total_count))
# Guard the average: no rows are written when the input never fills a group.
average_score = total_sum / total_count if total_count else float('nan')
print("Average: " + str(average_score))


Total:7044150.0
Count: 120000
Average: 58.70125


In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, auc
from pyspark.sql.functions import split, col, avg


# Start the notebook's Spark session, or reuse one that already exists.
session_builder = SparkSession.builder.appName('recommend-ML')
spark = session_builder.getOrCreate()

# Load the processed test-feature CSV; the first row supplies the column
# names and Spark infers each column's type from the data.
data_dir = "../data/proccessed/test.csv"
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv(data_dir)

# Preview the first five rows.
df.show(5)




+------+-------+-------+--------+-----------+------------+----------+---------+---------+------------------+------------------+------------+
|UserId|TrackId|AlbumId|ArtistId|AlbumRating|ArtistRating|TotalScore|MinRating|MaxRating|        MeanRating|          Variance|MedianRating|
+------+-------+-------+--------+-----------+------------+----------+---------+---------+------------------+------------------+------------+
|199810| 208019| 209288|    None|        0.0|         0.0|       0.0|        0|      100|49.766129032258064|1349.9533688865763|        50.0|
|199810|  74139| 277282|  271146|        0.0|         0.0|       0.0|       50|       90| 78.33333333333333|297.22222222222223|        90.0|
|199810|   9903|   None|    None|        0.0|         0.0|       0.0|        0|      100|52.858823529411765|1339.4977162629757|        50.0|
|199810| 242681| 190640|  244574|        0.0|         0.0|       0.0|        0|      100| 49.50834597875569|1692.7537239713458|        50.0|
|199810|  185