In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('DT').getOrCreate()


In [2]:
df = spark.read.format("csv").option("sep",",").option("inferSchema","true").option("header","true").load("test2_score.txt")
df.printSchema()

root
 |-- Userid: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- trackid: integer (nullable = true)
 |-- albumscore: integer (nullable = true)
 |-- artistscore: integer (nullable = true)
 |-- genreamax: integer (nullable = true)
 |-- genreamin: integer (nullable = true)
 |-- genreamean: double (nullable = true)



In [3]:
import pandas as pd
pd.DataFrame(df.take(5), columns=df.columns).transpose()
numeric_features = ['Userid','trackid','label','albumscore','artistscore','genreamax','genreamin','genreamean']

df.select(numeric_features).describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Userid,6000,206286.491,3515.749299951101,200031,212234
trackid,6000,149260.0025,86146.45891885243,65,296098
label,6000,0.5,0.5000416718757232,0,1
albumscore,6000,0.0,0.0,0,0
artistscore,6000,23.488666666666667,38.63689523198974,0,100
genreamax,6000,48.80733333333333,43.19745227289881,0,100
genreamin,6000,42.272333333333336,40.789456944842996,0,100
genreamean,6000,45.58248076923077,41.17219269010034,0.0,100.0


In [4]:
# import matplotlib.pyplot as plt
# numeric_data = df.select(numeric_features).toPandas()

# axs = pd.plotting.scatter_matrix(numeric_data, figsize=(8, 8));
# n = len(numeric_data.columns)
# for i in range(n):
#     v = axs[i, 0]
#     v.yaxis.label.set_rotation(0)
#     v.yaxis.label.set_ha("right")
#     v.set_yticks(())
#     h = axs[n-1, i]
#     h.xaxis.label.set_rotation(90)
#     h.set_xticks(())
# plt.show()


In [5]:
from pyspark.ml.feature import VectorAssembler
import pyspark.sql.types as types
df=df.withColumn('label',df['label'].cast(types.IntegerType()))
df=df.withColumn('Userid',df['Userid'].cast(types.IntegerType()))
df=df.withColumn('trackid',df['trackid'].cast(types.IntegerType()))
df=df.withColumn('albumscore',df['albumscore'].cast(types.IntegerType()))
df=df.withColumn('artistscore',df['artistscore'].cast(types.IntegerType()))
df=df.withColumn('genreamax',df['genreamax'].cast(types.IntegerType()))
df=df.withColumn('genreamin',df['genreamin'].cast(types.IntegerType()))
df=df.withColumn('genreamean',df['genreamean'].cast(types.IntegerType()))

assembler=VectorAssembler(inputCols=['Userid','trackid','label','artistscore','genreamax','genreamin','genreamean'],outputCol='features')


In [6]:
from pyspark.ml import Pipeline
pipeline=Pipeline(stages=[assembler])
model=pipeline.fit(df)
df=model.transform(df)


In [7]:
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(featuresCol = 'features',maxIter=10)
gbtModel = gbt.fit(df)

In [8]:
df1 = spark.read.format("csv").option("sep",",").option("inferSchema","true").option("header","true").load("test_02 copy.txt")
df1.printSchema()

root
 |-- Userid: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- trackid: integer (nullable = true)
 |-- albumscore: integer (nullable = true)
 |-- artistscore: integer (nullable = true)
 |-- genreamax: integer (nullable = true)
 |-- genreamin: integer (nullable = true)
 |-- genreamean: double (nullable = true)



In [9]:
import pandas as pd
pd.DataFrame(df1.take(5), columns=df1.columns).transpose()
numeric_features = ['Userid','trackid','label','albumscore','artistscore','genreamax','genreamin','genreamean']

df1.select(numeric_features).describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Userid,120000,224372.7714,14155.612277030812,199810,249010
trackid,120000,147871.92235833334,85421.1993529344,1,296099
label,120000,0.0,0.0,0,0
albumscore,120000,23.685075,38.767846279190096,0,100
artistscore,120000,35.016175,42.73300263592536,0,100
genreamax,120000,35.471716666666666,40.80419698479321,0,100
genreamin,120000,32.74715833333333,39.44356509895082,0,100
genreamean,120000,34.149823609307354,39.70187105524825,0.0,100.0


In [10]:
from pyspark.ml.feature import VectorAssembler
import pyspark.sql.types as types
df1=df1.withColumn('label',df1['label'].cast(types.IntegerType()))
df1=df1.withColumn('Userid',df1['Userid'].cast(types.IntegerType()))
df1=df1.withColumn('trackid',df1['trackid'].cast(types.IntegerType()))
df1=df1.withColumn('albumscore',df1['albumscore'].cast(types.IntegerType()))
df1=df1.withColumn('artistscore',df1['artistscore'].cast(types.IntegerType()))
df1=df1.withColumn('genreamax',df1['genreamax'].cast(types.IntegerType()))
df1=df1.withColumn('genreamin',df1['genreamin'].cast(types.IntegerType()))
df1=df1.withColumn('genreamean',df1['genreamean'].cast(types.IntegerType()))

assembler1=VectorAssembler(inputCols=['Userid','trackid','label','artistscore','genreamax','genreamin','genreamean'],outputCol='features')


In [11]:
from pyspark.ml import Pipeline
pipeline=Pipeline(stages=[assembler1])
model1=pipeline.fit(df1)
df1=model1.transform(df1)

In [12]:
predictions1 = gbtModel.transform(df1)
predictions1.show(10)

+------+-----+-------+----------+-----------+---------+---------+----------+--------------------+--------------------+--------------------+----------+
|Userid|label|trackid|albumscore|artistscore|genreamax|genreamin|genreamean|            features|       rawPrediction|         probability|prediction|
+------+-----+-------+----------+-----------+---------+---------+----------+--------------------+--------------------+--------------------+----------+
|199810|    0| 208019|         0|          0|        0|        0|         0|(7,[0,1],[199810....|[1.32590267922033...|[0.93412217565278...|       0.0|
|199810|    0|  74139|         0|          0|       80|       80|        80|[199810.0,74139.0...|[1.32590267922033...|[0.93412217565278...|       0.0|
|199810|    0|   9903|         0|          0|        0|        0|         0|(7,[0,1],[199810....|[1.32590267922033...|[0.93412217565278...|       0.0|
|199810|    0| 242681|         0|          0|        0|        0|         0|(7,[0,1],[199810..

In [13]:
predictions1.count()

120000

In [14]:
pre=predictions1.rdd.map(lambda x: x.prediction).collect()

In [15]:
prob=predictions1.rdd.map(lambda x: x.probability).collect()

In [16]:
user=predictions1.rdd.map(lambda x: x.Userid).collect()

In [17]:
track=predictions1.rdd.map(lambda x: x.trackid).collect()

In [18]:
track

[208019,
 74139,
 9903,
 242681,
 18515,
 105760,
 276940,
 142408,
 130023,
 29189,
 223706,
 211361,
 188441,
 20968,
 21571,
 79640,
 184173,
 111874,
 122375,
 189043,
 122429,
 52519,
 232332,
 262193,
 64345,
 118841,
 275682,
 30062,
 258473,
 129866,
 274758,
 102153,
 183464,
 23616,
 81699,
 46627,
 83722,
 200166,
 148603,
 88745,
 144775,
 222966,
 141875,
 290905,
 59789,
 283698,
 106788,
 235999,
 202308,
 36793,
 247549,
 214280,
 123424,
 193003,
 235240,
 292341,
 126082,
 142163,
 20067,
 286335,
 231569,
 51933,
 16949,
 34642,
 229894,
 42811,
 47402,
 21317,
 185734,
 92685,
 110099,
 33495,
 21593,
 132124,
 170625,
 30365,
 143570,
 254625,
 281826,
 253265,
 48891,
 33980,
 207963,
 126528,
 48808,
 223958,
 179298,
 5121,
 31975,
 38775,
 92599,
 75369,
 184197,
 183841,
 197936,
 138391,
 36027,
 29249,
 221237,
 123579,
 19694,
 255036,
 6143,
 24607,
 95726,
 46697,
 256209,
 266780,
 7477,
 266143,
 291820,
 196470,
 263222,
 225097,
 77700,
 138272,
 1402

In [19]:
output = []
for i in range(120000):
    output.append([user[i],track[i],prob[i]])

In [20]:
output

[[199810, 208019, DenseVector([0.9341, 0.0659])],
 [199810, 74139, DenseVector([0.9341, 0.0659])],
 [199810, 9903, DenseVector([0.9341, 0.0659])],
 [199810, 242681, DenseVector([0.9341, 0.0659])],
 [199810, 18515, DenseVector([0.9341, 0.0659])],
 [199810, 105760, DenseVector([0.9341, 0.0659])],
 [199812, 276940, DenseVector([0.9341, 0.0659])],
 [199812, 142408, DenseVector([0.9341, 0.0659])],
 [199812, 130023, DenseVector([0.9341, 0.0659])],
 [199812, 29189, DenseVector([0.9341, 0.0659])],
 [199812, 223706, DenseVector([0.9341, 0.0659])],
 [199812, 211361, DenseVector([0.9341, 0.0659])],
 [199813, 188441, DenseVector([0.9341, 0.0659])],
 [199813, 20968, DenseVector([0.9341, 0.0659])],
 [199813, 21571, DenseVector([0.9341, 0.0659])],
 [199813, 79640, DenseVector([0.9341, 0.0659])],
 [199813, 184173, DenseVector([0.9341, 0.0659])],
 [199813, 111874, DenseVector([0.9341, 0.0659])],
 [199814, 122375, DenseVector([0.9341, 0.0659])],
 [199814, 189043, DenseVector([0.9341, 0.0659])],
 [199814

In [24]:
import csv
with open("GBTC.csv", "w") as output_file:
    writer = csv.writer(output_file)
    writer.writerow(["TrackID", "Predictor"])
    for line in output:
        if line[2][0] == 0.9341221756527832:
            item = [str(line[0]) + "_" + str(line[1]), 0]
        else:
            item = [str(line[0]) + "_" + str(line[1]), 1]
        writer.writerow(item)
            

In [27]:
file = open("GBTC.txt", "w")
for line in output:
    file.write(str(line[0])+","+str(line[1])+","+str(line[2][0])+"\n")

In [25]:
output

[[199810, 208019, DenseVector([0.9341, 0.0659])],
 [199810, 74139, DenseVector([0.9341, 0.0659])],
 [199810, 9903, DenseVector([0.9341, 0.0659])],
 [199810, 242681, DenseVector([0.9341, 0.0659])],
 [199810, 18515, DenseVector([0.9341, 0.0659])],
 [199810, 105760, DenseVector([0.9341, 0.0659])],
 [199812, 276940, DenseVector([0.9341, 0.0659])],
 [199812, 142408, DenseVector([0.9341, 0.0659])],
 [199812, 130023, DenseVector([0.9341, 0.0659])],
 [199812, 29189, DenseVector([0.9341, 0.0659])],
 [199812, 223706, DenseVector([0.9341, 0.0659])],
 [199812, 211361, DenseVector([0.9341, 0.0659])],
 [199813, 188441, DenseVector([0.9341, 0.0659])],
 [199813, 20968, DenseVector([0.9341, 0.0659])],
 [199813, 21571, DenseVector([0.9341, 0.0659])],
 [199813, 79640, DenseVector([0.9341, 0.0659])],
 [199813, 184173, DenseVector([0.9341, 0.0659])],
 [199813, 111874, DenseVector([0.9341, 0.0659])],
 [199814, 122375, DenseVector([0.9341, 0.0659])],
 [199814, 189043, DenseVector([0.9341, 0.0659])],
 [199814

# 