In [2]:
from pyspark.sql import SparkSession 
# Start (or reuse) the Spark session used throughout this analysis.
spark = (
    SparkSession.builder
    .appName('Analise Musicas por genero')
    .getOrCreate()
)

# Load the music-by-genre CSV: the first row is the header and the column
# types are inferred from the data.
df = spark.read.csv(
    'musicas_genero.csv',
    header=True,
    inferSchema=True,
    sep=',',
)

# Preview the loaded rows.
df.show()

+----+--------------------+-------------------+-------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+------------------+---+
|mode|              genres|       acousticness|       danceability|       duration_ms|             energy|    instrumentalness|           liveness|           loudness|         speechiness|             tempo|            valence|        popularity|key|
+----+--------------------+-------------------+-------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+------------------+---+
|   1|21st century clas...| 0.9793333333333332|0.16288333333333335|160297.66666666663|0.07131666666666665|          0.60683367|             0.3616|-31.514333333333337| 0.04056666666666667|           75.3365|0.10378333333333334| 27.83333333333333| 

In [3]:
# Number of distinct genre labels in the dataset.
df.select('genres').dropDuplicates().count()

2973

In [4]:
# Total number of rows, for comparison with the number of distinct genres.
df.count()

2973

In [3]:
from pyspark.ml.feature import VectorAssembler
# Feature columns: every column of the frame except the 'genres' label.
# list.remove raises ValueError if the label column is missing.
X = list(df.columns)
X.remove('genres')

# Pack the feature columns into a single vector column named 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=X)
dados_por_genero_vector = assembler.transform(df)


In [4]:
# Inspect the first assembled feature vector without truncating it.
dados_por_genero_vector.select('features').show(n=1, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                                                                       |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[1.0,0.9793333333333332,0.16288333333333335,160297.66666666663,0.07131666666666665,0.60683367,0.3616,-31.514333333333337,0.04056666666666667,75.3365,0.10378333333333334,27.83333333333333,6.0]|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 1 row



In [10]:
from pyspark.ml.feature import StandardScaler
# Standardize the assembled vectors so that no single feature (e.g. duration_ms)
# dominates distance-based steps. No withMean/withStd arguments are passed,
# so the scaler runs with its library defaults.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_features',
)

# Fit the scaling statistics on the full dataset, then add 'scaled_features'.
fited = scaler.fit(dados_por_genero_vector)
transformed = fited.transform(dados_por_genero_vector)


In [12]:
# Inspect the first two standardized feature vectors in full.
transformed.select('scaled_features').show(n=2, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|scaled_features                                                                                                                                                                                                                                      |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[2.6817483100027903,3.06270998538775,1.081076545642506,1.6934607098636496,0.30413995283161455,2.2699850636894197,3.9152994181718874,-5.869463538494235,0.5040405399674039,4.312535913661122,0.5142373856537806,1.6618182828525256,1.7814144999826311]|
|[2.6817

In [14]:
from pyspark.ml.feature import PCA
from pyspark.ml import Pipeline
# Assemble -> standardize -> 2-component PCA, expressed as a single Pipeline
# that starts from the raw frame and the feature column list X.
pca_stages = [
    VectorAssembler(inputCols=X, outputCol='features'),
    StandardScaler(inputCol='features', outputCol='scaled_features'),
    PCA(k=2, inputCol='scaled_features', outputCol='pca_features'),
]
pca_pipeline = Pipeline(stages=pca_stages)

# NOTE: despite the "_df" suffix, this is the fitted PipelineModel, not a DataFrame.
pca_pipeline_df = pca_pipeline.fit(df)

PipelineModel_b7e73bce75ba

In [17]:
from pyspark.ml.clustering import KMeans
# Cluster the standardized genre vectors into 5 groups.
# A fixed seed keeps the cluster assignments reproducible across kernel
# restarts; without it the initial centroids (and therefore the labels) can
# differ from run to run.
kmeans = KMeans(k=5, featuresCol='scaled_features', seed=42)

# Drop any 'prediction' column left over from a previous execution of this
# cell. Spark refuses to fit/transform when the output column already exists,
# so without this the cell fails when re-run. drop() is a no-op if the column
# is absent.
kmeans_input = transformed.drop('prediction')

fited = kmeans.fit(kmeans_input)
# Adds the integer 'prediction' column holding each genre's cluster id.
transformed = fited.transform(kmeans_input)

In [22]:
# Show the cluster assigned to the first ten genres.
transformed.select('prediction', 'genres').show(n=10)

+----------+--------------------+
|prediction|              genres|
+----------+--------------------+
|         1|21st century clas...|
|         1|               432hz|
|         3|               8-bit|
|         2|                  []|
|         2|          a cappella|
|         1|            abstract|
|         3|      abstract beats|
|         3|    abstract hip hop|
|         2|           accordeon|
|         2|           accordion|
+----------+--------------------+
only showing top 10 rows



In [23]:
# Project the standardized features onto their first two principal components
# so the clusters can be plotted in 2D.
pca = PCA(k=2, inputCol='scaled_features', outputCol='pca_features')

# Drop any 'pca_features' column left over from a previous execution of this
# cell. Spark rejects an already-existing output column, so without this the
# cell fails when re-run. drop() is a no-op if the column is absent.
pca_input = transformed.drop('pca_features')

fited = pca.fit(pca_input)
transformed = fited.transform(pca_input)

# Preview the first two projected vectors.
transformed.select('pca_features').show(2)

+--------------------+
|        pca_features|
+--------------------+
|[-2.5070953668885...|
|[0.59696790566334...|
+--------------------+
only showing top 2 rows



In [26]:
from pyspark.ml.functions import vector_to_array
# Convert the PCA vector into a plain array column expression once, then
# expose its two components as scalar 'x' and 'y' columns for plotting.
pca_array = vector_to_array('pca_features')
pca_features_xy = (
    transformed
    .withColumn('x', pca_array[0])
    .withColumn('y', pca_array[1])
)

# Preview the 2D coordinates next to each genre and its cluster.
pca_features_xy.select('x', 'y', 'genres', 'prediction').show(10)

+------------------+--------------------+--------------------+----------+
|                 x|                   y|              genres|prediction|
+------------------+--------------------+--------------------+----------+
|-2.507095366888566|-0.43816913737698426|21st century clas...|         1|
|0.5969679056633481| -4.9816120527513545|               432hz|         1|
| 4.158460276223561|  0.8366525081079923|               8-bit|         3|
|2.3873448785122164|  0.4877989015663364|                  []|         2|
|2.6501218371679087|  0.5756819768820428|          a cappella|         2|
|1.4965091203367626| -1.8644183183717817|            abstract|         1|
|3.9235207721573238| -0.2851835002352867|      abstract beats|         3|
|4.6110111098311135|  0.6783790472312343|    abstract hip hop|         3|
|2.8376900630842297|  0.5712993716580531|           accordeon|         2|
|2.7066901398927827|  1.2593788079708268|           accordion|         2|
+------------------+------------------

In [28]:
import plotly.express as px 
import plotly.io as pio

# Choose the renderer BEFORE displaying the figure: fig.show() uses whatever
# default renderer is configured at call time, so setting it afterwards has no
# effect on this plot.
pio.renderers.default = 'iframe'

# Collect only the columns the plot needs; the Spark frame also carries the
# feature vector columns, which are costly to convert to pandas and unused here.
plot_data = pca_features_xy.select('x', 'y', 'genres', 'prediction').toPandas()

# 2D PCA scatter of the genres, coloured by their K-Means cluster.
fig = px.scatter(
    plot_data,
    x='x',
    y='y',
    color='prediction',
    hover_data=['x', 'y', 'genres'],
)
fig.show()
