# Import all the libraries I will need

In [159]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, translate, concat_ws, udf
from pyspark.sql.types import FloatType

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import PCA

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.functions import vector_to_array

import plotly.express as px

from scipy.spatial.distance import euclidean

## Create Spark Session object

In [160]:
# Build (or reuse) the Spark session used by every cell below: local master
# using all available cores, with the Spark UI bound to port 4050.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Iniciando com Spark')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

In [161]:
def read_file(
    file_location:str,
    infer_schema = "true",
    first_row_is_header = "true",
    delimiter = "|",
    file_type = 'csv'
    ):
    """Load a file into a Spark DataFrame using the module-level ``spark`` session.

    Args:
        file_location: Path of the file to load.
        infer_schema: Value passed to the reader's ``inferSchema`` option.
        first_row_is_header: Value passed to the reader's ``header`` option.
        delimiter: Value passed to the reader's ``sep`` option.
        file_type: Data source format handed to ``spark.read.format``.

    Returns:
        The DataFrame produced by the configured reader.
    """
    reader_options = {
        'inferSchema': infer_schema,
        'header': first_row_is_header,
        'sep': delimiter,
    }

    reader = spark.read.format(file_type)
    for option_name, option_value in reader_options.items():
        reader = reader.option(option_name, option_value)

    return reader.load(file_location)

In [162]:
def alter_type_column(
        df,
        column_name:str,
        new_type:str,
    ):
    """Return ``df`` with ``column_name`` replaced by itself cast to ``new_type``.

    Args:
        df: DataFrame holding the column.
        column_name: Name of the column to cast; the result keeps this name.
        new_type: Target type, passed as-is to the column's ``cast``.

    Returns:
        The DataFrame returned by ``withColumn``; ``df`` itself is not modified
        by this function.
    """
    casted_column = df[column_name].cast(new_type)
    return df.withColumn(column_name, casted_column)

In [163]:
def remove_string_rows(
        df,
        cols:list,
        type_of_col:str
):
    """Keep only the rows whose ``cols`` values are non-null after a cast.

    For every column in ``cols`` the value is cast to ``type_of_col`` and the
    row is kept only when the cast result is not null. Values that are already
    null are therefore dropped as well (presumably together with strings that
    cannot be parsed as ``type_of_col`` -- this relies on the cast yielding
    null for them).

    The stored column values are left untouched: the cast is used only as the
    filter predicate, so no helper column is added to (or removed from) ``df``.

    Args:
        df: DataFrame to filter.
        cols: Names of the columns to check; each entry is converted with ``str``.
        type_of_col: Type name passed to the column's ``cast``.

    Returns:
        The filtered DataFrame.
    """
    for column_name in cols:
        # Filter on the expression itself instead of materialising a temporary
        # "isNumber" column, which would overwrite and then drop a real column
        # that happened to carry that name.
        df = df.filter(col(str(column_name)).cast(type_of_col).isNotNull())

    return df

## The function below creates the clusters and records each row's cluster in a column

In [164]:
def project_clusters(
    df_data_sp,
    columns_not_object:list,
    k_pca=2,
    SEED=1224,
    k_kmeans=20
    ):
    """Project the selected columns with PCA and cluster the projection with KMeans.

    The columns are assembled into a ``features`` vector, scaled with
    ``StandardScaler`` into ``features_scaled`` and reduced with PCA into
    ``pca_features``. KMeans is then fitted on ``pca_features`` and its
    prediction is stored in ``cluster_pca``.

    Args:
        df_data_sp: DataFrame containing every column named in
            ``columns_not_object``.
        columns_not_object: Names of the input columns for the feature vector.
        k_pca: Number of principal components kept by PCA.
        SEED: Seed passed to KMeans.
        k_kmeans: Number of clusters fitted by KMeans. Defaults to 20, the
            value that was previously hard-coded.

    Returns:
        The transformed DataFrame with the pipeline columns, ``cluster_pca``
        and two extra columns ``x`` and ``y`` holding the first and second
        entries of ``pca_features``.
        NOTE(review): ``y`` reads index 1, so this presumably expects
        ``k_pca >= 2`` -- confirm before calling with a smaller value.
    """
    pca_pipeline = Pipeline(
        stages = [
            VectorAssembler(inputCols=columns_not_object, outputCol='features'),
            StandardScaler(inputCol='features', outputCol='features_scaled'),
            PCA(k=k_pca, inputCol='features_scaled', outputCol='pca_features')
            ]
        )

    model_pca_pipeline = pca_pipeline.fit(df_data_sp)
    projection = model_pca_pipeline.transform(df_data_sp)

    kmeans = KMeans(
        k=k_kmeans,
        featuresCol='pca_features',
        predictionCol='cluster_pca',
        seed=SEED
        )
    model_kmeans = kmeans.fit(projection)
    projection_kmeans = model_kmeans.transform(projection)

    # Build the array expression once and index it for both coordinates.
    pca_array = vector_to_array('pca_features')
    projection_kmeans = projection_kmeans \
        .withColumn('x', pca_array[0]) \
        .withColumn('y', pca_array[1])

    return projection_kmeans


## This function creates the DataFrame of recommendations, ranked by the distance between two points

In [165]:
def created_recommendation (
    projection_kmeans,
    selected_music = 'Sergei Rachmaninoff| James Levine| Berliner Philharmoniker_Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve',
    columns_of_interesse = None,
    n_recommendations = 10
    ):
    """Recommend the songs closest to ``selected_music`` inside its own cluster.

    The selected song is looked up by ``name_music_artists``. Every row of the
    same ``cluster_pca`` gets a ``Dist`` column holding the Euclidean distance
    between its ``pca_features`` and those of the selected song. The rows with
    the smallest ``Dist`` are collected to the driver and turned back into a
    DataFrame through the module-level ``spark`` session.

    Args:
        projection_kmeans: DataFrame with at least the columns
            ``name_music_artists``, ``cluster_pca``, ``pca_features``,
            ``name``, ``artists`` and ``id``.
        selected_music: Value of ``name_music_artists`` identifying the song.
        columns_of_interesse: Columns selected when looking up the song.
            ``None`` means ``['pca_features', 'cluster_pca', 'artists',
            'name', 'year', 'name_music_artists']``. ``pca_features`` and
            ``cluster_pca`` are always added when missing, because they are
            read by name from the looked-up row.
        n_recommendations: ``n_recommendations + 1`` closest rows are taken
            before rows at distance zero are removed. The default of 10 keeps
            the previous hard-coded ``take(11)``.

    Returns:
        DataFrame with the columns ``name``, ``artists``, ``id`` and ``Dist``,
        restricted to rows with ``Dist > 0`` (this drops the selected song
        itself and any row sharing exactly the same ``pca_features``).

    Raises:
        IndexError: If no row has ``name_music_artists == selected_music``.
    """
    if columns_of_interesse is None:
        columns_of_interesse = ['pca_features', 'cluster_pca',
                                'artists', 'name', 'year', 'name_music_artists']

    # Read the row by column name rather than by position, so the lookup does
    # not depend on the order of ``columns_of_interesse``.
    lookup_columns = list(columns_of_interesse)
    for required_column in ('pca_features', 'cluster_pca'):
        if required_column not in lookup_columns:
            lookup_columns.append(required_column)

    matches = projection_kmeans \
        .filter(projection_kmeans.name_music_artists == selected_music) \
        .select(lookup_columns) \
        .take(1)
    if not matches:
        # Same exception type as the former ``take(1)[0]``, with a usable message.
        raise IndexError(
            f"selected_music {selected_music!r} was not found in projection_kmeans"
            )
    selected_row = matches[0]

    # The looked-up row already carries the PCA vector, so no second Spark job
    # is needed to fetch it.
    music_components = selected_row['pca_features']

    recommended_musics = projection_kmeans.where(
        projection_kmeans.cluster_pca == selected_row['cluster_pca']
        )

    def calculate_distance(value):
        # Convert to a plain Python float so the UDF never returns a numpy scalar.
        return float(euclidean(music_components, value))

    udf_calculate_distance = udf(
        calculate_distance,
        FloatType()
        )

    recommended_musics_dist = recommended_musics.withColumn(
        'Dist',
        udf_calculate_distance('pca_features')
        )
    recommendation = spark.createDataFrame(
        recommended_musics_dist.sort('Dist').take(n_recommendations + 1)
        ).select(['name','artists','id', 'Dist'])
    recommendation = recommendation.where(recommendation.Dist>0)

    return recommendation


In [166]:
# Load the comma-separated dataset; the other reader options keep read_file's defaults.
df_data = read_file(file_location='./data/data.csv', delimiter=',')

## This block of code cleans the dataset and removes the broken rows.

In [167]:

# Columns cast to float and to int, respectively.
float_columns = [
    'valence', 'acousticness', 'danceability', 'instrumentalness',
    'liveness', 'loudness', 'speechiness', 'tempo',
]
# NOTE(review): 'energy' and 'release_date' are cast to int here; presumably
# any fractional or non-integer text value is lost by that cast before the
# `energy < 1` filter and the release_date null drop below -- confirm this is
# intended for this dataset.
int_columns = [
    'duration_ms', 'energy', 'key', 'mode',
    'popularity', 'release_date', 'explicit',
]

# Cast every listed column to its target type, floats first and ints second.
for target_type, column_group in (('float', float_columns), ('int', int_columns)):
    for column_name in column_group:
        df_data = alter_type_column(
            df_data,
            column_name=column_name,
            new_type=target_type,
        )

# Clean the "artists" text: drop brackets and quotes, then turn commas into pipes.
for matching_chars, replacement_chars in (("[]'", ""), ('"', ''), (",", "|")):
    df_data = df_data.withColumn(
        "artists",
        translate("artists", matching_chars, replacement_chars),
    )

# Key used to look songs up: "<artists>_<name>".
df_data = df_data.withColumn(
    'name_music_artists',
    concat_ws('_', df_data.artists, df_data.name),
)

# Re-select every column (keeps the same set of columns).
df_columns = df_data.columns
df_data = df_data[df_columns]

# Drop rows whose release_date is null after the int cast above.
df_data = df_data.na.drop(subset=['release_date'])

# Feature columns for the clustering pipeline: everything except the columns removed here.
columns_not_object = df_data.columns
for excluded_column in ('artists', 'id', 'name', 'name_music_artists', 'explicit'):
    columns_not_object.remove(excluded_column)

# Keep only rows where speechiness, energy and explicit are all below 1.
rows_to_keep = (
    (df_data.speechiness < 1)
    & (df_data.energy < 1)
    & (df_data.explicit < 1)
)
df_data = df_data.filter(rows_to_keep)

# Drop rows whose numeric columns are null for their target type, floats first.
for numeric_type, column_group in (('float', float_columns), ('int', int_columns)):
    df_data = remove_string_rows(
        df=df_data,
        cols=column_group,
        type_of_col=numeric_type,
    )

In [168]:
# Print the summary statistics of the cleaned DataFrame as a quick sanity check.
df_data.summary().show()



+-------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------+--------+--------------------+-------------------+------------------+-------------------+-------------------+------------------+--------------------+------------------+------------------+-------------------+------------------+--------------------+
|summary|            valence|              year|       acousticness|             artists|       danceability|       duration_ms|energy|explicit|                  id|   instrumentalness|               key|           liveness|           loudness|              mode|                name|        popularity|      release_date|        speechiness|             tempo|  name_music_artists|
+-------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------+--------+--------------------+-------------------+------------------+-------------------+----------

                                                                                

In [169]:
# Fit the PCA + KMeans pipeline on the cleaned data with the default settings.
projection_kmeans = project_clusters(df_data, columns_not_object)

                                                                                

In [170]:
# Ask for recommendations for one song, identified by "<artists>_<name>", and print them.
recommendation = created_recommendation(
    projection_kmeans=projection_kmeans,
    selected_music='Dennis Day_Clancy Lowered the Boom',
)
recommendation.show()

                                                                                

+--------------------+-------------------+--------------------+-------------------+
|                name|            artists|                  id|               Dist|
+--------------------+-------------------+--------------------+-------------------+
|       Mannish Woman|Reverend J.M. Gates|3FeZTKBjxCGzIhmxd...|0.05701521784067154|
|Chapter 1.13 - Dz...|Zofia Dromlewiczowa|78hZxBr4FDd8Lq9P0...|0.10150286555290222|
|Capítulo 1.1 - la...|     H.P. Lovecraft|08i6R3zRuUMSiH0Xg...|0.10356761515140533|
|Chapter 1.2 - Dzi...|Zofia Dromlewiczowa|5BImx4bzd8vZadkJy...|0.12527886033058167|
|Chapter 2.17 - Dz...|Zofia Dromlewiczowa|1sT888U3vZ9tC0RpD...|0.13612665235996246|
|The Grey Goose (1...|         Lead Belly|00MzW0XvVaoEsgEtO...|0.13680696487426758|
|Chapter 1.1 - Dzi...|Zofia Dromlewiczowa|5u2E1EauR9jEG5vU2...|0.17054855823516846|
|Capítulo 15.2 - l...|     H.P. Lovecraft|5wQafB5JcmRqii5Gm...|0.17213831841945648|
|  The Pasture Mowing|       Robert Frost|3tAZj800raqqhIsN7...|0.17814892530