In [21]:
import pandas as pd
from sklearn.cluster import KMeans
import os
import seaborn as sns
import cufflinks as cf

In [22]:
%reload_ext autoreload
%autoreload 2

In [None]:
cf.go_offline()


In [23]:
datasetpath = 'tf_mini.csv'
directory = os.fsencode('playlists')
global descriptions
descriptions = pd.DataFrame()
global popular_clusters
popular_clusters = {}

In [24]:

class k_means_structure:
    def __init__(self, setpath, countrypath, countryname):
        """[CONSTRUCTOR of KMA ]

        Args:
            setpath ([string]): [dataset relative path]
            countrypath ([string]): [relative path of country data json]
            countryname ([string]): [name of country less the jargon of filename]
        """
        self.debug = False
        self.sampleframe = pd.DataFrame(pd.read_csv(setpath), columns=[
                                        'energy', 'valence', 'tempo', 'duration', 'danceability', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness'])  # removed 'us_popularity_estimate', and 'release year'
        self.countryname = countryname
        self.countryframe = pd.DataFrame(pd.read_json(countrypath), columns=[
                                         'energy', 'valence', 'tempo', 'lenght', 'danceability', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness'])
        self.sampleframe['set'] = 2  # SET 2 -> BIG DATASET
        self.countryframe['set'] = 20  # SET 20-> PER COUNTRY TOP 50
        self.countryframe.rename(columns={'lenght': 'duration'}, inplace=True)
        self.countryframe['duration'] = self.countryframe['duration'].div(
            1000).round(2)

        self.fullframe = pd.concat([self.countryframe, self.sampleframe])

        self.kmeans = KMeans(n_clusters=2).fit(self.fullframe)
        self.fullframe['cluster'] = self.kmeans.labels_
        centroids = pd.DataFrame(self.kmeans.cluster_centers_)
        print(centroids)
        if self.debug is True:
            # DEBUGGING PRINTS
            print(f"LABELS: {self.kmeans.labels_}")
            print(self.fullframe)
            print('COUNT PER CLUSTER', self.fullframe.groupby('cluster').count())

    def return_desc(self):
        """[returns local copy of global variable country descriptions]

        Returns:
            [dict]: [statistical descriptions of every KMA per country]
        """
        self.desc = self.fullframe.groupby('cluster').describe()
        self.desc['country'] = self.countryname[11:-17]
        global descriptions
        descriptions = pd.concat([descriptions, self.desc])
        print('### DESCRIPTION PER CLUSTER ###')
        return descriptions

In [25]:
    def decide_popular_cluster(self):
        """[FIND THE MORE POPULAR CLUSTER PER COUNTRY]

        Returns:
            [dict]: [{country:popular_cluster}]
        """
        global popular_clusters

        cluster0 = 0
        cluster1 = 0
        for index, row in self.fullframe.iterrows():
            if row['cluster'] == 0 and row['set'] == 20:
                cluster0 += 1
            if row['cluster'] == 1 and row['set'] == 20:
                cluster1 += 1
            if cluster0 > cluster1:
                # CLUSTER 0 ES EL POPULAR
                popular_clusters[self.countryname[11:-17]] = 0
            else:
                # CLUSTER 1 ES EL POPULAR
                popular_clusters[self.countryname[11:-17]] = 1

        print(popular_clusters)

        return popular_clusters

In [28]:
def main(pops):

    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith(".json"):
            path = 'playlists/'+filename
            print(path)
            KMA = k_means_structure(datasetpath, path, filename)
            KMA.return_desc()
            #KMA.decide_popular_cluster()
    # COMMENTED TO NOT STDOUT to EXCEL XML
    descriptions.to_excel('countries_massive.xlsx')

    # popular cluster -> dataframe -> excel
    pop_cluster = pd.DataFrame.from_dict(list(pops.items()))
    pop_cluster.to_excel('pop_clusters_per_country_massive.xlsx')


In [29]:
main(popular_clusters)

playlists/Top Songs - Argentina Oct-26-2021.json
        0         1           2           3        4         5         6   \
0  0.63836  0.431236  119.683275  310.710066  0.57115  5.257084 -7.896638   
1  0.64244  0.505907  122.189470  194.217646  0.62479  5.272390 -7.396638   

         7         8         9         10        11  
0  0.116131  0.245206  0.093452  0.225993  2.007371  
1  0.125166  0.251944  0.063255  0.198933  2.021015  
### DESCRIPTION PER CLUSTER ###
playlists/Top Songs - Australia Oct-26-2021.json
         0         1           2           3         4         5         6   \
0  0.642449  0.505857  122.177210  194.213070  0.624716  5.272135 -7.397636   
1  0.638244  0.431070  119.673466  310.672216  0.571090  5.256179 -7.897569   

         7         8        9         10        11  
0  0.125146  0.251889  0.06326  0.198943  2.020552  
1  0.116129  0.245266  0.09341  0.226033  2.008839  
### DESCRIPTION PER CLUSTER ###
playlists/Top Songs - Austria Oct-26-2021.json
