In [1]:
import pickle
import networkx as nx
import matplotlib
import matplotlib.pyplot as plt
import scipy as sp
import numpy as np

%matplotlib inline

### Helper Functions

The first two helper functions store a dictionary to, and load a dictionary from, a pickle file in the `data/` directory.

In [4]:
def store_dict_pickle(file_name, dictionary):
    """
    Serialize a dictionary to ``data/<file_name>.pkl`` with pickle.

    The file is written with the highest pickle protocol available.

    :param file_name: base name of the target file (no directory, no extension)
    :type file_name: str

    :param dictionary: the dictionary to write out
    :type dictionary: dict
    """
    target_path = 'data/' + file_name + '.pkl'

    with open(target_path, 'wb') as out_file:
        pickler = pickle.Pickler(out_file, pickle.HIGHEST_PROTOCOL)
        pickler.dump(dictionary)


def load_dict_pickle(file_name):
    """
    Read back the object pickled in ``data/<file_name>.pkl``.

    Note: unpickling can execute arbitrary code, so only point this at
    files that come from a trusted source.

    :param file_name: base name of the file to load (no directory, no extension)
    :type file_name: str

    :return: the unpickled object (a dictionary for the files this notebook uses)
    :rtype: dict
    """
    source_path = 'data/' + file_name + '.pkl'

    with open(source_path, 'rb') as in_file:
        return pickle.Unpickler(in_file).load()

# Loading Artist-Country Data

In [5]:
# Load one pickled dictionary per year from data/<year>_country_artist.pkl.
# Only 2005-2010 and 2013 are loaded here.
dict_05, dict_06, dict_07, dict_08, dict_09, dict_10, dict_13 = (
    load_dict_pickle(year + "_country_artist")
    for year in ("2005", "2006", "2007", "2008", "2009", "2010", "2013")
)

# Initializing Graphs From Dictionaries

In [144]:
import copy

def top_artists(year_dict, top_number):
    """
    Build a graph linking each country to its highest-valued artists.

    Every country that has at least ``top_number`` artists becomes a node
    flagged ``country=True`` and is connected to its ``top_number``
    highest-valued artists (nodes flagged ``artist=True``). Each edge stores
    the artist's value for that country as ``weight``. Countries with fewer
    than ``top_number`` artists are left out of the graph entirely.

    ``year_dict`` is not modified.

    :param year_dict: mapping ``country -> {artist: value}``; the values are
        only compared and stored as edge weights (presumably play or
        listener counts -- TODO confirm against the data files)
    :type year_dict: dict

    :param top_number: how many top artists to keep per country
    :type top_number: int

    :return: graph containing the country nodes, artist nodes and weighted edges
    :rtype: networkx.Graph
    """
    G = nx.Graph()

    # a non-positive top_number keeps no artists (country nodes are still added)
    keep = max(top_number, 0)

    for country, artists in year_dict.items():
        # skip countries without enough artists *before* touching the graph,
        # so nothing has to be added and then removed again
        if len(artists) < top_number:
            continue

        # set the flag through add_node: unlike G.node[...] (removed in
        # networkx 2.4) this works on every networkx version
        G.add_node(country, country=True)

        # stable sort: artists with equal values keep their dictionary order,
        # the same order repeated max() calls would pick them in
        ranked = sorted(artists, key=artists.get, reverse=True)
        for artist in ranked[:keep]:
            G.add_node(artist, artist=True)
            G.add_edge(country, artist, weight=artists[artist])

    return G

In [145]:
# pos = nx.spring_layout(G)
# nx.draw(G, pos=pos);
# plt.axis('off')
# plt.show()

# Build one graph per year (2005 through 2009), keeping each country's top 5 artists
G_05, G_06, G_07, G_08, G_09 = (
    top_artists(year_dict, 5)
    for year_dict in (dict_05, dict_06, dict_07, dict_08, dict_09)
)

# Takes two collections of nodes and calculates the Jaccard similarity index of them.
# Returns the index as a float between 0 and 1.
def Jaccard(neighborhood1, neighborhood2):
    """
    Jaccard similarity of two neighborhoods.

        J(A,B) = |A n B| / |A U B|
        A n B = elements in A AND B
        A U B = elements in A OR B

    Both inputs are reduced to sets first, so repeated elements count once
    and any iterable of hashable items is accepted (lists as well as the
    iterators that ``Graph.neighbors`` returns in networkx 2 and later).

    :param neighborhood1: first collection of nodes
    :param neighborhood2: second collection of nodes

    :return: the index in [0, 1]; two empty neighborhoods are treated as
        identical and give 1.0 instead of dividing by zero
    :rtype: float
    """
    set1 = set(neighborhood1)
    set2 = set(neighborhood2)

    union = set1 | set2
    if not union:
        # both neighborhoods are empty: nothing differs between them
        return 1.0

    return len(set1 & set2) / len(union)

In [158]:
# Spot check: how similar are the United States' neighbors in 2008 and 2009?
index = Jaccard(
    G_08.neighbors('United States'),
    G_09.neighbors('United States'),
)

# The yearly graphs in chronological order, plus a lookup keyed by two-digit year
years = [G_05, G_06, G_07, G_08, G_09]
graph_years = dict(zip(range(5, 10), years))

def years_similarity(years):
    """
    Jaccard similarity of every country's neighbors between consecutive graphs.

    A node counts as a country when it carries the 'country' attribute. The
    comparison for a pair of graphs is only made when the node is a country
    in both of them; a node that merely shares a country's name (for example
    an artist node) is not compared.

    :param years: graphs in chronological order
    :type years: list

    :return: ``results[country] = [sim(0,1), sim(1,2), ...]`` with one entry
        per consecutive pair of graphs (``len(years) - 1`` entries); an entry
        is -1 when the country is not a country node in both graphs of the pair
    :rtype: dict
    """
    # country nodes of each graph, looked up once per graph
    # (dicts rather than sets so countries keep a deterministic order)
    countries_by_year = [nx.get_node_attributes(graph, 'country') for graph in years]

    # every country seen in any year starts with an empty list of similarities
    results = dict()
    for countries in countries_by_year:
        for country in countries:
            if country not in results:
                results[country] = list()

    # compare each graph with the one that follows it
    for i in range(len(years) - 1):
        current_year = years[i]
        next_year = years[i + 1]
        current_countries = countries_by_year[i]
        next_countries = countries_by_year[i + 1]

        for country in results:
            if country in current_countries and country in next_countries:
                index = Jaccard(current_year.neighbors(country), next_year.neighbors(country))
                results[country].append(index)
            else:
                # similarity not available for this pair of years
                results[country].append(-1)

    return results

# Per-country Jaccard similarities for each consecutive pair of graphs in `years`
x = years_similarity(years)

[0.1111111111111111, 0.1111111111111111, 0.25, 0.25]


# Put Jaccard Similarities into CSV File

In [175]:
import csv

def write_Jaccard_to_CSV(similarities, file_path='data/jaccard_indexes.csv'):
    """
    Write the per-country Jaccard similarities to a CSV file.

    The first row is the header ``Country, 2005-2006, 2006-2007, 2007-2008,
    2008-2009``; every following row holds a country name followed by its
    similarity values in the order they are stored.

    :param similarities: mapping ``country -> list of similarity values``
    :type similarities: dict

    :param file_path: where to write the CSV; defaults to the location this
        notebook has always used
    :type file_path: str
    """
    header = ['Country', '2005-2006', '2006-2007', '2007-2008', '2008-2009']

    # the with-block closes the file even if writing a row raises
    with open(file_path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)

        # one row per country: its name followed by all of its similarities
        for country, indexes in similarities.items():
            writer.writerow([country] + list(indexes))
    
# Export the similarities held in `x` to the CSV file
write_Jaccard_to_CSV(x)