# Extract musical interests
This notebook extracts all Facebook (FB) interests from the 2016 list that relate to musical genres.

In [116]:
import json
import re
import sys

import pandas as pd

## Load data

In [117]:
# Read the full 2016 interest list and replace every missing value with 0 so
# that the string/integer conversions below never see NaN.
all_interests = pd.read_csv(
    '../../data/all_FB_interests_2016.zip', compression='zip', index_col=False
).fillna(0)
# Replace whole columns (df[col] = ...) rather than assigning through
# .loc[:, col]: on recent pandas the .loc form writes into the existing column
# and can keep its old dtype, which would silently undo the conversion.
all_interests['name'] = all_interests['name'].apply(str)
all_interests['path'] = all_interests['path'].apply(str)
# 'int64' is explicit and exists on both Python 2 and 3 (the `long` builtin is
# Python 2 only).
all_interests['id'] = all_interests['id'].astype('int64')
all_interests.head()

Unnamed: 0,name,audience_size,topic,path,disambiguation_category,id
0,Facebook,1466366000.0,Business and industry,0,0,6003142505790
1,Technology,1318222000.0,Technology,Technology,0,6003985771306
2,Entertainment,1312986000.0,News and entertainment,Entertainment,0,6003349442621
3,Consumer electronics,1214006000.0,Technology,Technology|Consumer electronics,0,6003716669862
4,Shopping and fashion,1193714000.0,Shopping and fashion,Shopping and fashion,0,6003167425934


## Find music genres

In [118]:
# Keep every interest whose name contains "music" or "Music" anywhere.
music_matcher = re.compile('[Mm]usic')
name_mentions_music = all_interests['name'].map(
    lambda name: music_matcher.search(name) is not None
)
music_interests = all_interests[name_mentions_music]
print('%d music interests'%(len(music_interests)))
music_interests.head(50)

2712 music interests


Unnamed: 0,name,audience_size,topic,path,disambiguation_category,id
8,Music,1060003000.0,News and entertainment,Entertainment|Music,0,6003020834693
18,Arts and music,862449900.0,Lifestyle and culture,Hobbies and activities|Arts and music,0,6003029869785
38,Pop music,654290400.0,News and entertainment,Entertainment|Music|Pop music,0,6003341579196
40,Rock music,648965100.0,News and entertainment,Entertainment|Music|Rock music,0,6003582732907
56,Music videos,524824800.0,News and entertainment,Entertainment|Music|Music videos,0,6003332483177
83,Electronic music,444201300.0,News and entertainment,Entertainment|Music|Electronic music,0,6003902397066
97,Rhythm and blues music,423585200.0,News and entertainment,Entertainment|Music|Rhythm and blues music,0,6003195554098
102,Hip hop music,405137600.0,News and entertainment,Entertainment|Music|Hip hop music,0,6003225556345
149,Soul music,331153700.0,News and entertainment,Entertainment|Music|Soul music,0,6003107699532
156,Country music,316829600.0,News and entertainment,Entertainment|Music|Country music,0,6003493980595


In [119]:
# Keep interests whose category path contains "Entertainment|Music".
# Raw string: the regex needs a literal backslash before "|" (to match the pipe
# character itself); in a normal string literal "\|" is an invalid escape
# sequence that newer Python versions warn about.
music_path_matcher = re.compile(r'Entertainment\|Music')
path_in_music = all_interests['path'].map(
    lambda path: music_path_matcher.search(path) is not None
)
music_path_interests = all_interests[path_in_music]
print('%d music path interests'%(music_path_interests.shape[0]))
# Show the distinct names alphabetically on one line.
print(','.join(music_path_interests['name'].sort_values().unique()))

15 music path interests
Blues music,Classical music,Country music,Dance music,Electronic music,Gospel music,Heavy metal music,Hip hop music,Jazz music,Music,Music videos,Pop music,Rhythm and blues music,Rock music,Soul music


In [120]:
# Select interests whose disambiguation category is exactly "Musical Genre".
music_category = 'Musical Genre'
is_genre_category = all_interests['disambiguation_category'] == music_category
music_interests_1 = all_interests[is_genre_category]
print('%d music cat interests'%(len(music_interests_1)))
# Distinct names, alphabetically, on one comma-separated line.
genre_category_names = music_interests_1['name'].sort_values().unique()
print(','.join(genre_category_names))

524 music cat interests
2 Tone,20th-century Classical Music,21st-century classical music,4-beat,A cappella,Abwe,Acid house,Acid jazz,Acid rock,Acid techno,Acid trance,Acoustic folk,Active rock,Adani,Adult album alternative,African hip hop,African popular music,Afro-Cuban jazz,Afro-punk,Afrobeat,Album-oriented rock,Alternative country,Ambient house,Anatolian rock,Andalusian classical music,Andean music,Anti-folk,Apala,Arabesque (Turkish music),Arena rock,Argentine cumbia,Argentine rock,Argentine tango,Art music,Art punk,Asian Underground,Atlanta hip hop,Atonality,Australian hip hop,Avant-garde jazz,Avant-garde metal,Avant-garde music,Ax̩ (music),Bakersfield sound,Bal-musette,Baladi,Balearic beat,Balearic trance,Baltimore club,Barbershop music,Bard (Soviet Union),Baroque music,Baroque pop,Bassline (music genre),Beat music,Beatboxing,Belgian rock,Bhajan,Big beat,Bikutsi,Boi (music),Boogaloo,Boogie (genre),Boogie rock,Bounce music,Bouncy techno,Bouyon music,Bozlak,Brazilian hip hop,Breakbe

This list is not big enough! Let's use all interests that end with "music" or "(music)".

In [121]:
# Keep interests whose name ends with "music" or with the literal suffix
# "(music)". Raw string: "\(" and "\)" are regex escapes for literal
# parentheses; in a normal string literal they are invalid escape sequences
# that newer Python versions warn about.
music_matcher_2 = re.compile(r'music$|\(music\)$')
name_ends_with_music = all_interests['name'].map(
    lambda name: music_matcher_2.search(name) is not None
)
music_interests_2 = all_interests[name_ends_with_music]
print('%d music interests'%(music_interests_2.shape[0]))
music_interests_2['name'].unique()

356 music interests


array(['Arts and music', 'Pop music', 'Rock music', 'Electronic music',
       'Rhythm and blues music', 'Hip hop music', 'Soul music',
       'Country music', 'Heavy metal music', 'Christian music',
       'Blues music', 'Jazz music', 'House music', 'World music',
       'African-American music', 'Dance music', 'Gospel music',
       'Latin American music', 'Electronic dance music', 'Classical music',
       'Single (music)', 'Popular music', 'New wave music',
       'Contemporary Christian music', 'Christmas music', 'Swing music',
       'American folk music', 'Ambient music', 'Trance music',
       'Bachata (music)', 'Folk music', 'Beat (music)', 'Industrial music',
       'Sampling (music)', 'Salsa music', 'Electro (music)',
       'Calypso music', 'Lounge music', 'Phrase (music)', 'Merengue music',
       'Solo (music)', 'Cha-cha-cha (music)', 'Pitch (music)',
       'Mexican pop music', 'Dub (music)', 'Emo (music)', 'Romantic music',
       'Banda (music)', 'Beautiful music', 'No

This seems like a good list! But is it leaving anything out?

In [122]:
# Spot-check a few genres: list every interest whose lower-cased name contains
# the genre word, to see what the "music"-suffix filter fails to pick up.
test_genres = ['salsa', 'tango', 'merengue', 'bolero', 'mariachi']
# Lower-case the names once; the same Series is reused for every genre.
lowercase_names = all_interests['name'].map(lambda name: name.lower())
for genre in test_genres:
    genre_matcher = re.compile(genre)
    genre_mask = lowercase_names.map(lambda name: genre_matcher.search(name) is not None)
    genre_interests = all_interests[genre_mask]
    print('test genre=%s with %d matching interests'%(genre, len(genre_interests)))
    print('\n'.join(genre_interests['name'].unique()))

test genre=salsa with 22 matching interests
Salsa music
Salsa (dance)
Salsa (sauce)
Salsa rom��ntica
Salsa congress
Salsa dura
Addicted2Salsa
Bailar Salsa
HECTOR LAVOE- LEYENDA DE LA SALSA
Salsa Maelo Ruiz
Ruben Blades: La Salsa del Subdesarrollo
Salsa Erotica
Oxnard Salsa Festival
Cuban Salsa Dance Cubana
Baila en Cuba - Salsa Congress in Havanna
Yo tambien amo la SALSA...!!
Peruanos en su Salsa
I LOVE SALSA DC
Salsa Con Estilo ((( www.salsaconestilo.com ))) LA COMUNIDAD SALSERA
Salsa Cubaine
Cuban Salsa
The Salsa Rueda Festival
test genre=tango with 22 matching interests
Tango
Tango music
Tango (application)
Tango (drink)
Argentine tango
Tango Province
TANGO WAFER RENYAH
Tango Waffle
Blacklight: Tango Down
Project Tango
tango argentino
Last Tango in Paris
Adidas Tango 12
Bang Tango
Last Tango in Halifax
Bajofondo Tango Club (album)
Dni Tango
Academia Nacional del Tango de la Rep̼blica Argentina
Arjantin Tango
Adidas Tango
Geraldin Rojas Paludi - Tango
OSLO Tango Marathon
test genre=m

Problem! Most of these interests are not marked by "music", so the suffix-based filter above does not capture them.

In [123]:
# Show the row(s) whose name is exactly "Mariachi".
all_interests.loc[all_interests['name'].eq('Mariachi')]

Unnamed: 0,name,audience_size,topic,path,disambiguation_category,id
2784,Mariachi,31245180.0,News and entertainment,0,0,6003281529097


Is there a list of musical genres that we can cross-reference against?

In [124]:
# use SPARQL to access DBPedia
from SPARQLWrapper import SPARQLWrapper, JSON
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
# ask the endpoint for JSON so the response converts to nested dicts/lists
sparql.setReturnFormat(JSON)

# take 1: collect all subclasses of music genre
# problem: this includes a lot of noise like "Fairy Novel"
# sparql.setQuery("""
# SELECT ?name
# WHERE {
#   ?entity rdfs:label ?name.
#   ?entity rdf:type ?type.
#   ?type rdfs:subClassOf* dbo:MusicGenre.
#   FILTER(langmatches(lang(?name), "en"))
# }
# """)

# take 2: collect all music subgenres for Music of Latin America
# (English labels only)
sparql.setQuery("""
SELECT ?name
WHERE {
  ?entity rdfs:label ?name.
  dbr:Music_of_Latin_America dbo:musicSubgenre ?entity.
  FILTER(langmatches(lang(?name), "en"))
}
""")

results = sparql.query().convert()
# each binding holds one ?name value; collect the labels alphabetically
bindings = results['results']['bindings']
result_names = sorted(binding['name']['value'] for binding in bindings)
print(','.join(result_names))
# keep only the interests whose name exactly equals one of the DBPedia labels
name_is_latin_genre = all_interests['name'].isin(result_names)
music_interests_3 = all_interests[name_is_latin_genre]
print('%d Latin American interests'%(len(music_interests_3)))
music_interests_3.head()

Axé (music),Bachata (music),Baião (music),Bambuco,Banda (music),Batucada,Biguine,Bolero,Bomba (Puerto Rico),Boogaloo,Bossa nova,Brazilian rock,Cha-cha-cha (music),Changüí,Charanga (Cuba),Choro,Compas,Conga (music),Conjunto,Contradanza,Corrido,Cuarteto,Cuban rumba,Cueca,Cumbia,Danza,Danzón,Duranguense,Filin (music),Forró,Frevo,Funk carioca,Guaguancó,Guaracha,Gwo ka,Huapango,Huayno,Jarabe,Joropo,Kwadril,Lambada,Lundu (dance),Mambo (music),Mariachi,Merengue music,Milonga (music),Méringue,Música popular brasileira,Música sertaneja,Norteño (music),Nueva canción,Nueva trova,Orquesta típica,Pachanga,Pagode,Pambiche,Pasillo,Payada,Plena,Porro,Punto guajiro,Ranchera,Rasin,Reggaeton,Rondalla,Salsa music,Samba,Seis,Son (music),Son Jarocho,Son jalisciense,Son montuno,Songo music,Tango music,Tejano music,Timba,Tonada,Tropicália,Trío romántico,Twoubadou,Vallenato,Vals criollo,Zouk
39 Latin American interests


Unnamed: 0,name,audience_size,topic,path,disambiguation_category,id
1023,Reggaeton,78852370.0,News and entertainment,0,0,6003434636451
1686,Bachata (music),51429630.0,News and entertainment,0,0,6003143764440
1970,Salsa music,43638040.0,News and entertainment,0,0,6003668231507
2171,Samba,39644950.0,News and entertainment,0,0,6003302709853
2549,Ranchera,33694270.0,News and entertainment,0,0,6002957429850


Let's combine all the musical genres and their corresponding IDs, then write them to a JSON file.

In [125]:
# Stack the three genre selections (category match, "music" suffix match,
# DBPedia Latin American match) and keep the first row seen for each interest ID.
genre_frames = [music_interests_1, music_interests_2, music_interests_3]
music_interests_combined = pd.concat(genre_frames, axis=0).drop_duplicates(subset='id')
print('%d combined music interests'%(len(music_interests_combined)))
music_interests_combined.head()

732 combined music interests


Unnamed: 0,name,audience_size,topic,path,disambiguation_category,id
8364,Rocksteady,9919710.0,News and entertainment,0,Musical Genre,6003314921218
8386,Christian hip hop,9893950.0,News and entertainment,0,Musical Genre,6003140211647
8387,Noise music,9892570.0,0,0,Musical Genre,6003258005082
8391,Flamenco,9881550.0,News and entertainment,0,Musical Genre,6003074062797
8454,Psychedelic trance,9793400.0,News and entertainment,0,Musical Genre,6003419806154


In [129]:
out_file = '../../data/queries/musical_genre_interests.json'
# One {'name', 'id'} record per combined interest. IDs are written as digit
# strings; going through int() first means an ID held as a float would not
# carry a trailing ".0" into the string.
genre_records = [
    {'name' : name, 'id' : str(int(interest_id))}
    for name, interest_id in zip(music_interests_combined['name'],
                                 music_interests_combined['id'])
]
music_interests_json = {'data' : genre_records}
# Python 2's json accepts `encoding` to decode byte-string values; Python 3's
# json.dump has no such parameter and raises TypeError if it is passed.
json_dump_options = {'indent' : 4}
if sys.version_info[0] < 3:
    json_dump_options['encoding'] = 'latin-1'
# The context manager closes (and flushes) the file even if serialisation fails.
with open(out_file, 'w') as out_handle:
    json.dump(music_interests_json, out_handle, **json_dump_options)

While we're at it, let's build the entire query.

In [146]:
# Assemble the complete query: one interest entry per musical genre plus the
# fixed age, location, behavior and gender settings, then write it to disk.
full_query = {
    # each genre becomes its own entry: name wrapped in a list, ID under 'or'
    'interests' : [{'name' : [i['name']], 'or' : [i['id']]}
                   for i in music_interests_json['data']],
    'ages_ranges' : [{'min' : 13, 'max' : 65}],
    'geo_locations': [{
        'values' : ['US'],
        'location_types':[
            'home'
        ],
        'name': 'countries'
    }],
    # two behavior groups; the second excludes the ID that the first requires
    'behavior':[
        {
            "and": [
                6023676072183
            ],
            "or": [
                6003133212372
            ],
            "name": "Hispanic (All) AND Ex-pats (Mexico)"
        },
        {
            "not": [
                6023676072183
            ],
            "name": "Non-Expats"
        }
    ],
    # NOTE(review): 0 presumably means "all genders" - confirm against the query consumer
    'genders':[0],
    'name': "Hispanic Mexican Ex-pats and non-Expats Musical Interests"
}
full_query_file = '../../data/queries/US_hispanic_expat_non_expats_music_interests.json'
# Python 2's json accepts `encoding` to decode byte-string values; Python 3's
# json.dump has no such parameter and raises TypeError if it is passed.
query_dump_options = {'indent' : 4}
if sys.version_info[0] < 3:
    query_dump_options['encoding'] = 'latin-1'
# The context manager closes (and flushes) the file even if serialisation fails.
with open(full_query_file, 'w') as query_handle:
    json.dump(full_query, query_handle, **query_dump_options)