# Everyscrape

In this notebook we collect and visualize data on which genres are popular in which cities, a ranking of cities by number of Spotify listeners (from first to last), and additional information on each city, such as latitude/longitude and population. 

In [4]:
import os
import everyscrape
import pandas as pd

# useful during dev: re-import the local everyscrape module so that edits made to it
# are picked up without restarting the kernel
from importlib import reload
everyscrape = reload(everyscrape)

<br>***Import the country codes and scrape the list of city links from the website:***

In [None]:
# Lookup table that matches country names to their country codes.
country_codes = pd.read_csv(
    os.getcwd()+'/data/country_codes.csv',
    encoding="ISO-8859-1",
)
# The header is overwritten positionally, so the CSV is assumed to hold exactly
# these three columns in this order.
country_codes.columns = ['Country', 'Country Code', 'Country Code 3']

# Collect the links for all the cities and report how many were found.
links = everyscrape.get_cities()
print(len(links), " links")

<br>***Scrape each city link (this will take a while; use the `resume` option to pick up where you left off if the connection gets interrupted):***

In [None]:
# Scrape the everynoise website for genre popularity, one city link at a time.
# The CSV path is passed as the second argument (presumably where the scraper
# saves its results -- the next cell reads the data back from this same path).
everynoise_popularity = everyscrape.genre_popularity(
    links,
    os.getcwd()+'/data/everynoise_popularity.csv',
)

<br>***Read the saved data:***

In [19]:
 # import the saved data
everynoise_popularity = pd.read_csv(os.getcwd()+'/data/everynoise_popularity.csv')

# Remove any unlabeled ("Unnamed: N") columns, e.g. an index that was written to the CSV.
# `columns=` is used because DataFrame.drop no longer accepts `axis` positionally
# (pandas >= 2.0 raises TypeError). No try/except is needed: dropping an empty
# list of columns is a no-op, and real errors should not be hidden.
unnamed_cols = [c for c in everynoise_popularity.columns if "Unnamed" in str(c)]
everynoise_popularity = everynoise_popularity.drop(columns=unnamed_cols)

everyscrape.country_city_count(everynoise_popularity)        # check the number of unique cities

# Add the country names / 3-letter codes to the everynoise data. A left merge keeps
# every everynoise row; the merge indicator is kept (renamed to `cc_mrg`) so that
# rows whose country code found no match can be identified.
enpop = everynoise_popularity.merge(country_codes, on='Country Code', how='left', indicator=True)
enpop = enpop.rename(columns={'_merge': 'cc_mrg'})
enpop.columns = [c.lower() for c in enpop.columns]           # change the column names to lower case

# Clean the string variables: lower-case and trim surrounding whitespace.
for c in ['city', 'country']:
    enpop[c] = enpop[c].str.lower().str.strip()

# Peek at a sample of rows (positional slice).
enpop[55490:55500]

2955  unique cities


Unnamed: 0,popularity,genre,city,country code,country,country code 3,cc_mrg
55490,127,contemporary country,mesa arizona,US,united states,USA,both
55491,127,country,mesa arizona,US,united states,USA,both
55492,127,country road,mesa arizona,US,united states,USA,both
55493,123,modern country rock,mesa arizona,US,united states,USA,both
55494,119,pop rap,mesa arizona,US,united states,USA,both
55495,118,southern hip hop,mesa arizona,US,united states,USA,both
55496,117,rap,mesa arizona,US,united states,USA,both
55497,115,trap music,mesa arizona,US,united states,USA,both
55498,113,country rock,mesa arizona,US,united states,USA,both
55499,113,gangster rap,mesa arizona,US,united states,USA,both


<br>***Import the latitude and longitude for each city:***

In [112]:
# Import the world cities data and rename the ISO code columns so that they
# match the column names used in the everynoise data (makes merging easier).
worldcities = (
    pd.read_csv(os.getcwd()+'/data/worldcities.csv')
    .rename(columns={'iso2': 'country code', 'iso3': 'country code 3'})
)

# Clean the string variables: lower-case and trim surrounding whitespace.
for text_col in ['city', 'city_ascii', 'admin_name', 'country']:
    worldcities[text_col] = worldcities[text_col].str.lower().str.strip()

worldcities.head(2)

Unnamed: 0,city,city_ascii,lat,lng,country,country code,country code 3,admin_name,capital,population,id
0,malishevë,malisheve,42.4822,20.7458,kosovo,XK,XKS,malishevë,admin,,1901597212
1,prizren,prizren,42.2139,20.7397,kosovo,XK,XKS,prizren,admin,,1901360309


<br>***Merge city data with everynoise data:***

In [372]:
# Merge the everynoise popularity and worldcities data. The helper returns the
# merged rows and the rows it could not match (`leftovers`).
enpop_cities, leftovers = everyscrape.fuzzy_city_merge(enpop, worldcities)

# Drop the merge helper columns. `columns=` is required because DataFrame.drop no
# longer accepts `axis` as a positional argument (TypeError on pandas >= 2.0).
enpop_cities = enpop_cities.drop(columns=['_merge', 'city_ascii', 'admin_name'])

# The index is written to the CSV on purpose: the reload cell expects (and drops)
# the resulting 'Unnamed: 0' column.
enpop_cities.to_csv(os.getcwd()+'/data/everycity.csv')

print('Merged Rows:', len(enpop_cities))
print('Leftover Rows:', len(leftovers))

Merged Rows: 62290
Leftover Rows: 99


In [201]:
# Optional: load the previously saved merge result instead of re-running the merge.
enpop_cities = pd.read_csv(os.getcwd()+'/data/everycity.csv')
# 'Unnamed: 0' is the index that was written out when the CSV was saved.
# `columns=` is required: DataFrame.drop no longer accepts `axis` positionally
# (TypeError on pandas >= 2.0).
enpop_cities = enpop_cities.drop(columns='Unnamed: 0')

<br>***Get the cities ranked by number of Spotify listeners:***

In [114]:
# Get the ranking of each city based on listener volume, then normalize the
# city names (lower-case, trimmed) so they can be matched against the other data.
listened_to_cities = everyscrape.listeners_by_city()
listened_to_cities['city'] = listened_to_cities['city'].str.lower().str.strip()

<br>***Get the final merged dataset and get top genre ranks per city:***

In [207]:
# Final merged data for targeting: attach each city's listener ranking to the
# genre/city data (left merge, so every genre row is kept).
everygenre = enpop_cities.merge(listened_to_cities, on=['city','country code'], how='left')

# Remove duplicate city/genre rows, then order each city's genres from most to
# least popular.
everygenre = (
    everygenre
    .drop_duplicates(subset=['city','country','genre','country code 3'])
    .sort_values(['city','country','popularity'], ascending=[True,True,False])
)

# Remove the separator character before "hop" (" hop" / "-hop" -> "hop") so that
# it is not split from the preceding word.
everygenre['genre'] = everygenre['genre'].apply(
    lambda name: name.replace(' hop','hop').replace('-hop','hop')
)

# 1-based rank of each genre within its city; relies on the popularity sort above.
everygenre['top_genres'] = everygenre.groupby(['city','country'], as_index=False).cumcount() + 1

# Save the everygenre dataset.
everygenre.to_csv(os.getcwd()+'/data/everygenre.csv')

In [5]:
# Reload the saved dataset. 'Unnamed: 0' is the index that was written out with the CSV.
# `columns=` is required: DataFrame.drop no longer accepts `axis` positionally
# (TypeError on pandas >= 2.0).
everygenre = pd.read_csv(os.getcwd()+'/data/everygenre.csv').drop(columns='Unnamed: 0')

# Number of distinct genres in the dataset (missing values are not counted).
everygenre['genre'].nunique()

1557