# Extract, Transform and Load Spotify Data Set
<hr>

### Dependencies

In [2]:
import datetime as dt
import json
import os

import pandas as pd
import pymongo
import requests
from sqlalchemy import create_engine

# Display settings: never truncate columns, line width, or cell contents.
# The row limit keeps its default; uncomment the next line to lift it too.
# pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = None

### Extract Data Set

In [3]:
# Read the raw daily-streaming CSV into a DataFrame.
path = "../Resources/Spotify_Daily_Streaming.csv"
spotify_data = pd.read_csv(filepath_or_buffer=path)

In [4]:
# Preview the first five rows of the raw data.
spotify_data.head(n=5)

Unnamed: 0,Date,Track URL,Position,Track Name,Artist,Streams,Country
0,7/20/2017,https://open.spotify.com/track/3RXkboS74UYzN14xTqzPyY,1,,,3457,Lithuania
1,7/31/2018,https://open.spotify.com/track/77wz2VtAwxAwYOGTJrZBKT,1,"Miljonair (feat. SBMG, Lil' Kleine, Boef & Ronnie Flex)",$hirak,238729,Netherlands
2,8/2/2018,https://open.spotify.com/track/77wz2VtAwxAwYOGTJrZBKT,1,"Miljonair (feat. SBMG, Lil' Kleine, Boef & Ronnie Flex)",$hirak,245639,Netherlands
3,7/23/2017,https://open.spotify.com/track/6mw02h3qbWmuq2bJCRjRAe,1,Millionär,187 Strassenbande,387632,Germany
4,6/29/2018,https://open.spotify.com/track/2iUXsYOEPhVqEBwsqP70rE,1,Youngblood,5 Seconds of Summer,261795,Australia


### Transform 

In [5]:
# Report how many rows were read from the CSV.
print(f"We have a total of {len(spotify_data)} rows in the data set")

We have a total of 8469130 rows in the data set


In [6]:
# Inspect the column dtypes as read from the CSV (Date is still a plain object column here).
spotify_data.dtypes

Date          object
Track URL     object
Position       int64
Track Name    object
Artist        object
Streams        int64
Country       object
dtype: object

### Convert the Date column from object (string) to datetime

In [7]:
# Parse the month/day/year strings into real datetimes so the .dt accessors
# used later in the notebook are available.
spotify_data["Date"] = pd.to_datetime(
    spotify_data["Date"],
    format="%m/%d/%Y",
)

In [8]:
# Confirm the conversion: Date should now be reported as datetime64[ns].
spotify_data.dtypes

Date          datetime64[ns]
Track URL             object
Position               int64
Track Name            object
Artist                object
Streams                int64
Country               object
dtype: object

### Check for missing values and drop incomplete rows

In [9]:
# Count rows with (True) and without (False) a missing artist name.
pd.isna(spotify_data["Artist"]).value_counts()

False    8466361
True        2769
Name: Artist, dtype: int64

In [10]:
# Drop every row that has a missing value in any column, then confirm that
# no missing artist names remain in the cleaned frame.
spotify_data_wNA = spotify_data.dropna(axis="index", how="any")
pd.isna(spotify_data_wNA["Artist"]).value_counts()

False    8466353
Name: Artist, dtype: int64

### Separate Global and Country Data

In [11]:
# Rows whose Country is the literal "Global" form the worldwide chart;
# every other row belongs to a per-country chart.
is_global_row = spotify_data_wNA["Country"] == "Global"
spotify_data_global = spotify_data_wNA[is_global_row]  # Global Data Set
spotify_data_bycountry = spotify_data_wNA[~is_global_row]  # Country Data Set

### Get unique Artist List to retrieve info (Genre, Followers and Image) from Spotify API

In [12]:
# Get unique Artists from all the Data Set
spotify_artists = spotify_data_wNA.groupby("Artist").sum().sort_values(by=['Streams'],ascending=False).reset_index()
len(spotify_artists["Artist"])

11046

In [13]:
# spotify_artists = spotify_artists[~spotify_artists.Artist.isin(spotify_artists_df1["Artist"])]
# spotify_artists

### Load data into MongoDB Collection adding Spotify Genre by Artist

In [None]:
# Spotify Web API bearer token. It is read from the environment so the secret
# never lands in the notebook; export SPOTIFY_TOKEN before running this cell.
token = os.environ.get("SPOTIFY_TOKEN")
if not token:
    raise RuntimeError(
        "Set the SPOTIFY_TOKEN environment variable to a valid Spotify API bearer token"
    )

headers_dict = {"Authorization": f'Bearer {token}'}

# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Define database and collection
db = client.Spotifydb
collection = db.SpotifyArtists
# collection.drop()

for artist in spotify_artists["Artist"]:
    # Call the Spotify search API for this artist. A leading "#" is dropped
    # from the search term only; the document keeps the original name so it
    # still matches the Artist values of the streaming data when joined back.
    search_term = artist[1:] if artist.startswith("#") else artist

    # Passing the query through `params` URL-encodes it, so names containing
    # "&", "#", "+" or spaces no longer corrupt the query string.
    http_response = requests.get(
        'https://api.spotify.com/v1/search',
        params={'q': search_term, 'type': 'artist'},
        headers=headers_dict,
        timeout=30,
    )
    # Fail with a clear HTTP error (expired token, rate limit, ...) instead of
    # a KeyError on the missing "artists" key of an error payload.
    http_response.raise_for_status()
    response = http_response.json()
    artists = response["artists"]

    try:
        # Use the first search hit; any missing piece (no hit, no genre,
        # no image) sends the whole record to the placeholder values below.
        items = artists["items"][0]
        genres = items["genres"]
        genre = genres[0]
        followers = items["followers"]["total"]
        image = items["images"][0]["url"]
    except (LookupError, TypeError):
        genre = 'None'
        genres = ['Unavailable']
        followers = 'None'
        image = 'No Image Found'

    artist_dict = {'Artist': artist,
                   'Genre': genre,
                   'Genres': genres,
                   'Followers': followers,
                   'Image': image
                   }
    collection.insert_one(artist_dict)
   

In [28]:
# pd.read_json()
spotify_artists_json = json.load(open("../Resources/SpotifyArtists.json"))
spotify_artists_df = pd.DataFrame(spotify_artists_json)
spotify_artists_df[spotify_artists_df["Artist"]=="TocoParaVos"]

Unnamed: 0,Artist,Followers,Genre,Genres,Image
3014,TocoParaVos,251948,cumbia pop,[cumbia pop],https://i.scdn.co/image/ab6761610000e5eb74053255cb8a8eac221c760d


In [29]:
# Restore the leading "#" of this artist's name so it matches the Artist
# values in the streaming data. The row is selected by value rather than by a
# hard-coded index label, so the fix keeps working if the row order of the
# JSON changes and is a no-op when the name is already correct.
needs_hash_prefix = spotify_artists_df["Artist"] == "TocoParaVos"
spotify_artists_df.loc[needs_hash_prefix, "Artist"] = "#TocoParaVos"
spotify_artists_df[spotify_artists_df["Artist"]=="#TocoParaVos"]

Unnamed: 0,Artist,Followers,Genre,Genres,Image
3014,#TocoParaVos,251948,cumbia pop,[cumbia pop],https://i.scdn.co/image/ab6761610000e5eb74053255cb8a8eac221c760d


### Complete the Final Data Set for the Historical Top 200 with Genre and Image

In [30]:
# Complete Data set by country with genre
spotify_data_complete_country = pd.merge(spotify_data_bycountry, spotify_artists_df, how="left", on="Artist")
spotify_data_complete_country = spotify_data_complete_country[["Date", "Track Name", "Artist","Genre","Streams","Country","Position","Genres","Track URL","Image"]]

# Complete Data set Global with genre
spotify_data_complete_global = pd.merge(spotify_data_global, spotify_artists_df, how="left", on="Artist")
spotify_data_complete_global = spotify_data_complete_global[["Date", "Track Name", "Artist","Genre","Streams","Country","Position","Genres","Track URL","Image"]]

### Add a MonthYear column (first day of the month) to both data sets

In [39]:
# Complete Data set by country with MonthYear
# MonthYear holds the first day of each row's month as a datetime.
# Truncating through a monthly period produces the same values as formatting
# every date to an "MM-01-YYYY" string and parsing it back, without the
# per-row string round trip.

# Complete Data set by country with MonthYear
spotify_data_complete_country["MonthYear"] = (
    spotify_data_complete_country["Date"].dt.to_period("M").dt.to_timestamp()
)

# Complete Data set Global with MonthYear
spotify_data_complete_global["MonthYear"] = (
    spotify_data_complete_global["Date"].dt.to_period("M").dt.to_timestamp()
)

In [40]:
# Preview the first five rows of the per-country data set with MonthYear added.
spotify_data_complete_country.head(n=5)

Unnamed: 0,Date,Track Name,Artist,Genre,Streams,Country,Position,Genres,Track URL,Image,MonthYear
0,2018-07-31,"Miljonair (feat. SBMG, Lil' Kleine, Boef & Ronnie Flex)",$hirak,dutch hip hop,238729,Netherlands,1,"[dutch hip hop, dutch rap pop]",https://open.spotify.com/track/77wz2VtAwxAwYOGTJrZBKT,https://i.scdn.co/image/ab6761610000e5eb7a1bbd5a8b02e5ed8cfc977d,2018-07-01
1,2018-08-02,"Miljonair (feat. SBMG, Lil' Kleine, Boef & Ronnie Flex)",$hirak,dutch hip hop,245639,Netherlands,1,"[dutch hip hop, dutch rap pop]",https://open.spotify.com/track/77wz2VtAwxAwYOGTJrZBKT,https://i.scdn.co/image/ab6761610000e5eb7a1bbd5a8b02e5ed8cfc977d,2018-08-01
2,2017-07-23,Millionär,187 Strassenbande,german hip hop,387632,Germany,1,"[german hip hop, hamburg hip hop]",https://open.spotify.com/track/6mw02h3qbWmuq2bJCRjRAe,https://i.scdn.co/image/ab6761610000e5ebd0be16c471ca57e6485267aa,2017-07-01
3,2018-06-29,Youngblood,5 Seconds of Summer,boy band,261795,Australia,1,"[boy band, dance pop, electropop, pop, post-teen pop]",https://open.spotify.com/track/2iUXsYOEPhVqEBwsqP70rE,https://i.scdn.co/image/ab6761610000e5ebffe8513647c422e6d93ed94a,2018-06-01
4,2018-06-11,Youngblood,5 Seconds of Summer,boy band,276266,Australia,1,"[boy band, dance pop, electropop, pop, post-teen pop]",https://open.spotify.com/track/55S2PQgSMYAhgoTCcGCDfw,https://i.scdn.co/image/ab6761610000e5ebffe8513647c422e6d93ed94a,2018-06-01


In [41]:
# Preview the first five rows of the global data set with MonthYear added.
spotify_data_complete_global.head(n=5)

Unnamed: 0,Date,Track Name,Artist,Genre,Streams,Country,Position,Genres,Track URL,Image,MonthYear
0,2017-05-19,All Time Low,Jon Bellion,pop rap,548112,Global,164,[pop rap],https://open.spotify.com/track/1CnPYaKxTVb4LWOtiGOm0m,https://i.scdn.co/image/ab6761610000e5ebb2acdc716b2b520ccb9aebc0,2017-05-01
1,2019-01-22,Sola,Manuel Turizo,colombian pop,703901,Global,163,"[colombian pop, latin, reggaeton, reggaeton colombiano, trap latino]",https://open.spotify.com/track/675w3ACw5xZR3ODhUJlEVN,https://i.scdn.co/image/ab6761610000e5ebf787368e450e4a9c101a47e7,2019-01-01
2,2018-04-29,Paradise,George Ezra,folk-pop,601833,Global,158,"[folk-pop, modern rock, neo-singer-songwriter, pop, pop rock]",https://open.spotify.com/track/38zwkK6TtTjIW9tpYBfZ3D,https://i.scdn.co/image/ab6761610000e5eb7ddc849606ca3e6343735953,2018-04-01
3,2017-11-27,The Race,22 Savage,baton rouge rap,497874,Global,179,[baton rouge rap],https://open.spotify.com/track/0UioblV1x795s55Ur58c6c,https://i.scdn.co/image/ab67616d0000b273e03f31b8ea0ad3e9ada7e020,2017-11-01
4,2017-09-22,September,"Earth, Wind & Fire",disco,601577,Global,153,"[disco, funk, jazz funk, motown, quiet storm, soul]",https://open.spotify.com/track/1mqlc0vEP9mU1kZgTi6LIQ,https://i.scdn.co/image/9fd0a9822140cce668ee15263e1f73730152dff0,2017-09-01


## Load Countries' latitude and longitude

In [42]:
# Read the country reference table and give its four columns the names used
# by the rest of the notebook (Country is the join key for the merge below).
# NOTE(review): pandas reads the literal text "NA" as a missing value by
# default, so a country code of "NA" would come through as NaN -- confirm
# whether that matters for this file.
csv_path = "../Resources/countries.csv"
country_column_names = ['Code', 'Lat', 'Long', 'Country']
countries_df = pd.read_csv(filepath_or_buffer=csv_path)
countries_df.columns = country_column_names

In [43]:
# Preview the first five rows of the country reference table.
countries_df.head(n=5)

Unnamed: 0,Code,Lat,Long,Country
0,AD,42.546245,1.601554,Andorra
1,AE,23.424076,53.847818,United Arab Emirates
2,AF,33.93911,67.709953,Afghanistan
3,AG,17.060816,-61.796428,Antigua and Barbuda
4,AI,18.220554,-63.068615,Anguilla


In [44]:
# Attach Code, Lat and Long to every per-country row; a left join keeps rows
# whose country name has no match in the reference table.
spotify_data_complete_country = spotify_data_complete_country.merge(
    countries_df,
    how="left",
    on="Country",
)

In [45]:
# Preview the per-country data set with the Code, Lat and Long columns attached.
spotify_data_complete_country.head(n=5)

Unnamed: 0,Date,Track Name,Artist,Genre,Streams,Country,Position,Genres,Track URL,Image,MonthYear,Code,Lat,Long
0,2018-07-31,"Miljonair (feat. SBMG, Lil' Kleine, Boef & Ronnie Flex)",$hirak,dutch hip hop,238729,Netherlands,1,"[dutch hip hop, dutch rap pop]",https://open.spotify.com/track/77wz2VtAwxAwYOGTJrZBKT,https://i.scdn.co/image/ab6761610000e5eb7a1bbd5a8b02e5ed8cfc977d,2018-07-01,NL,52.132633,5.291266
1,2018-08-02,"Miljonair (feat. SBMG, Lil' Kleine, Boef & Ronnie Flex)",$hirak,dutch hip hop,245639,Netherlands,1,"[dutch hip hop, dutch rap pop]",https://open.spotify.com/track/77wz2VtAwxAwYOGTJrZBKT,https://i.scdn.co/image/ab6761610000e5eb7a1bbd5a8b02e5ed8cfc977d,2018-08-01,NL,52.132633,5.291266
2,2017-07-23,Millionär,187 Strassenbande,german hip hop,387632,Germany,1,"[german hip hop, hamburg hip hop]",https://open.spotify.com/track/6mw02h3qbWmuq2bJCRjRAe,https://i.scdn.co/image/ab6761610000e5ebd0be16c471ca57e6485267aa,2017-07-01,DE,51.165691,10.451526
3,2018-06-29,Youngblood,5 Seconds of Summer,boy band,261795,Australia,1,"[boy band, dance pop, electropop, pop, post-teen pop]",https://open.spotify.com/track/2iUXsYOEPhVqEBwsqP70rE,https://i.scdn.co/image/ab6761610000e5ebffe8513647c422e6d93ed94a,2018-06-01,AU,-25.274398,133.775136
4,2018-06-11,Youngblood,5 Seconds of Summer,boy band,276266,Australia,1,"[boy band, dance pop, electropop, pop, post-teen pop]",https://open.spotify.com/track/55S2PQgSMYAhgoTCcGCDfw,https://i.scdn.co/image/ab6761610000e5ebffe8513647c422e6d93ed94a,2018-06-01,AU,-25.274398,133.775136


## Load the Final Spotify Data Set into MongoDB

### Load Top 200 by Country

In [46]:
# spotify_data_complete_country.head()

In [47]:
# spotify_data_complete_global.head()

In [48]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Define database and collection; the collection is rebuilt from scratch on every run
db = client.Spotifydb
collection = db.Top200byCountry
collection.drop()

# Only the first MAX_ROWS rows are loaded (the existing 100000-row cap is kept,
# now as a named constant). Slicing also copes with frames shorter than the cap,
# where indexing a fixed range of row numbers would raise.
MAX_ROWS = 100000
BATCH_SIZE = 10000

rows = len(spotify_data_complete_country)
data = spotify_data_complete_country

# Document fields, in the order they are written to MongoDB.
document_fields = ['Date', 'MonthYear', 'MonthYearStr', 'Track_Name', 'Artist',
                   'Genre', 'Streams', 'Country', 'Position', 'Genres',
                   'Track_URL', 'Image', 'Code', 'Lat', 'Long']

# Columns are addressed by name rather than by position, so adding or
# reordering columns upstream cannot silently shift a field onto the wrong
# value. Positional access is what made MonthYearStr carry the country code:
# it read the same column position as Code.
# MonthYearStr is now the MonthYear date rendered as text.
# TODO confirm the YYYY-MM-DD format is what consumers of this collection expect.
# Streams and Position are stored as floats, as before.
country_documents = (
    data.iloc[:MAX_ROWS]
    .assign(MonthYearStr=lambda frame: frame["MonthYear"].dt.strftime("%Y-%m-%d"))
    .rename(columns={"Track Name": "Track_Name", "Track URL": "Track_URL"})
    .astype({"Streams": float, "Position": float})
    .loc[:, document_fields]
)

# Insert in batches instead of issuing one insert_one call per row.
for start in range(0, len(country_documents), BATCH_SIZE):
    batch = country_documents.iloc[start:start + BATCH_SIZE].to_dict("records")
    collection.insert_many(batch)

### Load Global Top 200

In [27]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Define database and collection; the collection is rebuilt from scratch on every run
db = client.Spotifydb
collection = db.Top200Global
collection.drop()

BATCH_SIZE = 10000

rows = len(spotify_data_complete_global)
data = spotify_data_complete_global

# Document fields, in the order they are written to MongoDB.
document_fields = ['Date', 'MonthYear', 'Track_Name', 'Artist', 'Genre',
                   'Streams', 'Country', 'Position', 'Genres', 'Track_URL',
                   'Image']

# Columns are addressed by name rather than by position, so adding or
# reordering columns upstream cannot silently shift a field onto the wrong
# value. Streams and Position are stored as floats, as before.
global_documents = (
    data
    .rename(columns={"Track Name": "Track_Name", "Track URL": "Track_URL"})
    .astype({"Streams": float, "Position": float})
    .loc[:, document_fields]
)

# Insert every row of the global data set, in batches instead of one
# insert_one call per row.
for start in range(0, rows, BATCH_SIZE):
    batch = global_documents.iloc[start:start + BATCH_SIZE].to_dict("records")
    collection.insert_many(batch)