# Extract, Transform and Load Spotify Data Set

<hr>

## Exploration and summary transformation

### Dependencies

In [1]:
import pandas as pd
import datetime as dt
from sqlalchemy import create_engine
import requests
import pymongo
import matplotlib.pyplot as plt

### Extract Data Set

In [2]:
# Load the raw records from the JSON export into a DataFrame.
path = "../Resources/songs.json"
spotify_data = pd.read_json(path)

# Alternative source kept for reference: presumably the same data in CSV
# form -- TODO confirm before switching.
# path = "../Resources/Spotify_Daily_Streaming.csv"
# spotify_data = pd.read_csv(path)

# Quick sanity check: non-null count per column.
# spotify_data.count()

In [3]:
# Show the distinct values of the "Country" column (includes the "Global" aggregate).
spotify_data["Country"].unique()

array(['Netherlands', 'Germany', 'Australia', 'New Zealand', 'Panama',
       'Chile', 'Taiwan', 'South Africa', 'Japan', 'Spain', 'Argentina',
       'Paraguay', 'Turkey', 'Italy', 'Brazil', 'Iceland', 'Norway',
       'Switzerland', 'United Kingdom', 'Austria', 'Ireland', 'Sweden',
       'Hungary', 'Czech Republic', 'Poland', 'Slovakia', 'France',
       'Thailand', 'Singapore', 'Viet Name', 'Indonesia', 'Malaysia',
       'Philippines', 'Hong Kong', 'Dominican Republic', 'Peru',
       'Bolivia', 'Portugal', 'Denmark', 'Costa Rica', 'El Salvador',
       'Global', 'Greece', 'Guatemala', 'Israel', 'Mexico', 'Romania',
       'United States', 'Canada', 'Colombia', 'Ecuador', 'Estonia',
       'Latvia', 'Lithuania', 'Malta', 'Nicaragua', 'Honduras',
       'Bulgaria', 'Luxembourg', 'Uruguay', 'Finland', 'Belgium', 'India'],
      dtype=object)

### Separate Global and Country Data

In [11]:
# Rows labelled "Global" go to their own frame; everything else stays in
# spotify_data so the per-country summaries below do not count the global
# rows as well.
is_global = spotify_data["Country"] == "Global"
spotify_data_global = spotify_data[is_global]  # Global Data Set
spotify_data = spotify_data[~is_global]  # Country Data Set

<hr>

### Summary Streams by Month & Year

In [6]:
# Total streams per day.
# Only "Streams" is aggregated: calling .sum() on the whole frame would also
# try to add up the text columns (track name, artist, URL, ...), which on
# pandas >= 2.0 concatenates strings or raises instead of being skipped.
# groupby sorts its keys, so the result is already ordered by Date.
streams_by_date = spotify_data.groupby("Date", as_index=False)["Streams"].sum()

# Render the day as an "MM/DD/YYYY" string (expects Date to be a datetime column).
streams_by_date["Date"] = streams_by_date["Date"].dt.strftime("%m/%d/%Y")

In [7]:
# Parse the "MM/DD/YYYY" strings back into datetime values so the .dt
# accessor can be used on the column afterwards.
parsed_dates = pd.to_datetime(streams_by_date["Date"], format="%m/%d/%Y")
streams_by_date["Date"] = parsed_dates

In [19]:
# Bucket every day into its month, keyed by the first day of that month
# formatted as "YYYY-MM-01".
streams_by_date["MonthYear"] = streams_by_date["Date"].dt.strftime("%Y-%m-01")

# Sum only "Streams": the frame also carries the datetime "Date" column, and
# datetimes cannot be summed (TypeError on pandas >= 2.0).
# Resulting columns, in order: MonthYear, Streams.
streams_by_monthYear = (
    streams_by_date.groupby("MonthYear", as_index=False)["Streams"].sum()
)

<hr>

### Summary Streams by Artist

In [18]:
# Total streams per artist, highest first.
# "Streams" is selected before summing so the text and datetime columns of
# the frame are never passed to .sum() (string concatenation / TypeError on
# pandas >= 2.0).
# Resulting columns, in order: Artist, Streams.
streams_by_artist = (
    spotify_data
    .groupby("Artist", as_index=False)["Streams"]
    .sum()
    .sort_values(by="Streams", ascending=False)
    .reset_index(drop=True)
)

<hr>

### Summary Streams by Song

In [68]:
# Total streams per track, highest first. Tracks are identified by URL.
# Only "Streams" is summed so text/datetime columns never reach .sum().
track_totals = (
    spotify_data
    .groupby("Track URL", as_index=False)["Streams"]
    .sum()
    .sort_values(by="Streams", ascending=False)
)

# One label row per track: the first occurrence of each URL supplies the
# track name and artist. De-duplicating BEFORE the merge keeps the join at
# one row per track instead of one row per chart entry.
track_labels = spotify_data.drop_duplicates(subset=["Track URL"], keep="first")[
    ["Track URL", "Track Name", "Artist"]
]

# A left merge preserves the descending-streams order of track_totals.
# Resulting columns, in order: Track Name, Artist, Streams, Track URL.
streams_by_track = track_totals.merge(track_labels, on="Track URL", how="left")[
    ["Track Name", "Artist", "Streams", "Track URL"]
]

# Loading Data - MongoDB connection

<hr>

### Load Streams by Month & Year

In [22]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Define database and collection
db = client.Spotifydb
collection = db.StreamsByDate

# Drop first so re-running this cell replaces the collection instead of
# appending a second copy of every document.
collection.drop()

# One document per month. Streams is written as a string, as before.
month_documents = [
    {'MonthYear': month_year, 'Streams': str(streams)}
    for month_year, streams in zip(
        streams_by_monthYear["MonthYear"], streams_by_monthYear["Streams"]
    )
]

# A single bulk write instead of one round trip per row. insert_many rejects
# an empty list, so skip the call when there is nothing to load.
if month_documents:
    collection.insert_many(month_documents)

<hr>

### Load Streams by Artist

In [23]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Define database and collection
db = client.Spotifydb
collection = db.StreamsByArtist

# Drop first so re-running this cell replaces the collection instead of
# appending a second copy of every document.
collection.drop()

# One document per artist. Streams is written as a string, as before.
artist_documents = [
    {'Artist': artist, 'Streams': str(streams)}
    for artist, streams in zip(
        streams_by_artist["Artist"], streams_by_artist["Streams"]
    )
]

# A single bulk write instead of one round trip per row. insert_many rejects
# an empty list, so skip the call when there is nothing to load.
if artist_documents:
    collection.insert_many(artist_documents)

### Load Streams by Track

In [69]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Define database and collection
db = client.Spotifydb
collection = db.StreamsByTrack

# Drop first so re-running this cell replaces the collection instead of
# appending a second copy of every document.
collection.drop()

# One document per track. Streams is written as a string, as before.
track_documents = [
    {
        'Track': track_name,
        'Artist': artist,
        'Streams': str(streams),
        'Track URL': track_url,
    }
    for track_name, artist, streams, track_url in zip(
        streams_by_track["Track Name"],
        streams_by_track["Artist"],
        streams_by_track["Streams"],
        streams_by_track["Track URL"],
    )
]

# A single bulk write instead of one round trip per row. insert_many rejects
# an empty list, so skip the call when there is nothing to load.
if track_documents:
    collection.insert_many(track_documents)

### Filter Data

In [None]:
# Keep only the chart rows for one track on one day, then export them.
# NOTE(review): an earlier draft compared against
# "2019-01-02 00:00:00+00:00", which suggests Date may be timezone-aware --
# confirm that the plain "2/5/2019" comparison matches the intended day.
filter1 = spotify_data['Date'] == "2/5/2019"
filter2 = spotify_data['Track Name'] == "Saturday Nights"

# Boolean indexing selects the matching rows directly. The previous
# where(..., inplace=True) blanked every other row to NaN first, which
# upcasts integer columns to float and mutates the frame in place.
# dropna() is kept so matching rows with a missing value are still excluded.
spotify_data = spotify_data[filter1 & filter2].dropna()

# NOTE(review): if the cell that separates out the "Global" rows has run,
# this frame holds per-country rows only, despite the file name -- confirm
# the intended output name.
spotify_data.to_csv("Global.csv")