# Extract, Transform and Load Spotify Data Set
<hr>

### Dependencies

In [None]:
import pandas as pd
import datetime as dt
from sqlalchemy import create_engine
import requests
import pymongo

### Extract Data Set

In [None]:
# Relative path to the raw daily-streaming CSV export.
path = "../Resources/Spotify_Daily_Streaming.csv"

# Read the entire file into a DataFrame with pandas' default parsing options.
spotify_data = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Preview the first five rows (head() returns five rows by default).
spotify_data.head()

Unnamed: 0,Date,Track URL,Position,Track Name,Artist,Streams,Country
0,2017-07-20,https://open.spotify.com/track/3RXkboS74UYzN14...,1,,,3457,Lithuania
1,2018-07-31,https://open.spotify.com/track/77wz2VtAwxAwYOG...,1,"Miljonair (feat. SBMG, Lil' Kleine, Boef & Ron...",$hirak,238729,Netherlands
2,2018-08-02,https://open.spotify.com/track/77wz2VtAwxAwYOG...,1,"Miljonair (feat. SBMG, Lil' Kleine, Boef & Ron...",$hirak,245639,Netherlands
3,2017-07-23,https://open.spotify.com/track/6mw02h3qbWmuq2b...,1,Millionär,187 Strassenbande,387632,Germany
4,2018-06-29,https://open.spotify.com/track/2iUXsYOEPhVqEBw...,1,Youngblood,5 Seconds of Summer,261795,Australia


### Transform 

In [None]:
# Report how many rows were read from the CSV.
total_rows = len(spotify_data)
print(f"We have a total of {total_rows} rows in the data set")

We have a total of 8469130 rows in the data set


In [None]:
# Show the dtype pandas assigned to each column when reading the CSV.
spotify_data.dtypes

Date          object
Track URL     object
Position       int64
Track Name    object
Artist        object
Streams        int64
Country       object
dtype: object

### Convert the Date column from object (string) to datetime

In [None]:
# Parse the Date column into datetime values using an explicit
# month/day/year format string.
# NOTE(review): the preview output earlier in this notebook shows dates such as
# 2017-07-20 (year-month-day). If the CSV really stores them that way, this
# format will not match and should presumably be "%Y-%m-%d" -- confirm
# against the raw file before changing it.
spotify_data["Date"] = pd.to_datetime(spotify_data["Date"], format="%m/%d/%Y")

In [None]:
# Show the column dtypes again to confirm that Date is now a datetime column.
spotify_data.dtypes

Date          datetime64[ns]
Track URL             object
Position               int64
Track Name            object
Artist                object
Streams                int64
Country               object
dtype: object

### Check for missing values (NaN) and drop the affected rows

In [None]:
# Flag the rows whose Artist is missing, then tally missing vs. present.
artist_is_missing = spotify_data["Artist"].isna()
artist_is_missing.value_counts()

False    8466361
True        2769
Name: Artist, dtype: int64

In [None]:
# Drop every row that has a missing value in any column (not only Artist).
spotify_data_wNA = spotify_data.dropna(axis=0, how="any")

# Re-run the Artist check on the cleaned frame; only False should remain.
remaining_missing = spotify_data_wNA["Artist"].isna()
remaining_missing.value_counts()

False    8466353
Name: Artist, dtype: int64

### Create Mongodb connection

In [None]:
# Open a PyMongo client against the local MongoDB server on port 27017
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Pick the target database and the collection that will hold the chart rows
db = client["Spotifydb"]
collection = db["Top200"]

### Load data into MongoDB Collection

In [None]:
# Start from an empty collection so re-running this cell does not
# duplicate documents.
collection.drop()

# Only a sample of the cleaned data set is loaded; raise this limit to
# load more rows.
MAX_ROWS = 10000

# Total number of cleaned rows available (kept for reference).
rows = len(spotify_data_wNA)

# head() never reads past the end of the frame, so a data set shorter than
# MAX_ROWS is loaded in full instead of raising IndexError.
sample = spotify_data_wNA.head(MAX_ROWS)

# Build one document per row. Columns are taken by position (0-6), matching
# the order Date, Track URL, Position, Track Name, Artist, Streams, Country.
documents = [
    {
        'Date': date,
        'Track URL': track_url,
        # Position and Streams are stored as strings, exactly as before.
        'Position': str(position),
        'Track Name': track_name,
        'Artist': artist,
        'Streams': str(streams),
        'Country': country,
    }
    for date, track_url, position, track_name, artist, streams, country
    in sample.iloc[:, :7].itertuples(index=False, name=None)
]

# A single bulk insert replaces one network round trip per row.
# insert_many rejects an empty list, so skip the call when nothing was built.
if documents:
    collection.insert_many(documents)

# Number of documents loaded into the collection.
x = len(documents)

10000
