## Import Libraries



In [18]:
#import libraries
from cm_api import get_api_token, get_track_metadata, requests, get_chart_data, get_tiktok_chart_data, get_artist_id
import pandas as pd
import re
from cm_config import token
import pickle

## Refresh Token for ChartMetric API

In [None]:
#save refresh token to variable

REFRESH_TOKEN = token['refresh_token']

## Get api_token

In [3]:
#get api_token and save it to a variable
api_token = get_api_token(REFRESH_TOKEN)


## Get Shazam Chart Data

In [None]:
#shazam chart data
shazam_charts = get_chart_data(api_token, '28795304', 'shazam', '2019-08-04')

In [None]:
shazam_charts

In [None]:
#parse shazam data

roses_bucket = []
for item in shazam_charts:
    item_tuple = (item['id'], item['rank'], item['added_at'], item['code2'], item['city'], item['pre_rank'],
                 item['peak_rank'], item['peak_date'], item['release_dates'][0])
    roses_bucket.append(item_tuple)
    

    

In [None]:
#create pandas dataframe for shazam chart postions

df = pd.DataFrame(roses_bucket, columns=['shazam_id', 'rank', 'added_at', 'code2', 'city', 'pre_rank', 
                                         'peak_rank', 'peak_date', 'release_date'])

## Data Cleaning

In [None]:
#strip white space in code2 column
df['code2'] = df['code2'].apply(lambda x: x.strip())

In [None]:
df.info()

In [None]:
#convert dates columns to datetime

df['added_at'] = pd.to_datetime(df['added_at'], format='%Y/%m/%d')
df['peak_date'] = pd.to_datetime(df['peak_date'], format='%Y/%m/%d')
df['release_date'] = pd.to_datetime(df['release_date'], format='%Y/%m/%d')

In [None]:
df

## Save Dataframe to CSV file

In [None]:
df.to_csv('datasets/roses_shazam_chart.csv')

In [None]:
#open csv file
df = pd.read_csv('datasets/roses_shazam_chart', index_col=0)
df

In [None]:
#save file to json
df.to_json('datasets/roses_shazam_chart.json')

In [None]:
#save as excel file
df.to_excel('datasets/roses_shazam_chart.xlsx')

In [None]:
df_json = pd.read_json('datasets/roses_shazam_chart.json')
df_json

In [None]:
US_market =  df_json[df_json['code2'] == 'US']

In [None]:
US_market['added_at'].min()

In [None]:
df_json.iloc[538]

In [None]:
df_json['added_at'].sort_values(ascending=False)

In [None]:
US_market

## Get itunes Top Data

In [None]:
itunes_top = get_chart_data(api_token, '28795304', 'itunes_top', '2019-10-05')

In [None]:
#parse itunes data

roses_Itunes_bucket = []
for item in itunes_top:
    item_tuple = (item['id'], item['code2s'][0],item['code2'], item['release_dates'][0], item['rank'], item['added_at'], 
                    item['pre_rank'], item['peak_rank'], item['peak_date'])
    roses_Itunes_bucket.append(item_tuple)
   
    

## Clean iTunes Data

In [None]:
#convert itunes data to dataframe

itunes_df = pd.DataFrame(roses_Itunes_bucket, columns=['id', 'code2s', 'code2', 'release date', 'rank', 'added_at', 'pre_rank', 'peak_rank', 'peak_date'])

In [None]:


#convert dates columns to datetime

itunes_df['release date'] = pd.to_datetime(itunes_df['release date'], format='%Y/%m/%d')
itunes_df['added_at'] = pd.to_datetime(itunes_df['added_at'], format='%Y/%m/%d')
itunes_df['peak_date'] = pd.to_datetime(itunes_df['peak_date'], format='%Y/%m/%d')

## Save iTunes Data to CSV File

In [None]:
itunes_df.to_csv('datasets/roses_itunes_chart.csv')

## Retrieve TikTok Chart data

In [None]:
date_range = ['2020-08-08', '2020-08-15']

for date in date_range:
    #for each date grab the top 100 tik tok tracks
    tt_chart = get_tiktok_chart_data(api_token, 'tracks', date, 'weekly')
    
    data_bucket = []
    for track in tt_chart:
        #for each track on the chart, parse the data into a tuple
        track_tuple = (track['name'], track['tiktok_artist_names'][0], track['cm_track'], track['album_label'][0], track['release_dates'][0], track['rank'],
                      track['weekly_posts'], track['added_at'], track['velocity'], track['pre_rank'], track['peak_rank'], 
                      track['peak_date'], track['time_on_chart'], track['rankStats'][0]['rank'], track['rankStats'][0]['weekly_posts'], 
                      track['rankStats'][0]['timestp'], track['rankStats'][-1]['rank'], track['rankStats'][-1]['weekly_posts'], 
                      track['rankStats'][-1]['timestp'])
        #add each tuple to a giant list where all the desired track's metadata will live
        data_bucket.append(track_tuple)
    #create a dataframe with correct column names
    df = pd.DataFrame(data_bucket, columns=['track_name','artist_name', 'cm_id', 'label', 'release_date', 'rank', 
                                      'weekly_posts', 'add_date', 'velocity', 'pre_rank', 'peak_rank', 'peak_date', 'time_on_chart', 
                                      'rank_week_start', 'weekly_posts_start', 'week_start_date', 'rank_week_end', 'weekly_posts_end', 'week_end_date'])
    #convert date columns to actual datetimes
    df['release_date'] = pd.to_datetime(df['release_date'], format='%Y/%m/%d')
    df['add_date'] = pd.to_datetime(df['add_date'], format='%Y/%m/%d')
    df['peak_date'] = pd.to_datetime(df['peak_date'], format='%Y/%m/%d')
    df['week_start_date'] = pd.to_datetime(df['week_start_date'], format='%Y/%m/%d')
    df['week_end_date'] = pd.to_datetime(df['week_end_date'], format='%Y/%m/%d')
    #save dataframes as csv to a folder
    df.to_csv('datasets/tiktokweekly_{}.csv'.format(date))

In [None]:
import os
import re

ttwk_200502 = pd.read_csv('datasets/tiktokweekly_2020-05-02.csv', index_col=0)
ttwk_200509 = pd.read_csv('datasets/tiktokweekly_2020-05-09.csv', index_col=0)
ttwk_200516 = pd.read_csv('datasets/tiktokweekly_2020-05-16.csv', index_col=0)
ttwk_200523 = pd.read_csv('datasets/tiktokweekly_2020-05-23.csv', index_col=0)
ttwk_200530 = pd.read_csv('datasets/tiktokweekly_2020-05-30.csv', index_col=0)
ttwk_200606 = pd.read_csv('datasets/tiktokweekly_2020-06-06.csv', index_col=0)
ttwk_200613 = pd.read_csv('datasets/tiktokweekly_2020-06-13.csv', index_col=0)
ttwk_200620 = pd.read_csv('datasets/tiktokweekly_2020-06-20.csv', index_col=0)
ttwk_200627 = pd.read_csv('datasets/tiktokweekly_2020-06-27.csv', index_col=0)
ttwk_200704 = pd.read_csv('datasets/tiktokweekly_2020-07-04.csv', index_col=0)
ttwk_200718 = pd.read_csv('datasets/tiktokweekly_2020-07-18.csv', index_col=0)
ttwk_200725 = pd.read_csv('datasets/tiktokweekly_2020-07-25.csv', index_col=0)
ttwk_200801 = pd.read_csv('datasets/tiktokweekly_2020-08-01.csv', index_col=0)
ttwk_200808 = pd.read_csv('datasets/tiktokweekly_2020-08-08.csv', index_col=0)
ttwk_200815 = pd.read_csv('datasets/tiktokweekly_2020-08-15.csv', index_col=0)

In [None]:
master_df = pd.concat([ttwk_200502,ttwk_200509,ttwk_200516,ttwk_200523, ttwk_200530, ttwk_200606, ttwk_200613, ttwk_200620, ttwk_200627,
          ttwk_200704, ttwk_200718, ttwk_200725, ttwk_200801, ttwk_200808, ttwk_200815])
        

In [None]:
#save master df to csv file

master_df.to_csv('datasets/historic_ttwk.csv')

In [None]:
master_df = pd.read_csv('datasets/historic_ttwk.csv', index_col=0)
master_df

In [None]:
#how many unique songs

print("Number of Unique Tracks: ", master_df['track_name'].nunique())
print("Number of Unique Artists: ", master_df['artist_name'].nunique())

## Add CM ID for each artist

### Open Up Master Dataframe of Weekly TikTok Chart Data (08-15-20 thru 05-02-20)

In [None]:
master_df = pd.read_csv('datasets/historic_ttwk.csv', index_col=0)
master_df

### Isolate Artists with Top 10 Ranked Tracks

In [None]:
#lets find a list of artists who have been ranked in the past 4 months

top10_rankings = master_df[master_df['rank'] < 11]
top10_ttartists = list(top10_rankings['artist_name'].unique())
top10_ttartists

### Create DIctionary of Artist Chartmetric IDs

In [None]:
#lets create a dictionary consisting of artists and their associated chartmetric IDs so that we can retreive social media
#data for each artist

cm_artistIDs = []
for artist in top10_ttartists:
    cm_id = get_artist_id(api_token, artist, 'artists')
    cm_artistIDs.append(cm_id)
    
cm_artistIDs_dict = {}

for key in top10_ttartists:
    for value in cm_artistIDs:
        cm_artistIDs_dict[key] = value
        cm_artistIDs.remove(value)
        break


### Pickle Dictionary Obj

In [None]:
#pickle dictionary so I don't have to waste my api calls and I can just open this up whenever I want
# cm_artistIDs_dict

file_to_write = open("cm_artistID_dictionary.pickle", "wb")
pickle.dump(cm_artistIDs_dict, file_to_write)

file_to_write.close()

### Open Pickled Dictionary

In [None]:
file_input = open("cm_artistID_dictionary.pickle", "rb")
cm_artistIDs = pickle.load(file_input)
file_input.close()

In [None]:
cm_artistIDs

## Get Fan Metrics for Top 10 Ranked Artists

In [None]:
from cm_api import get_fan_metrics

In [None]:
#lets get spotify popularity metrics for each artist and save to a csv file
artist_no_data_avail = []
success_list = []
for artist in cm_artistIDs:
    if isinstance(cm_artistIDs[artist], type(None)):
        print(artist, ' has no ChartMetric ID')
        print('/n')
        artist_no_data_avail.append(artist)
        continue

    else:
        data = get_fan_metrics(api_token, cm_artistIDs[artist], 'spotify', '2019-09-02', 'popularity')
        data_bucket = []
        if isinstance(data, type(None)):
            continue
        success_list.append(artist)
        for record in data['popularity']:
            if record:
                record_tuple = (record['timestp'], artist, cm_artistIDs[artist] , record['value'])
                data_bucket.append(record_tuple)
            else:
                continue
        df = pd.DataFrame(data_bucket, columns=['timestamp', 'artist', 'cm_artist_id', 'popularity'])
        df.to_csv('datasets/top10artists_popularity_historic_data/{}_spotpop.csv'.format(artist))
        


print('Out of {} artists who had tracks ranked in the top 10, only {} had IDs'.format(len(cm_artistIDs), len(success_list)))

## Merge artist dat into one DF

In [39]:
import os
import glob
import pandas as pd
#set working directory
os.chdir('datasets/top10artists_popularity_historic_data')

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'datasets/top10artists_popularity_historic_data'

In [40]:
#find all csv files in the folder
#use glob pattern matching -> extension = 'csv'
#save result in list -> all_filenames
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
# print(all_filenames)


In [41]:
#combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
#export to csv
combined_csv.to_csv('top10artists_popmetrics_mstr.csv', index=False)

In [42]:
top10artists_mstr = pd.read_csv('top10artists_popmetrics_mstr.csv', index_col=0)
top10artists_mstr

Unnamed: 0_level_0,timestamp,artist,cm_artist_id,popularity
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2019-09-02T00:00:00.000Z,BLACKPINK,206548,85
1,2019-09-03T00:00:00.000Z,BLACKPINK,206548,84
2,2019-09-04T00:00:00.000Z,BLACKPINK,206548,84
3,2019-09-05T00:00:00.000Z,BLACKPINK,206548,84
4,2019-09-06T00:00:00.000Z,BLACKPINK,206548,84
...,...,...,...,...
350,2020-08-29T00:00:00.000Z,YFN Lucci,5314,71
351,2020-08-30T00:00:00.000Z,YFN Lucci,5314,71
352,2020-08-31T00:00:00.000Z,YFN Lucci,5314,71
353,2020-09-01T00:00:00.000Z,YFN Lucci,5314,71


count          6634
unique           28
top       Pop Smoke
freq            355
Name: artist, dtype: object