# Genre & Artist Nested List Wrangling

## Imports

In [1]:
import pandas as pd
import numpy as np

## Set display options

In [2]:
# Show up to 75 columns when a DataFrame is displayed.
pd.options.display.max_columns = 75

## Import data

#### Import table for all chart album tracks

In [3]:
# Track-level table for all chart albums.
df = pd.read_csv(filepath_or_buffer="../data/allchartalbumtracks.csv")

In [4]:
#df = pd.read_csv("allchartalbumtracks.csv", 
                 #converters={'track_artists_popularity': pd.eval, 'track_artists_genres': pd.eval, 
                            # 'track_artists_ids': pd.eval, 'track_artists_popularity': pd.eval, 
                            # 'track_artists_followers': pd.eval})

#### Import table for yearly chart data

In [5]:
# Yearly chart table.
dfa = pd.read_csv(filepath_or_buffer="../data/mastercharttable.csv")

#### Drop original index column

In [6]:
# Both tables carry the original index as an "Unnamed: 0" column; remove it.
df = df.drop(columns="Unnamed: 0")
dfa = dfa.drop(columns="Unnamed: 0")

#### Import table of wrangled genres

In [8]:
# Remove the row-display cap so the whole genre table can be inspected,
# then load the hand-wrangled genre mapping.
pd.options.display.max_rows = None
genres_df = pd.read_csv(filepath_or_buffer="../data/genre_wrangling.csv")

In [9]:
# NOTE(review): row label 220 is hard-coded; which original genre it holds is
# not visible here -- confirm it still points at the intended row if the CSV changes.
genres_df.loc[220, 'New Genre'] = 'No Genre Assigned'

In [10]:
# Manual overrides of the consolidated genre for a few original genres.
# NOTE(review): the original code matched only "show_tunes" (underscore) while
# every sibling key is space-separated, so that override probably never fired.
# Both spellings are matched here; confirm against genre_wrangling.csv.
for original_genres, new_genre in (
    (["movie tunes", "show tunes", "show_tunes", "hollywood"], "Soundtrack"),
    (["mellow gold"], "Rock"),
):
    genres_df.loc[genres_df["Original_Genre"].isin(original_genres), "New Genre"] = new_genre

## Wrangle Nested Columns In All Track Table

#### Eval track artist, track_artists_ids, track_artists_popularity, track_artists_followers columns for proper list format

In [11]:
# These columns are read from CSV as strings; evaluate every cell so it becomes
# the Python object (presumably a list) that the string spells out.
# NOTE(review): eval() executes whatever expression the CSV contains. If the
# cells are plain literals, ast.literal_eval would be the safer parser --
# confirm before switching.
for list_column in (
    'track_artists',
    'track_artists_ids',
    'track_artists_popularity',
    'track_artists_followers',
):
    df[list_column] = df[list_column].apply(lambda cell: eval(cell))

#### Eval genre lists for proper list format

In [15]:
# Turn the stringified genre cells into Python objects.
# NOTE(review): eval() runs arbitrary expressions from the CSV; consider
# ast.literal_eval if the cells are plain literals.
df['track_artists_genres'] = df['track_artists_genres'].apply(
    lambda raw_cell: eval(raw_cell)
)

## Wrangle Nested Columns In Yearly Chart Track Table

#### Eval track artist, track_artists_ids, track_artists_popularity, track_artists_followers columns for proper list format

In [16]:
# Same treatment for the yearly chart table: evaluate each string cell so it
# becomes the Python object (presumably a list) it spells out.
# NOTE(review): eval() executes whatever expression the CSV contains. If the
# cells are plain literals, ast.literal_eval would be the safer parser --
# confirm before switching.
for list_column in (
    'track_artists',
    'track_artists_ids',
    'track_artists_popularity',
    'track_artists_followers',
):
    dfa[list_column] = dfa[list_column].apply(lambda cell: eval(cell))

#### Eval genre lists for proper list format

In [20]:
# Turn the stringified genre cells of the chart table into Python objects.
# NOTE(review): eval() runs arbitrary expressions from the CSV; consider
# ast.literal_eval if the cells are plain literals.
dfa['track_artists_genres'] = dfa['track_artists_genres'].apply(
    lambda raw_cell: eval(raw_cell)
)

#### Eval album_artist, album_artists_ids, album_artists_popularity, album_artists_followers columns for proper list format

In [21]:
# This function generates an error on row 12. Work in progress to bugfix. 
# See function below joining track_artist on album_id from tracks dataframe
# dfa['album_artists'] = dfa['album_artists'].apply(lambda v: eval(v))
# dfa['album_artists']

In [22]:
# Evaluate the album-level string cells into Python objects. album_artists
# itself is not included: its eval is commented out above because it errors.
# NOTE(review): eval() executes whatever expression the CSV contains. If the
# cells are plain literals, ast.literal_eval would be the safer parser --
# confirm before switching.
for list_column in (
    'album_artists_ids',
    'album_artists_popularity',
    'album_artists_followers',
):
    dfa[list_column] = dfa[list_column].apply(lambda cell: eval(cell))

#### Eval genre lists for proper list format

In [25]:
# Turn the stringified album genre cells into Python objects.
# NOTE(review): eval() runs arbitrary expressions from the CSV; consider
# ast.literal_eval if the cells are plain literals.
dfa['album_artists_genres'] = dfa['album_artists_genres'].apply(
    lambda raw_cell: eval(raw_cell)
)

## Wrangle genres

### Select primary track artist, artist id, popularity, followers from list

#### Primary artist selection

In [26]:
# The first entry of each track_artists list is kept as the primary artist.
track_artists_list = df["track_artists"].tolist()
artist_list = [artists[0] for artists in track_artists_list]
df["track_artist"] = artist_list

In [29]:
# NOTE: album_artist is taken from the chart table's *track_artists* column
# (first entry), not from album_artists, whose eval is commented out above
# because it raises an error.
album_artists_list = dfa['track_artists'].tolist()
album_artist_list = [artists[0] for artists in album_artists_list]
dfa["album_artist"] = album_artist_list

#### Primary artist id selection

In [32]:
# The first id in each track_artists_ids list is kept as the primary artist id.
track_artists_ids_list = df["track_artists_ids"].tolist()
artist_id_list = [artist_ids[0] for artist_ids in track_artists_ids_list]
df["track_artist_id"] = artist_id_list

#### Primary artist popularity selection

In [35]:
# The first popularity value in each list is kept as the primary artist's.
track_artists_popularity_list = df["track_artists_popularity"].tolist()
artist_popularity_list = [scores[0] for scores in track_artists_popularity_list]
df["track_artist_popularity"] = artist_popularity_list

#### Primary artist followers selection

In [38]:
# The first follower count in each list is kept as the primary artist's.
track_artists_followers_list = df["track_artists_followers"].tolist()
artist_followers_list = [counts[0] for counts in track_artists_followers_list]
df["track_artist_followers"] = artist_followers_list

### Reshape track_artists_genres

#### Create dictionary to contain old genre : new genre pairs

In [41]:
# Starts empty; holds the old-genre -> new-genre pairs.
genres_dict = dict()

#### Loop through table to add to dictionary of pairs

In [42]:
# Map every value in column position 1 to the value in column position 2 of the
# same row (presumably Original_Genre -> New Genre; confirm the column order).
# Zipping the two columns positionally avoids passing index *labels* to iloc,
# which was only correct while genres_df kept its default 0..n-1 index.
# Later rows still win when a key repeats.
genres_dict.update(zip(genres_df.iloc[:, 1], genres_df.iloc[:, 2]))

#### Create list of track_artists_genres variable

In [43]:
# Pull the genre cells out of the frame as a plain Python list.
track_artists_genres_list = df["track_artists_genres"].tolist()

#### Select the first artist's genre list only if the cell value is a list of genre lists for multiple artists, otherwise return just the single artist's genre list if there's only one.

In [44]:
# When a cell is a list of per-artist genre lists, keep only the first artist's
# list; a flat genre list is kept unchanged. The emptiness guard stops an empty
# cell from raising IndexError on [0] (it is passed through as is), and
# isinstance replaces the exact-type comparison.
reshaped_list = [
    genres[0] if genres and isinstance(genres[0], list) else genres
    for genres in track_artists_genres_list
]
df["track_artists_genres"] = reshaped_list

#### Consolidate track_artists_genres by replacing old genres with new genres

In [46]:
# Translate each original genre to its consolidated genre via genres_dict.
# NOTE(review): a genre missing from genres_dict becomes None in the result
# list (dict.get default) -- confirm that is intended.
df["track_artists_genres_consolidate"] = df["track_artists_genres"].apply(
    lambda genres: list(map(genres_dict.get, genres))
)

#### Select the first new genre as primary genre

In [47]:
# Materialise the consolidated genre lists, then spot-check one row
# (position 75) by displaying its first genre.
main_genre_list = df["track_artists_genres_consolidate"].tolist()
test_genre = main_genre_list[75]
test_genre[0]

'Hip-hop'

In [48]:
# The first consolidated genre becomes the track's main genre; tracks with an
# empty genre list get the "No Genre Assigned" placeholder.
reshaped_main_genre_list = [
    "No Genre Assigned" if not genres else genres[0]
    for genres in main_genre_list
]
df['main_genre'] = reshaped_main_genre_list

#### Add genre information for albums without a genre assigned

In [50]:
# Tracks still lacking a main genre, and the distinct albums they belong to.
has_no_genre = df['main_genre'] == 'No Genre Assigned'
df_unassigned = df[has_no_genre]
df_unassigned.album_name.unique()

array(['Moana (Original Motion Picture Soundtrack/Deluxe Edition)',
       'The Greatest Showman (Original Motion Picture Soundtrack)',
       'Frozen 2 (Original Motion Picture Soundtrack/Deluxe Edition)',
       'Frozen (Original Motion Picture Soundtrack / Deluxe Edition)',
       'Spider-Man: Into the Spider-Verse (Soundtrack From & Inspired by the Motion Picture)',
       'Aladdin (Original Motion Picture Soundtrack)',
       'Love, Simon (Original Motion Picture Soundtrack)',
       'Fifty Shades Freed (Original Motion Picture Soundtrack)',
       'The Fate of the Furious: The Album', 'Suicide Squad: The Album',
       'Vol. 2 Guardians of the Galaxy: Awesome Mix Vol. 2 (Original Motion Picture Soundtrack)',
       'Beauty and the Beast (Original Motion Picture Soundtrack/Deluxe Edition)',
       'Sing (Original Motion Picture Soundtrack / Deluxe)',
       'Fifty Shades Darker (Original Motion Picture Soundtrack)',
       'EPIC AF', "NOW That's What I Call Music, Vol. 61",
      

In [52]:
# Hand-assigned main genre per album. Every track of a listed album gets the
# genre, whatever main_genre it had before. The duplicated 'Fifty Shades
# Darker' statement of the original is collapsed into a single entry.
album_genre_overrides = {
    'Moana (Original Motion Picture Soundtrack/Deluxe Edition)': "Soundtrack",
    'The Greatest Showman (Original Motion Picture Soundtrack)': "Soundtrack",
    'Frozen 2 (Original Motion Picture Soundtrack/Deluxe Edition)': "Soundtrack",
    'Frozen (Original Motion Picture Soundtrack / Deluxe Edition)': "Soundtrack",
    'Spider-Man: Into the Spider-Verse (Soundtrack From & Inspired by the Motion Picture)': "Soundtrack",
    'Aladdin (Original Motion Picture Soundtrack)': "Soundtrack",
    'Love, Simon (Original Motion Picture Soundtrack)': "Soundtrack",
    'Fifty Shades Freed (Original Motion Picture Soundtrack)': "Soundtrack",
    'The Fate of the Furious: The Album': "Soundtrack",
    'Suicide Squad: The Album': "Soundtrack",
    'Vol. 2 Guardians of the Galaxy: Awesome Mix Vol. 2 (Original Motion Picture Soundtrack)': "Soundtrack",
    'Sing (Original Motion Picture Soundtrack / Deluxe)': "Soundtrack",
    'Fifty Shades Darker (Original Motion Picture Soundtrack)': "Soundtrack",
    'EPIC AF': "Hip-hop",
    'Descendants 2 (Original TV Movie Soundtrack)': "Soundtrack",
    "NOW That's What I Call Music, Vol. 61": "Misc",
    "NOW That's What I Call Music, Vol. 62": "Misc",
    'EPIC LIT': "Edm",
    'Beauty and the Beast (Original Motion Picture Soundtrack/Deluxe Edition)': "Soundtrack",
}
for override_album, override_genre in album_genre_overrides.items():
    df.loc[df["album_name"] == override_album, "main_genre"] = override_genre

In [53]:
# Recompute the tracks still lacking a main genre.
still_unassigned = df['main_genre'] == 'No Genre Assigned'
df_unassigned = df[still_unassigned]

#### Function for dummy coding genre columns

In [54]:
def dummy_code(l, new_col):
    """Return 1 if ``new_col`` is contained in ``l``, otherwise 0.

    ``l`` is any container supporting ``in``; below it is called with a
    track's consolidated genre list.
    """
    return 1 if new_col in l else 0

#### Dummy code genre columns creating a column for each new genre with the 0/1 presence of that genre for this track

In [55]:
# One 0/1 column per distinct consolidated genre: 1 when the genre appears in
# the track's consolidated genre list. The genre is passed to dummy_code as
# its second positional argument.
for new_col in genres_df['New Genre'].unique():
    df[new_col] = df["track_artists_genres_consolidate"].apply(dummy_code, args=(new_col,))

#### Wrangle release date format

In [56]:
# Parse album_release_date as YYYY-MM-DD and derive the calendar year from it.
# NOTE(review): the explicit format presumably rejects year-only or year-month
# dates -- confirm that none occur in the data.
release_dates = pd.to_datetime(dfa['album_release_date'], format='%Y-%m-%d')
dfa['release_date_datetime'] = release_dates
dfa['release_year'] = release_dates.dt.year

#### Export to csv

In [57]:
# Write both recoded tables back to the data folder. The index is written too
# (to_csv default).
for frame, destination in (
    (df, "../data/AllChartAlbumTracksRecoded.csv"),
    (dfa, "../data/MasterChartTableRecoded.csv"),
):
    frame.to_csv(destination)