# ECMM451 Data Science Project

In [None]:
# General
import os
import requests
import json
from selenium import webdriver
import time
import pandas as pd
import numpy as np
import re
import random
import pickle
from datetime import datetime
import string

# Text preprocessing
import nltk
from nltk import bigrams
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import ast

# Model helpers
from scipy import sparse
from scipy.sparse import csr_matrix, vstack
from textblob import TextBlob
from langdetect import detect_langs

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Models
import gensim
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC, LinearSVC

# Plotting
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
from adjustText import adjust_text
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Data Collection

## 1.1. Spotify Calls
- Step 1: Getting the token to access Spotify API
- Step 2: Getting artist information. This dictionary was compiled manually from Billboard data
- Step 3: Find all artists. Querying Spotify API to get artist information

### Step 1: Getting the token

In [None]:
# Spotify credentials: create an app at https://developer.spotify.com/ and
# paste its Client Id and Client Secret into the two variables below.
spotify_client_id = '<Your Spotify Client Id>'
spotify_client_secret = '<Your Spotify Client Secret>'

spotify_account_base_url = "https://accounts.spotify.com"
spotify_api_base_url = 'https://api.spotify.com/v1'
spotify_token_url = spotify_account_base_url + "/api/token"
spotify_headers = {'Content-Type': 'application/x-www-form-urlencoded'}
spotify_params = {'grant_type': 'client_credentials',
                  'client_id': spotify_client_id,
                  'client_secret': spotify_client_secret}

# Send the credentials as the form-encoded request body (`data=`), which is what
# the Content-Type header announces. Passing them via `params=` would put the
# client secret in the URL query string, where it can end up in logs.
spotify_response = requests.post(spotify_token_url,
                                 data=spotify_params,
                                 headers=spotify_headers,
                                 timeout=30)
# Fail loudly on bad credentials instead of a KeyError on 'access_token' below.
spotify_response.raise_for_status()
token_response = spotify_response.json()
spotify_access_token = token_response['access_token']

### Step 2: Get artist info

In [None]:
######################################################
# Year-wise list of artists collected from Billboard #
######################################################
# Keys are chart years (as strings); values are lists of artist names.
# Some names appear under more than one year (e.g. "J. Cole", "Lady Gaga",
# "Kanye West", "Kesha").

artists = {
    '2022': [
        "Bad bunny", "Taylor Swift", "Harry Styles", "Drake",
        "Morgan Wallen", "Doja Cat", "Ed Sheeran", "Adele", "The Weeknd",
        "Lil Baby", "Future", "Justin Bieber", "Post Malone", "Jack Harlow", 
        "Kendrick Lamar", "Luke Combs", "Juice Wrld", "Glass Animals", "Lil Durk", 
        "Lil Nas X", "Dua Lipa", "Elton John", "YoungBoy never broke again", "Rod Wave", 
        "Kanye West", "Olivia Rodrigo", "Beyoncé", "The Kid LAROI", "Billie Eilish", 
        "Kodak Black", "Lizzo", "Gunna", "Imagine Dragons", "Latto", "Chris Stapleton", 
        "Summer Walker", "Walker Hayes", "Polo G", "SZA", "Bruno Mars", "Zach Bryan",
        "Steve Lacy", "Eminem", "Bailey Zimmerman", "Tyler, the Creator", "Kate Bush", 
        "Karol G", "Kane Brown", "Gayle", "Nardo Wick", "Cole Swindell", "Ariana Grande",
        "Lil Uzi Vert", "BTS", "Cody Johnson","J. Cole", "Megan Thee Stallion", "Nicki Minaj",
        "Chris Brown", "XXXTentacion", "Anderson .Paak", "Machine Gun Kelly", "Pop Smoke",
        "The Beatles", "MoneyBagg Yo", "Fleetwood Mac", "Queen", "Jason Aldean", "Em Beihold",
        "Michael Jackson", "OneRepublic", "Travis Scott", "PlayBoi Carti", "Giveon",
        "Mariah Carey", "Lady Gaga", "Frank Ocean", "Yeat", "Red Hot Chili Peppers", 
        "Andy Williams", "Jordan Davis", "DJ Khaled", "JID", "Guns N' Roses", "Joji",
        "Charlie Puth", "Michael Bublé", "Carrie Underwood", "Metallica", "Dove Cameron",
        "Stephanie Beatriz", "Baby Keem", "Rauw Alejandro", "Brent Faiyaz", "Jessica Darrow",
        "Chencho Corleone", "ColdPlay", "CKay", "Nirvana", "21 Savage"
    ],
    '2021': [
        "DaBaby", "Young Thug", "Cardi B", "Gabby Barrett", "Pooh Shiesty", "24kGoldn",
        "Lil Tjay", "Roddy Ricch", "Luke Bryan", "Masked Wolf", "Dan + Shay", "Thomas Rhett",
        "AJR", "Kid Cudi", "H.E.R.", "Florida Georgia Line", "Khalid", "Maroon 5", "Lewis Capaldi",
        "Ava Max", "Tate McRae", "Migos", "King Von", "Saweetie", "AC/DC", "Kali Uchis", 
        "Miley Cyrus", "blackbear", "Lee Brice", "Eagles", "Sam Smith", "Yung Bleu", "Miranda Lambert", 
        "Trippie Redd", "CJ", "Sam Hunt", "Jhené Aiko", "Eric Church", "Creedence Clearwater Revival"
    ],
    '2020': [
        "Jonas Brothers", "Maren Morris", "Halsey", "Tones And I", "Selena Gomez", "Camila Cabello",
        "Lil Mosey", "SAINt JHN", "Trevor Daniel", "Blake Shelton", "Arizona Zervas", "A Boogie Wit da Hoodie",
        "Mustard", "Mac Miller", "Shawn Mendes", "JACKBOYS", "Céline Dion", "Don Toliver", "YNW Melly",
        "Old Dominion", "Tory Lanez", "NF", "NLE Choppa", "Lil Wayne", "U2", "Surfaces", "Trans-Siberian Orchestra",
        "Lil Tecca", "Jon Pardi", "SHAED", "Marshmello", "Kenny Chesney", "Maddie & Tae"
    ],
    '2019': [
        "Panic! At The Disco", "Meek Mill", "P!nk", "Swae Lee", "The Rolling Stones", "5 Seconds Of Summer",
        "Lauren Daigle", "Ella Mai", "Bradley Cooper", "Normani", "twenty one pilots", "Billy Ray Cyrus",
        "Bastille", "Offset", "City Girls", 'Blueface', "Billy Joel", "Backstreet Boys", "Paul McCartney",
        "Bazzi", "6ix9ine", "Sheck Wes", "Nipsey Hussle", "benny blanco"
    ],
    '2018': [
        "Justin Timberlake", "Demi Lovato", "Bebe Rexha", "G-Eazy", "Lil Pump", "Logic", "JAY-Z", "J Balvin",
        "Ozuna", "Brett Young", "Childish Gambino", "Portugal. The Man", "Lil Skies", "EXO", "BlocBoy JB",
        "Lauv", "Keith Urban", "Metro Boomin", "Pentatonix", "Rihanna", "The Carters", "Ty Dolla $ign", 
        "Gucci Mane", "Journey", "Foo Fighters"
    ],
    '2017': [
        "The Chainsmokers", "J. Cole", "Lady Gaga", "Alessia Cara", "Niall Horan", "Rae Sremmurd",
        "Big Sean", "James Arthur", "Katy Perry", "Depeche Mode", "Luis Fonsi", "Maluma", "French Montana", 
        "Daddy Yankee", "Liam Payne", "Calvin Harris", "Zayn", "Julia Michaels", "Linkin Park", 
        "Chance The Rapper", "Bryson Tiller", "Quavo", "Tom Petty And The Heartbreakers", "Roger Waters",
        "John Mayer", "2 Chainz", "Zac Brown Band", "Kesha", "Sia", "Zedd"
    ],
    '2016': [
        "Bruce Springsteen", "Meghan Trainor", "Prince", "Desiigner", "Fetty Wap",
        "One Direction", "Fifth Harmony", "Lukas Graham", "Flo Rida", "Kevin Gates", "DNCE", "Mike Posner",
        "Daya", "Madonna", "Major Lazer", "Ellie Goulding", "James Bay", "Troye Sivan", "Jeremih", "Wiz Khalifa",
        "X Ambassadors", "Britney Spears", "Elle King", "David Bowie", "Kiiara", "Tim McGraw", "Dierks Bentley",
        "Disturbed", "Jennifer Lopez", "Nick Jonas", "gnash", "DRAM"
    ],
    '2015': [
        "Mark Ronson", "WALK THE MOON", "Hozier", "Fall Out Boy", "Jason Derulo", "silentó", "OMI", "Tove Lo",
        "Rachel Platten", "Andy Grammer", "Pitbull", "David Guetta", "Iggy Azalea", "Little Big Town", 
        "Trey Songz", "Shania Twain", "Omarion", "DJ Snake", "Ne-Yo", "Vance Joy", "Kid Ink", "Rich Homie Quan",
        "Neil Diamond", "Mumford & Sons", "Lana Del Rey", "A$AP Rocky", "Usher", "Grateful Dead", "Garth Brooks",
        "Kelly Clarkson", "Kanye West", "T-Wayne", "Enrique Iglesias"
    ],
    '2014': [
        "Pharrell Williams", "Lorde", "John Legend", "Avicii", "MAGIC!", "Charli XCX", "Nico & Vinz", "Shakira",
        "Passenger", "Brantley Gilbert", "Idina Menzel", "Lady A", "YG", "American Authors", "Juicy J", "ScHoolboy Q",
        "George Strait", "Paramore", "Austin Mahone", "Snoop Dogg", "Aloe Blacc", "Martin Garrix", "Disclosure",
        "Romeo Santos", "A Great Big World", "Lil Jon", "Arctic Monkeys", "Cher", "Becky G", 
        "Bob Marley & The Wailers", "Rascal Flatts", "MKTO", "Sara Bareilles", "Christina Aguilera"
    ],
    '2013': [
        "Macklemore & Ryan Lewis", "Robin Thicke", "The Lumineers", "Baauer", "Phillip Phillips", "Bon Jovi",
        "Daft Punk", "Hunter Hayes", "Alicia Keys", "fun.", "Kesha", "will.i.am", "The Band Perry", "Miguel",
        "Darius Rucker", "Anna Kendrick", "AWOLNATION", "T.I.", "Capital Cities", "Of Monsters And Men",
        "Rod Stewart", "Swedish House Mafia", "Wanz", "Avril Lavigne", "Brad Paisley", "Wale", "Muse",
        "Randy Houser", "Icona Pop"
    ]
}

### Step 3: Find all artists

In [None]:
# Look up every Billboard artist on Spotify and record id, genres, followers
# and popularity for the result whose name matches exactly (case-insensitive).
auth_headers = {'Authorization': f'Bearer {spotify_access_token}'}
# Names already queried, so an artist listed under several years is looked up once
unique_artists = []
artist_search_params = {
    'q': "",
    'type': "artist",
    'market': 'GB',
    'limit': 10,
}
# https://api.spotify.com/v1/search
spotify_artist_url = spotify_api_base_url + '/search'
# Rows are collected in a list and turned into a DataFrame once at the end;
# concatenating one row at a time re-copies the whole frame on every artist.
artist_rows = []

for key, value in artists.items():
    print(f'Year: {key}')
    print(len(value))
    for artist in value:
        if artist in unique_artists:
            continue
        # Record the name; the original never did, so duplicates were re-queried
        # and ended up as duplicate rows.
        unique_artists.append(artist)

        print(artist)
        artist_search_params['q'] = f'artist:{artist}'
        spotify_artist_response = requests.get(spotify_artist_url,
                                               params=artist_search_params,
                                               headers=auth_headers)
        # Surface expired tokens / rate limits instead of a KeyError below
        spotify_artist_response.raise_for_status()

        json_response = spotify_artist_response.json()
        items = json_response['artists']['items']

        # First search hit whose name equals the requested artist, if any
        json_item = next(
            (item for item in items if item['name'].lower() == artist.lower()),
            None,
        )

        if json_item is not None:
            artist_rows.append({
                'Artist_Name': json_item['name'],
                'Arist_Id': json_item['id'],
                'Genres': str(json_item['genres']),
                'Followers': json_item['followers']['total'],
                'Popularity': json_item['popularity']
            })
        else:
            print("not found!!!!!!!!")
            # Guard against an empty result list (previously an IndexError)
            if items:
                print(f"{items[0]['name'].lower()} != {artist.lower()}")

# NOTE: the 'Arist_Id' spelling is kept because it is the stored column name.
artists_info_df = pd.DataFrame(
    artist_rows,
    columns=['Artist_Name', 'Arist_Id', 'Genres', 'Followers', 'Popularity'],
)

## 1.2. Genius.com Calls

### Step 1: Getting genius id for artists

In [None]:
# Get an API token from https://docs.genius.com/ and paste it into TOKEN
TOKEN = '<Your Genius Token>'
# Use HTTPS so the bearer token below is never sent in clear text
base_url = "https://api.genius.com"
# Every Genius API request authenticates with this header
genius_headers = {'Authorization': f'Bearer {TOKEN}'}
search_url = base_url + "/search"
artist_url = base_url + "/artists"

In [None]:
# Map each Billboard artist name to the Genius id of the first search hit
# whose primary artist has exactly that name (case-insensitive).
genius_artist_id = {}

for year, names in artists.items():
    print(f'Year: {year}')
    for artist in names:
        response = requests.get(search_url, params={'q': artist}, headers=genius_headers)
        artist_search_response = response.json()
        wanted_name = artist.lower()
        for hit in artist_search_response['response']['hits']:
            primary_artist = hit['result']['primary_artist']
            if primary_artist['name'].lower() == wanted_name:
                genius_artist_id[artist] = primary_artist['id']
                # First exact match wins; ignore the remaining hits
                break

In [None]:
# Fetching Lyrics
# Step 1: Get all links to lyrics
# Songs are requested per artist, sorted by popularity. The API returns
# 20 results per page by default, so every artist is paged through until
# the response reports no next page.
artist_params = {'sort': 'popularity'}

# One dict per song; the DataFrame is built once after the loop because
# concatenating row by row re-copies the whole frame for every song.
song_rows = []

for key, value in genius_artist_id.items():
    page = 1

    while True:
        print(key)
        print(page)
        artist_params['page'] = page
        response = requests.get(f'{artist_url}/{value}/songs', params=artist_params, headers=genius_headers)
        # Stop on HTTP errors rather than failing later on a missing 'response' key
        response.raise_for_status()
        artist_songs_response = response.json()

        for song in artist_songs_response['response']['songs']:
            print(song['title'])
            song_rows.append({
                'Artist_Name': key,
                'Featured_Artists': [featured['name'] for featured in song['featured_artists']],
                'Title': song['title'],
                'Lyrics_Url': song['url'],
                'Language': song['language'],
            })

        page += 1

        if not artist_songs_response['response']['next_page']:
            break

# The original column list contained '' where 'Featured_Artists' belongs, which
# produced an empty-named column and pushed Featured_Artists to the end.
lyrics_df = pd.DataFrame(
    song_rows,
    columns=['Artist_Name', 'Featured_Artists', 'Title', 'Lyrics_Url', 'Language'],
)

In [None]:
# Step 2: Use the links stored in lyrics_df to scrape the lyrics.
# One pickle per artist is written to ./artists so an interrupted run can be
# resumed: artists whose pickle already exists are skipped.


def _artist_pickle_path(artist_name):
    """Return the pickle path for an artist.

    '/' is replaced by '_' so a name such as "AC/DC" maps to a single file
    (./artists/AC_DC.pkl) both when checking for and when writing the pickle.
    """
    return f"./artists/{artist_name.replace('/', '_')}.pkl"


all_artists = lyrics_df['Artist_Name'].unique()
processed_artist = dict.fromkeys(all_artists, False)

# Independent copies, so adding the 'Lyrics' column below does not write to a
# slice of lyrics_df.
lyrics_df_list_by_artist = [
    lyrics_df[lyrics_df["Artist_Name"] == artist].copy() for artist in all_artists
]

error_rows = []
error_idx = []

# `Options` was never imported; webdriver.ChromeOptions is available from the
# existing selenium import. Headless/incognito must be passed as Chrome
# arguments -- an `incognito` attribute on the options object is ignored.
option = webdriver.ChromeOptions()
option.add_argument("--headless")
option.add_argument("--incognito")
browser = webdriver.Chrome(options=option)

# to_pickle fails if the target directory does not exist
os.makedirs("./artists", exist_ok=True)

try:
    for lyrics_df_by_artist in lyrics_df_list_by_artist:
        artist_name = lyrics_df_by_artist['Artist_Name'].iloc[0]
        pickle_path = _artist_pickle_path(artist_name)

        if os.path.exists(pickle_path) or processed_artist[artist_name]:
            continue

        print(artist_name)
        lyrics = []
        no_error = True

        for index, song in lyrics_df_by_artist.iterrows():
            try:
                print('working on index: ', index)
                browser.get(song['Lyrics_Url'])
                titles_element = browser.find_element("xpath", '/html/body/div[1]/main/div[2]/div[3]/div/div/div[3]')
                lyrics.append(titles_element.text)
            except Exception as e:
                # Best-effort scraping: log the failure, restart the browser and
                # store an empty lyric so the list stays aligned with the rows.
                print("error!! index: ", index, repr(e))
                browser.quit()
                if os.name == 'nt':
                    # Clean up stray driver processes (Windows-only command)
                    os.system("taskkill /im chromedriver.exe")
                browser = webdriver.Chrome(options=option)
                error_idx.append(index)
                error_rows.append(song)
                lyrics.append('')
                no_error = False

        if no_error:
            processed_artist[artist_name] = True

        lyrics_df_by_artist['Lyrics'] = lyrics
        lyrics_df_by_artist.to_pickle(pickle_path)
finally:
    # Always release the browser, even if the run is interrupted
    browser.quit()

# One row per song whose page could not be scraped
error_df = pd.DataFrame(error_rows)

In [None]:
# Combine the per-artist pickle files into one DataFrame.
directory = "path/to/stored/pickle/files"

# NOTE: unpickling can execute arbitrary code -- only load files produced by
# the scraping step, never pickles from an untrusted source.
artist_frames = []
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".pkl"):
        print(filename)
        artist_frames.append(pd.read_pickle(os.path.join(directory, filename)))

# DataFrame.append was removed in pandas 2.0; a single concat over all frames
# replaces it and avoids re-copying the accumulated data for every file.
# pd.concat raises on an empty list, so fall back to an empty frame.
if artist_frames:
    artist_data_df = pd.concat(artist_frames, ignore_index=True)
else:
    artist_data_df = pd.DataFrame()

## Note: Web scraping takes more than 4 days without multiprocessing. The collected and pre-processed dataset has been saved and can be provided on request. The zip file of the data exceeds the size limit on eBART.

# Exploratory Data Analysis

In [None]:
def n_songs_artists(df):
    """Bar-plot song counts for the artists with the most, median and fewest songs.

    Songs are counted per 'Artist_Name' (non-null 'Title' values). The five
    largest counts, the five around the middle of the sorted counts and the
    five smallest are drawn as three coloured groups on a log-scaled y axis.
    """
    songs_per_artist = df.groupby('Artist_Name')['Title'].count().sort_values()
    midpoint = len(songs_per_artist) // 2

    # (counts, bar colour, legend label) for each group, drawn in this order
    groups = [
        (songs_per_artist.tail(5), 'darkgreen', 'Top 5'),
        (songs_per_artist[midpoint - 2:midpoint + 3], 'mediumseagreen', 'Middle 5'),
        (songs_per_artist.head(5), 'lightgreen', 'Bottom 5'),
    ]

    plt.figure(figsize=(10, 6))
    for counts, colour, label in groups:
        plt.bar(counts.index, counts.values, color=colour, label=label, alpha=0.7)

    plt.xlabel('Artist')
    plt.ylabel('Number of Songs')
    plt.title('Number of Songs Vs Artists')

    plt.yscale('log')

    # Artist names are long, so draw them vertically
    plt.xticks(rotation=90)
    plt.yticks()

    plt.legend()

    plt.tight_layout()
    plt.show()

In [None]:
################################
################################
#.        Network Graph        #
################################
################################
def remove_other_artists(artist_df):
    """Return a copy of `artist_df` whose 'Featured_Artists' lists are filtered.

    A featured artist is kept only if the name also occurs in the
    'Artist_Name' column and differs from the row's own artist. Order and
    repeated names within a list are preserved; the input frame is not
    modified.

    Args:
        artist_df: DataFrame with an 'Artist_Name' column and a
            'Featured_Artists' column holding an iterable of names per row.

    Returns:
        A new DataFrame with the same rows and the filtered lists.
    """
    # Set membership instead of scanning a list for every featured name
    known_artists = set(artist_df['Artist_Name'].unique())

    filtered = [
        [name for name in featured if name in known_artists and name != main_artist]
        for main_artist, featured in zip(artist_df['Artist_Name'], artist_df['Featured_Artists'])
    ]

    artist_df_copy = artist_df.copy()
    # Positional assignment of the whole column also works when the index has
    # duplicate labels, which the row-wise `.at` assignment could not handle.
    artist_df_copy['Featured_Artists'] = filtered
    return artist_df_copy


# Build a directed collaboration graph (artist -> featured artist) and export
# its nodes and weighted edges to CSV.
# NOTE: pickle can execute arbitrary code on load; only read trusted files.
df_saved = pd.read_pickle("./Datasets/finalproject/network_df.pkl")

# Keep only featured artists that are themselves in the dataset, then drop
# songs without any remaining featured artist.
df = remove_other_artists(df_saved)
df = df[df['Featured_Artists'].map(len) > 0]

# One row per (artist, featured artist) pair
df_graph = df[['Artist_Name', 'Featured_Artists']].copy()
df_graph = df_graph.explode('Featured_Artists')

# Count how many songs each (artist, featured artist) pair shares
song_counts = df_graph.groupby(['Artist_Name', 'Featured_Artists']).size().reset_index(name='song_count')

# The edge weight is the number of shared songs
song_counts['weight'] = song_counts['song_count']

# Reset the index
song_counts = song_counts.reset_index(drop=True)

# Turn df into graph.
# NOTE(review): `nx` (networkx) is not imported in the visible import cell;
# add `import networkx as nx` there before running this cell.
# The original passed an undefined `retweet_counts` here; the edge list is `song_counts`.
G = nx.from_pandas_edgelist(song_counts, 'Artist_Name', 'Featured_Artists', edge_attr='weight', create_using=nx.DiGraph())
pos = nx.spring_layout(G, k=0.1)  # specify layout for visual

f, ax = plt.subplots(figsize=(10, 10))
plt.style.use('ggplot')
nodes = nx.draw_networkx_nodes(G, pos, node_size=20, alpha=0.8)
nodes.set_edgecolor('k')
nx.draw_networkx_edges(G, pos, width=0.5, alpha=0.2, edge_color='black')

# Get a list of nodes
nodes = list(G.nodes())
# Get a list of edges with their corresponding weights
edges = [(u, v, G[u][v]['weight']) for u, v in G.edges()]
# Create a DataFrame for nodes
nodes_df = pd.DataFrame(nodes, columns=['ID'])
# Create a DataFrame for edges
edges_df = pd.DataFrame(edges, columns=['Source', 'Target', 'Weight'])
# Save nodes and edges to CSV
nodes_df.to_csv('./nodes.csv', index=False)
edges_df.to_csv('./edges.csv', index=False)

# CSV files created were used in Gephi to plot the network graph

# Pre processing

### Genre

In [None]:
# fix issue with trap and rap
def genre_pre_processing(genre_list):
    """Encode a list of genre labels as a multi-hot vector over the main genres.

    Every label is mapped to at most one main genre: among all main-genre and
    secondary-genre keywords contained in the label, the one that starts
    earliest in the label wins (ties go to the keyword listed first). This is
    what makes 'trap ...' count as 'trap' although it also contains 'rap'.
    Labels containing no keyword are ignored.

    Args:
        genre_list: either a list of genre strings or its string
            representation (e.g. "['pop', 'dance pop']").

    Returns:
        A numpy int array with one entry per main genre, 1 where at least one
        label mapped to that genre and 0 elsewhere.
    """
    main_genres = ['pop', 'rock', 'trap', 'hip hop', 'rap', 'dance', 'r&b', 'indie', 'country', 'metal', 'permanent wave', 'reggaeton', 'house', 'neo mellow', 'afrofuturism', 'soul', 'reggae', 'movie tunes']

    # Keywords that do not name a main genre themselves but imply one
    secondary_genres = {
        'edm': 'dance',
        'techno': 'dance',
        'alt z': 'hip hop',
        'mellow gold': 'rock',
        'drill': 'hip hop',
        'adult standards': 'pop',
        'hollywood': 'pop'
    }

    # Accept an already-parsed list as well as the stringified form
    if isinstance(genre_list, str):
        genre_list = ast.literal_eval(genre_list)
    labels = [label.strip() for label in genre_list]

    # (keyword, main genre it maps to); main genres first so they win ties
    keyword_map = [(genre, genre) for genre in main_genres] + list(secondary_genres.items())

    genre_array = np.zeros(len(main_genres), dtype=int)
    for label in labels:
        matches = [
            (label.find(keyword), target)
            for keyword, target in keyword_map
            if keyword in label
        ]
        if matches:
            # min() returns the first entry with the smallest start position
            _, target = min(matches, key=lambda match: match[0])
            genre_array[main_genres.index(target)] = 1

    return genre_array

def invoke_genre_pre_processing(song_df):
    """Return a copy of `song_df` with a 'Genres_array' column.

    The new column holds the result of `genre_pre_processing` applied to each
    value of the 'Genres' column; the input frame is left untouched.
    """
    encoded_genres = song_df['Genres'].apply(genre_pre_processing)
    result = song_df.copy()
    result['Genres_array'] = encoded_genres
    return result

### Profanity and Politeness Score

In [None]:
# Lexicon files are read once per path and reused; the scores are computed for
# every song, so re-reading the file on each call is wasted work.
_WORD_SET_CACHE = {}


def _load_word_set(path):
    """Return the lines of a one-word-per-line file as a set, read once per path."""
    if path not in _WORD_SET_CACHE:
        # `with` closes the handle; the original left it open on every call
        with open(path) as handle:
            _WORD_SET_CACHE[path] = set(handle.read().splitlines())
    return _WORD_SET_CACHE[path]


def _lexicon_score(lyrics, lexicon_path):
    """Return 100 * (whitespace-separated words found in the lexicon) / len(lyrics)."""
    if not lyrics:
        # Empty lyrics previously raised ZeroDivisionError
        return 0.0
    lexicon = _load_word_set(lexicon_path)
    hits = sum(1 for word in lyrics.split() if word in lexicon)
    # NOTE(review): the denominator is the number of characters, not the
    # number of words. Kept as-is so existing scores stay comparable --
    # confirm whether a per-word ratio was intended.
    return (hits / len(lyrics)) * 100


def profanity_score(lyrics):
    """Score `lyrics` by how many of its words appear in the ./curse word list.

    Args:
        lyrics: lyrics text; words are split on whitespace and compared
            exactly (case-sensitive) with the list entries.

    Returns:
        100 * matching words / number of characters in `lyrics`, or 0.0 for
        empty lyrics.
    """
    return _lexicon_score(lyrics, "./curse")


def politeness_score(lyrics):
    """Score `lyrics` by how many of its words appear in the ./love_affection word list.

    Args:
        lyrics: lyrics text; words are split on whitespace and compared
            exactly (case-sensitive) with the list entries.

    Returns:
        100 * matching words / number of characters in `lyrics`, or 0.0 for
        empty lyrics.
    """
    return _lexicon_score(lyrics, "./love_affection")

def invoke_profanity_politeness_score(song_df, col_name):
    """Return a copy of `song_df` with 'profanity' and 'politeness' columns.

    Both columns are computed from the text in `col_name` using
    `profanity_score` and `politeness_score` respectively.
    """
    lyrics_text = song_df[col_name]
    scored = song_df.copy()
    scored['profanity'] = lyrics_text.apply(profanity_score)
    scored['politeness'] = lyrics_text.apply(politeness_score)
    return scored

### Lyrics Preprocessing Start

In [None]:
def remove_non_eng_and_songs_without_lyrics(song_df, col_name):
    """Keep only English songs that have real, sufficiently long lyrics.

    A row is kept when its 'Language' is 'en' and the text in `col_name`
    - does not contain the Genius "lyrics not released" placeholder,
    - does not contain the "instrumental" placeholder, and
    - has more than 50 space-separated tokens.
    Rows whose lyrics are not a string (e.g. missing values) are dropped.

    Args:
        song_df: DataFrame with a 'Language' column and the lyrics column.
        col_name: name of the column holding the lyrics text.

    Returns:
        A new DataFrame with the surviving rows and all original columns.
    """
    string_lyrics_not_avail = "Lyrics for this song have yet to be released. Please check back once the song has been released"
    string_instrumental = "This song is an instrumental"

    def has_usable_lyrics(text):
        return (
            isinstance(text, str)
            and string_lyrics_not_avail not in text
            and string_instrumental not in text
            and len(text.split(' ')) > 50
        )

    english = song_df[song_df['Language'] == 'en']
    # A boolean Series is used as the mask. The original list mask was empty
    # when no English row existed, which pandas treats as "select no columns",
    # so the following column lookup raised KeyError.
    keep = english[col_name].map(has_usable_lyrics).astype(bool)
    return english.loc[keep].copy()

In [None]:
def remove_round_brackets(song_df, col_name):
    """Return a copy of `song_df` with '(' and ')' stripped from the lyrics.

    The cleaned text is stored in a new 'Lyrics_without_rb' column and the
    source column `col_name` is dropped. Only the bracket characters are
    removed; the text between them is kept.
    """
    bracket_pattern = re.compile(r'(\(|\))')
    cleaned = song_df[col_name].map(lambda text: bracket_pattern.sub('', text))

    result = song_df.copy()
    result['Lyrics_without_rb'] = cleaned
    return result.drop(columns=[col_name])

In [None]:
def remove_square_brackets(song_df, col_name):
    """Return a copy of `song_df` with square-bracketed spans removed from the lyrics.

    Every '[...]' span that contains no further brackets is deleted together
    with its content. The result is stored in a new 'Lyrics_without_sb'
    column and the source column `col_name` is dropped.
    """
    section_pattern = re.compile(r"\[[^\[\]]*\]")
    cleaned = song_df[col_name].map(lambda text: section_pattern.sub('', text))

    result = song_df.copy()
    result['Lyrics_without_sb'] = cleaned
    return result.drop(columns=[col_name])

In [None]:
def convert_lyrics_to_sentences(lyrics):
    """Split lyrics on newlines and return the non-blank lines, whitespace-trimmed."""
    trimmed_lines = (line.strip() for line in lyrics.split('\n'))
    return [line for line in trimmed_lines if line]

def fetch_lines_from_lyrics(song_df, col_name):
    """Return a copy of `song_df` with the lyrics split into lists of lines.

    The lists produced by `convert_lyrics_to_sentences` are stored in a new
    'Lyrics_lines' column and the source column `col_name` is dropped.
    """
    lines_per_song = song_df[col_name].apply(convert_lyrics_to_sentences).tolist()

    result = song_df.copy()
    result['Lyrics_lines'] = lines_per_song
    return result.drop(columns=[col_name])

In [None]:
def remove_new_line_character(lyrics):
    """Return `lyrics` on a single line.

    Newlines at the start and end of the text are removed; every remaining
    newline is replaced by a space.
    """
    # Strip first: once the newlines have been turned into spaces there is
    # nothing left for strip('\n') to remove (the original did it in that
    # order, leaving stray leading/trailing spaces).
    return lyrics.strip('\n').replace('\n', ' ')

def fetch_nlc_removed_lyrics(song_df, col_name):
    """Return a copy of `song_df` with newline-free lyrics.

    `remove_new_line_character` is applied to `col_name`; the result goes
    into a new 'Lyrics_without_nlc' column and the source column is dropped.
    """
    flattened = song_df[col_name].apply(remove_new_line_character)

    result = song_df.copy()
    result['Lyrics_without_nlc'] = flattened
    return result.drop(columns=[col_name])

In [None]:
def remove_punctuation(lyrics):
    """Return `lyrics` with every ASCII punctuation character (string.punctuation) deleted."""
    # A translation table mapping each punctuation code point to None deletes it
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return lyrics.translate(deletion_table)

def token_stemming(lyrics_tokens):
    """Tokenize the text with NLTK's word_tokenize and return the Porter stem of each token."""
    tokens = word_tokenize(lyrics_tokens)
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))
    

def to_lower_and_remove_punctuations(song_df, col_name):
    """Return a copy of `song_df` with lyrics normalised into stemmed tokens.

    Each text in `col_name` is lower-cased, stripped of punctuation with
    `remove_punctuation` and turned into a list of stems with
    `token_stemming`. The lists are stored in a new 'Lyrics_clean' column and
    the source column is dropped.
    """
    def normalise(text):
        return token_stemming(remove_punctuation(text.lower()))

    result = song_df.copy()
    result['Lyrics_clean'] = song_df[col_name].apply(normalise)
    return result.drop(columns=[col_name])

In [None]:
def remove_line_with_substring(lyrics):
    """Drop every line of `lyrics` that matches one of the junk patterns.

    A line is removed when it contains 'Translations', any digit,
    '<word> Contributors', '<word> Lyrics' or 'Embed'. The remaining lines
    are joined back with newlines.
    """
    junk_patterns = [r'Translations', r'\d+', r'\w+ Contributors', r'\w+ Lyrics', r'Embed']

    def is_junk(line):
        return any(re.search(pattern, line) for pattern in junk_patterns)

    return '\n'.join(line for line in lyrics.splitlines() if not is_junk(line))

def invoke_remove_line_with_substring(song_df, col_name):
    song_df_copy = song_df.copy()
    
    song_df_copy['Lyrics_without_junk'] = song_df[col_name].apply(remove_line_with_substring)
    song_df_copy.drop([col_name], axis=1, inplace=True)
    
    return song_df_copy

### Lyrics Preprocessing End

In [None]:
# Used to set seed value for word2vec
def hash(astring):
    """Return the code point of the first character of `astring`.

    Passed to Word2Vec as `hashfxn` in place of Python's own string hashing.
    NOTE(review): this shadows the builtin `hash` for the rest of the
    notebook; the name is kept because the Word2Vec call refers to it.
    """
    leading_char = astring[0]
    return ord(leading_char)

In [None]:
def Word2Vec_vectorize_complete_song(X_train, X_test, y_train, y_test, vector_size = 100, window = 25): 
    """Train and return a gensim Word2Vec model on the training songs.

    Only `X_train` is used for training; `X_test`, `y_train` and `y_test`
    are accepted but not used here. `vector_size` and `window` are passed
    through to Word2Vec; the remaining hyperparameters are fixed below.
    """
    training_options = {
        'min_count': 2,
        'window': window,
        'vector_size': vector_size,
        'sample': 6e-5,
        'alpha': 0.03,
        'min_alpha': 0.0007,
        'workers': 4,
        # module-level `hash` defined in this notebook, not the builtin
        'hashfxn': hash,
    }
    return gensim.models.Word2Vec(X_train, **training_options)

In [None]:
def reduce_dimensions(model):
    """Project the model's word vectors to 2-D with PCA followed by t-SNE.

    Args:
        model: a trained Word2Vec model exposing `wv.vectors` and
            `wv.index_to_key`.

    Returns:
        (x_vals, y_vals, labels): two lists with the 2-D coordinates of every
        word and a numpy array with the corresponding words.
    """
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    n_samples, n_features = vectors.shape

    # PCA pre-reduction to at most 50 dimensions. PCA cannot produce more
    # components than min(n_samples, n_features), so a fixed 50 failed for
    # vector sizes or vocabularies smaller than that.
    n_components = min(50, n_samples, n_features)
    reduc = PCA(n_components=n_components).fit_transform(vectors)

    # t-SNE requires perplexity < n_samples; 15 is used whenever possible
    perplexity = min(15, max(1, n_samples - 1))
    tsne = TSNE(n_components=num_dimensions, random_state=0, perplexity=perplexity)
    vectors = tsne.fit_transform(reduc)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels

def _read_lexicon(path):
    """Return the lines of a one-word-per-line file, closing the file afterwards."""
    with open(path) as handle:
        return handle.read().splitlines()


def _top_word_sizes(word_frequency, lexicon, top_n=20, scale=1500):
    """Return marker sizes for the `top_n` most frequent lexicon words.

    Only words present in both `word_frequency` and `lexicon` are considered.
    Each size is the word's share of the selected words' total frequency,
    multiplied by `scale`. The result keeps descending-frequency order.
    """
    allowed = set(lexicon)
    counts = {word: count for word, count in word_frequency.items() if word in allowed}
    top = dict(sorted(counts.items(), key=lambda item: item[1], reverse=True)[:top_n])
    total = sum(top.values())
    return {word: (count / total) * scale for word, count in top.items()}


def plot_with_matplotlib(sentence_model, res_df):
    """Scatter-plot the most frequent profanity and love/affection words in 2-D.

    Word frequencies are counted over the token lists in
    res_df['Lyrics_clean']. For each of the two word lists (./curse and
    ./love_affection) the 20 most frequent words are selected; those that are
    in the model vocabulary are drawn at their t-SNE position (see
    `reduce_dimensions`) with a marker size proportional to their relative
    frequency, and annotated with the word.

    Args:
        sentence_model: trained Word2Vec model.
        res_df: DataFrame with a 'Lyrics_clean' column of token lists.
    """
    curse_words = _read_lexicon("./curse")
    love_words = _read_lexicon("./love_affection")

    # Flatten the token lists and count the frequency of each word
    word_frequency = Counter(word for tokens in res_df['Lyrics_clean'] for word in tokens)

    vocab_with_freq_curse = _top_word_sizes(word_frequency, curse_words)
    vocab_with_freq_love = _top_word_sizes(word_frequency, love_words)

    print("INFO: vocab_with_freq populated")

    x_vals, y_vals, labels = reduce_dimensions(sentence_model)

    plt.figure(figsize=(8, 8))

    # Position of each vocabulary word in the coordinate lists (first occurrence)
    label_to_index = {}
    for position, label in enumerate(labels):
        label_to_index.setdefault(str(label), position)

    annotations_objs = []
    legend_handles = []
    legend_names = []

    groups = (
        (vocab_with_freq_curse, '#b9cdeb', 'Profanity'),
        (vocab_with_freq_love, '#f7c500', 'Love/Affection'),
    )
    for sizes, colour, name in groups:
        handle = None
        for word, size in sizes.items():
            i = label_to_index.get(word)
            if i is None:
                # Word is not in the model vocabulary
                continue
            handle = plt.scatter(x_vals[i], y_vals[i], s=size, c=colour)
            annotations_objs.append(plt.annotate(labels[i], (x_vals[i], y_vals[i]), color='black', fontsize=10))
        # Only groups that actually drew a point get a legend entry; the
        # original referenced undefined handles when a group was empty.
        if handle is not None:
            legend_handles.append(handle)
            legend_names.append(name)

    # Nudge the labels so they do not overlap
    adjust_text(annotations_objs)

    if legend_handles:
        plt.legend(legend_handles,
                   legend_names,
                   loc='upper right',
                   ncol=2,
                   fontsize=12,
                   markerscale=5,
                   framealpha=0.5)

    plt.axis('tight')
    plt.axis('off')
    plt.title("Profanity And Love Clusting: Word2Vec")
    plt.show()

In [None]:
# Word2Vec hyperparameter-tuning aid: plots the accuracy of RF using word2vec
# against the window size, with one curve per candidate vector size.
def plot_acc_vs_window(vector_size, window_size, acc):
    """Plot accuracy against window size, one line per vector size.

    Args:
        vector_size: Iterable of vector sizes; each one is used as a key
            into ``acc`` and appears in the legend.
        window_size: Sequence of window sizes used as the x-values.
        acc: Mapping from a vector size to the y-values plotted against
            ``window_size``.
    """
    plt.figure()

    plt.xlabel("window size")
    plt.ylabel("accuracy")

    for size in vector_size:
        curve_label = '%s :vector size' % size
        plt.plot(window_size, acc[size], label=curve_label, marker='o')

    plt.legend()
    plt.show()

In [None]:
def Classify_RF(X_train_vect_avg, train_genres_array, train_profanity, train_politeness, X_test_vect_avg, test_genres_array, test_profanity, test_politeness, y_train, y_test):
    """Fit a Random Forest on the stacked features and report test accuracy.

    The feature matrix is the column-wise stack of the averaged word vectors,
    the profanity column, the politeness column and the genre array.

    Args:
        X_train_vect_avg, X_test_vect_avg: Per-sample averaged word vectors.
        train_genres_array, test_genres_array: Per-sample genre features.
        train_profanity, test_profanity: Per-sample profanity values.
        train_politeness, test_politeness: Per-sample politeness values.
        y_train, y_test: Labels; must expose ``.values`` (the caller passes
            pandas Series).

    Returns:
        Tuple ``(rf_model, accuracy)``: the fitted classifier and its top-1
        accuracy on the test set. Top-3 and top-5 accuracy are printed only.
    """
    # Hyperparameter tuning (kept for reference, not executed):
    # a RandomizedSearchCV(n_iter=100, random_state=42, n_jobs=-1) over
    #     n_estimators      in [50, 100, 150, 200]
    #     max_features      in ['auto', 'sqrt']
    #     max_depth         in [5, 10, 15, 20, None]
    #     min_samples_split in [2, 4, 6]
    #     min_samples_leaf  in [1, 2, 4]
    #     bootstrap         in [True, False]
    # fitted on (X_train_vect_avg, y_train.values.ravel()) selected:
    #     {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1,
    #      'max_features': 'auto', 'max_depth': None, 'bootstrap': False}

    rf = RandomForestClassifier(n_estimators=200,
                                min_samples_split=2,
                                min_samples_leaf=1,
                                # 'sqrt' is what the legacy 'auto' alias meant for
                                # classifiers; 'auto' is rejected by recent scikit-learn.
                                max_features='sqrt',
                                max_depth=None,
                                bootstrap=False,
                                n_jobs=-1,
                                # random.seed(1) returns None, which left the forest
                                # unseeded; pass the integer seed directly.
                                random_state=1)
    print("INFO: rf_model created")

    # Uncomment to use the desired set of features
    # X_train = X_train_vect_avg
    # X_train = np.column_stack((X_train_vect_avg, train_profanity, train_politeness))
    X_train = np.column_stack((X_train_vect_avg, train_profanity, train_politeness, train_genres_array))

    rf_model = rf.fit(X_train, y_train.values.ravel())
    print("INFO: rf_model fitted to training set")

    # Uncomment to use the desired set of features
    # X_test = X_test_vect_avg
    # X_test = np.column_stack((X_test_vect_avg, test_profanity, test_politeness))
    X_test = np.column_stack((X_test_vect_avg, test_profanity, test_politeness, test_genres_array))

    y_pred = rf_model.predict(X_test)
    print("INFO: predictiond on test set done")

    # Uncomment to print class-wise accuracy
    # class_wise_accuracy = {}
    # for class_label in set(y_test):
    #     class_mask = y_test == class_label
    #     class_wise_accuracy[class_label] = accuracy_score(y_test[class_mask], y_pred[class_mask])
    # print("Class-wise Accuracy:")
    # print(class_wise_accuracy)

    # Top-3 / top-5 accuracy from the per-class probability scores.
    y_probabilities = rf_model.predict_proba(X_test)
    print("INFO: y_probabilities created")

    # argsort is ascending, so the last k columns hold the k most probable
    # classes; sort once and slice for both k values.
    ranked_indices = np.argsort(y_probabilities, axis=1)
    top3_labels = rf_model.classes_[ranked_indices[:, -3:]]
    top5_labels = rf_model.classes_[ranked_indices[:, -5:]]
    print("INFO: top 3 and 5 labels created")

    # Column vector so the comparison broadcasts against each row of labels.
    y_test_np = y_test.values.reshape(-1, 1)

    # A sample counts as correct when its true label is among the top-k labels.
    top3_accuracy = np.mean(np.any(top3_labels == y_test_np, axis=1))
    top5_accuracy = np.mean(np.any(top5_labels == y_test_np, axis=1))
    print("INFO: top 3 and 5 accuracy calculated")

    accuracy = accuracy_score(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Top 3 Accuracy: {top3_accuracy:.4f}")
    print(f"Top 5 Accuracy: {top5_accuracy:.4f}")

    return rf_model, accuracy

In [None]:
def Classify_SVM(X_train_vect_avg, train_genres_array, train_profanity, train_politeness, X_test_vect_avg, test_genres_array, test_profanity, test_politeness, y_train, y_test):
    """Fit a polynomial-kernel SVC on the stacked features and report accuracy.

    The feature matrix is the column-wise stack of the averaged word vectors,
    the profanity column, the politeness column and the genre array.

    Args:
        X_train_vect_avg, X_test_vect_avg: Per-sample averaged word vectors.
        train_genres_array, test_genres_array: Per-sample genre features.
        train_profanity, test_profanity: Per-sample profanity values.
        train_politeness, test_politeness: Per-sample politeness values.
        y_train, y_test: Labels; must expose ``.values`` (the caller passes
            pandas Series).

    Returns:
        Tuple ``(svm_model, accuracy)``: the fitted classifier and its top-1
        accuracy on the test set. Top-3 and top-5 accuracy are printed only.
    """
    # Hyperparameter tuning (kept for reference, not executed):
    # a GridSearchCV(cv=5) fitted on (X_train_vect_avg, y_train.values.ravel()) over
    #     'C':           [0.1, 1, 10]
    #     'kernel':      ['linear', 'poly', 'rbf', 'sigmoid']
    #     'gamma':       [0.1, 1, 'scale']
    #     'probability': [True]
    # The parameters below are the best-performing ones from that search.

    # random.seed(1) returns None, which left the estimator unseeded; pass the
    # integer seed directly so the fitted model is reproducible.
    svm_model = SVC(kernel='poly', C=1.0, gamma='scale', probability=True, random_state=1)
    print("INFO: svm_model created")

    # Uncomment to use the desired set of features
    # X_train = X_train_vect_avg
    # X_train = np.column_stack((X_train_vect_avg, train_profanity, train_politeness))
    X_train = np.column_stack((X_train_vect_avg, train_profanity, train_politeness, train_genres_array))

    svm_model.fit(X_train, y_train.values.ravel())
    print("INFO: svm_model fitted to training set")

    # Uncomment to use the desired set of features
    # X_test = X_test_vect_avg
    # X_test = np.column_stack((X_test_vect_avg, test_profanity, test_politeness))
    X_test = np.column_stack((X_test_vect_avg, test_profanity, test_politeness, test_genres_array))

    y_pred = svm_model.predict(X_test)
    print("INFO: predictiond on test set done")

    # Uncomment to print class-wise accuracy
    # class_wise_accuracy = {}
    # for class_label in set(y_test):
    #     class_mask = y_test == class_label
    #     class_wise_accuracy[class_label] = accuracy_score(y_test[class_mask], y_pred[class_mask])
    # print("Class-wise Accuracy:")
    # print(class_wise_accuracy)

    # Top-3 / top-5 accuracy from the per-class probability scores.
    y_probabilities = svm_model.predict_proba(X_test)
    print("INFO: y_probabilities created")

    # argsort is ascending, so the last k columns hold the k most probable
    # classes; sort once and slice for both k values.
    ranked_indices = np.argsort(y_probabilities, axis=1)
    top3_labels = svm_model.classes_[ranked_indices[:, -3:]]
    top5_labels = svm_model.classes_[ranked_indices[:, -5:]]
    print("INFO: top 3 and 5 labels created")

    # Column vector so the comparison broadcasts against each row of labels.
    y_test_np = y_test.values.reshape(-1, 1)

    # A sample counts as correct when its true label is among the top-k labels.
    top3_accuracy = np.mean(np.any(top3_labels == y_test_np, axis=1))
    top5_accuracy = np.mean(np.any(top5_labels == y_test_np, axis=1))
    print("INFO: top 3 and 5 accuracy calculated")

    accuracy = accuracy_score(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Top 3 Accuracy: {top3_accuracy:.4f}")
    print(f"Top 5 Accuracy: {top5_accuracy:.4f}")

    return svm_model, accuracy

In [None]:
def Classify_LinearSVM(X_train_vect_avg, train_genres_array, train_profanity, train_politeness, X_test_vect_avg, test_genres_array, test_profanity, test_politeness, y_train, y_test):
    """Fit a sigmoid-calibrated LinearSVC on the stacked features.

    The feature matrix is the column-wise stack of the averaged word vectors,
    the profanity column, the politeness column and the genre array. The
    LinearSVC is wrapped in ``CalibratedClassifierCV`` so that
    ``predict_proba`` is available for the top-k metrics.

    Args:
        X_train_vect_avg, X_test_vect_avg: Per-sample averaged word vectors.
        train_genres_array, test_genres_array: Per-sample genre features.
        train_profanity, test_profanity: Per-sample profanity values.
        train_politeness, test_politeness: Per-sample politeness values.
        y_train, y_test: Labels; must expose ``.values`` (the caller passes
            pandas Series).

    Returns:
        Tuple ``(svm_model, accuracy)``: the fitted calibrated classifier and
        its top-1 accuracy on the test set. Class-wise, top-3 and top-5
        accuracy are printed only.
    """
    # random.seed(1) returns None, which left the estimator unseeded; pass the
    # integer seed directly so the fitted model is reproducible.
    clf = LinearSVC(max_iter=1000, random_state=1)
    # The wrapped estimator is passed positionally: the keyword was renamed
    # from `base_estimator` to `estimator` across scikit-learn releases, while
    # the first positional slot is the estimator in both.
    svm_model = CalibratedClassifierCV(clf, method='sigmoid')  # , cv='prefit')

    print("INFO: LinearSVC_model created")

    # Uncomment to use the desired set of features
    # X_train = X_train_vect_avg
    # X_train = np.column_stack((X_train_vect_avg, train_profanity, train_politeness))
    X_train = np.column_stack((X_train_vect_avg, train_profanity, train_politeness, train_genres_array))

    svm_model.fit(X_train, y_train.values.ravel())
    print("INFO: LinearSVC_model fitted to training set")

    # Uncomment to use the desired set of features
    # X_test = X_test_vect_avg
    # X_test = np.column_stack((X_test_vect_avg, test_profanity, test_politeness))
    X_test = np.column_stack((X_test_vect_avg, test_profanity, test_politeness, test_genres_array))

    y_pred = svm_model.predict(X_test)
    print("INFO: predictiond on test set done")

    # Class-wise accuracy: accuracy restricted to the samples of each true label.
    y_true = y_test.values
    class_wise_accuracy = {}
    for class_label in set(y_true):
        class_mask = y_true == class_label
        class_wise_accuracy[class_label] = accuracy_score(y_true[class_mask], y_pred[class_mask])

    print("Class-wise Accuracy:")
    print(class_wise_accuracy)

    # Top-3 / top-5 accuracy from the per-class probability scores.
    y_probabilities = svm_model.predict_proba(X_test)
    print("INFO: y_probabilities created")

    # argsort is ascending, so the last k columns hold the k most probable
    # classes; sort once and slice for both k values.
    ranked_indices = np.argsort(y_probabilities, axis=1)
    top3_labels = svm_model.classes_[ranked_indices[:, -3:]]
    top5_labels = svm_model.classes_[ranked_indices[:, -5:]]
    print("INFO: top 3 and 5 labels created")

    # Column vector so the comparison broadcasts against each row of labels.
    y_test_np = y_true.reshape(-1, 1)

    # A sample counts as correct when its true label is among the top-k labels.
    top3_accuracy = np.mean(np.any(top3_labels == y_test_np, axis=1))
    top5_accuracy = np.mean(np.any(top5_labels == y_test_np, axis=1))
    print("INFO: top 3 and 5 accuracy calculated")

    accuracy = accuracy_score(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Top 3 Accuracy: {top3_accuracy:.4f}")
    print(f"Top 5 Accuracy: {top5_accuracy:.4f}")

    return svm_model, accuracy

In [None]:
def generate_vect_avg(sentence_model, words, batch_df_col):
    """Look up the word vectors of every in-vocabulary token, per entry.

    Despite the name, no averaging happens here; callers average each
    returned matrix themselves.

    Args:
        sentence_model: Object whose ``wv`` attribute maps a token to its
            vector via ``wv[token]``.
        words: Container of in-vocabulary tokens; tokens not in it are skipped.
        batch_df_col: Iterable of token iterables, one per entry.
            NOTE(review): presumably tokenized lyrics -- confirm with caller.

    Returns:
        A 1-D ``object`` ndarray with one element per entry. Each element is
        an array holding that entry's word vectors (one row per kept token),
        or an empty array when no token was in ``words``.
    """
    per_entry = [
        np.array([sentence_model.wv[token] for token in tokens if token in words])
        for tokens in batch_df_col
    ]

    # Entries keep different numbers of tokens, so the matrices are ragged.
    # np.array() on a ragged list raises on recent NumPy (and silently builds
    # a 3-D array when the lengths happen to match), so fill an object array
    # element by element to get the same 1-D layout in every case.
    vect = np.empty(len(per_entry), dtype=object)
    for position, matrix in enumerate(per_entry):
        vect[position] = matrix

    return vect

In [None]:
def preprocess(raw_df):
    """Run the full cleaning pipeline on ``raw_df`` and pickle the result.

    Genre pre-processing runs first; the remaining steps are then applied in
    order, each receiving the current frame and the name of the column it
    works on. The final frame is written to
    ``./Datasets/finalproject/artist_genre_lyrics_preprocessed_df.pkl``.

    Args:
        raw_df: Input frame handed to ``invoke_genre_pre_processing``.

    Returns:
        The frame returned by the last pipeline step.
    """
    cleaned_df = invoke_genre_pre_processing(raw_df)

    # (step, column) pairs, applied strictly in this order.
    # NOTE(review): each step presumably creates the column the next one reads -- confirm.
    column_steps = (
        (invoke_remove_line_with_substring, 'Lyrics'),
        (remove_non_eng_and_songs_without_lyrics, 'Lyrics_without_junk'),
        (invoke_profanity_politeness_score, 'Lyrics_without_junk'),
        (remove_round_brackets, 'Lyrics_without_junk'),
        (remove_square_brackets, 'Lyrics_without_rb'),
        (fetch_nlc_removed_lyrics, 'Lyrics_without_sb'),
        (to_lower_and_remove_punctuations, 'Lyrics_without_nlc'),
    )
    for step, column in column_steps:
        cleaned_df = step(cleaned_df, column)

    cleaned_df.to_pickle("./Datasets/finalproject/artist_genre_lyrics_preprocessed_df.pkl")

    return cleaned_df

In [None]:
def _average_lyrics_vectors(lyrics_col, lyrics_model, words, vector_size, batch_size=1000):
    """Return one averaged word vector per row of ``lyrics_col``, built in batches."""
    averaged = []

    num_records = len(lyrics_col)
    num_batches = (num_records // batch_size) + 1

    for batch_num in range(num_batches):
        print("Working on batch: ", batch_num)
        start_idx = batch_num * batch_size
        end_idx = min((batch_num + 1) * batch_size, num_records)

        batch_df_col = lyrics_col.iloc[start_idx:end_idx]
        for v in generate_vect_avg(lyrics_model, words, batch_df_col):
            if v.size:
                averaged.append(v.mean(axis=0))
            else:
                # No in-vocabulary word: fall back to a zero vector of the
                # embedding size so every row has the same length.
                averaged.append(np.zeros(vector_size, dtype=float))

    return averaged


def orchestrate(df):
    """Train Word2Vec on the lyrics, build features and fit the classifier.

    Steps: stratified 80/20 train/test split on ``Artist_Name``, Word2Vec
    training via ``Word2Vec_vectorize_complete_song``, per-song averaging of
    word vectors, then classification with ``Classify_LinearSVM``.

    Args:
        df: Frame with the columns read here: ``Lyrics_clean``,
            ``Artist_Name``, ``Genres_array``, ``profanity`` and ``politeness``.

    Returns:
        Tuple ``(model, lyrics_model, words)``: the fitted classifier, the
        Word2Vec model and the set of its vocabulary keys.
    """
    random.seed(1)
    print("INFO: started main")

    # Uncomment to perform pre-processing
    # res_df = preprocess(artist_genre_lyrics_df) # data_df # artist_genre_lyrics_df # artist_data_midsize_df
    # print("INFO: pre processing done")
    res_df = df

    # Uncomment to plot Artist vs Songs distribution
    # n_songs_artists(res_df)

    vector_size = 100 #[100, 200, 300]
    window_size = 25 #[5, 10, 15, 20, 25]

    # Uncomment to work on a stratified 50% subsample; then split `data_50`
    # (stratifying on data_50['Artist_Name']) below instead of `res_df`.
    # data_50, _ = train_test_split(res_df,
    #                               test_size=0.50,
    #                               stratify=res_df['Artist_Name'],
    #                               random_state=42)

    train, test = train_test_split(res_df,
                                   test_size=0.2,
                                   stratify=res_df['Artist_Name'],
                                   random_state=42)
    print("INFO: train-test split created")

    lyrics_model = Word2Vec_vectorize_complete_song(train['Lyrics_clean'],
                                                    test['Lyrics_clean'],
                                                    train['Artist_Name'],
                                                    test['Artist_Name'],
                                                    vector_size,
                                                    window_size)
    print("INFO: Word2Vec model created")

    # Uncomment to plot t-SNE projection of Word2Vec vectors
    # plot_with_matplotlib(lyrics_model, res_df)

    w2v_weights = lyrics_model.wv.vectors
    vocab_size, embedding_size = w2v_weights.shape

    print("Vocabulary Size: {} - Embedding Dim: {} - Window Size: {}".format(vocab_size, embedding_size, window_size))

    words = set(lyrics_model.wv.index_to_key)

    # Compute lyrics vectors by averaging the word vectors for the words
    # contained in the lyrics. The zero-vector fallback previously referenced
    # an undefined name (`vec_s`); it now uses `vector_size`.
    print("INFO: Working on train set")
    X_train_vect_avg = _average_lyrics_vectors(train['Lyrics_clean'], lyrics_model, words, vector_size)

    print("INFO: Working on test set")
    X_test_vect_avg = _average_lyrics_vectors(test['Lyrics_clean'], lyrics_model, words, vector_size)

    # All classifier functions share the same positional signature.
    classifier_args = (X_train_vect_avg,
                       train['Genres_array'].tolist(),
                       train['profanity'],
                       train['politeness'],
                       X_test_vect_avg,
                       test['Genres_array'].tolist(),
                       test['profanity'],
                       test['politeness'],
                       train['Artist_Name'],
                       test['Artist_Name'])

    # Uncomment the classifier which you want to use. Currently using Linear SVM
    # print("####################################")
    # print("INFO: Calling RF classifier function")
    # print("####################################")
    # model, accuracy = Classify_RF(*classifier_args)

    # print("####################################")
    # print("INFO: Calling SVM classifier function")
    # print("####################################")
    # model, accuracy = Classify_SVM(*classifier_args)

    print("####################################")
    print("INFO: Calling Linear SVM classifier function")
    print("####################################")
    model, accuracy = Classify_LinearSVM(*classifier_args)

    print("INFO: control back to main")
    return model, lyrics_model, words

In [None]:
# Creating vector for prompts
def predict_artist(sentence_words, words, sentence_model, vec_size = 300):
    """Turn a prompt into a single averaged word vector.

    The prompt is split on whitespace, tokens not in ``words`` are dropped,
    and the vectors of the remaining tokens are averaged.

    Args:
        sentence_words: Prompt text.
        words: Container of in-vocabulary tokens.
        sentence_model: Object whose ``wv`` attribute maps a token to its
            vector via ``wv[token]``.
        vec_size: Length of the zero vector returned when no token is in
            ``words``. Only used when ``sentence_model.wv`` does not expose
            ``vector_size``; otherwise the model's own size wins, so the
            fallback always matches the embedding dimension.

    Returns:
        A one-element list holding the averaged vector (or the zero vector).
    """
    tokens = sentence_words.split()
    known_vectors = np.array([sentence_model.wv[token] for token in tokens if token in words])

    if known_vectors.size:
        return [known_vectors.mean(axis=0)]

    # A hard-coded 300 does not match models trained with another vector
    # size, so prefer the dimension reported by the model itself.
    zero_dim = getattr(sentence_model.wv, "vector_size", vec_size)
    return [np.zeros(zero_dim, dtype=float)]

In [None]:
# Function to test out prompts
def test_prompts(df=None):
    """Train the pipeline and print the prediction for a sample prompt.

    Args:
        df: Frame passed to ``orchestrate``. When omitted, the pickled frame
            at ``./artist_genre_lyrics_preprocessed_filtered_df.pkl`` is
            loaded, so the function can still be called without arguments.
    """
    # Elton john: "i not the man they think i am at home"
    # Billie Eilish: "white shirt now red my bloody nose"
    sample_string = "white shirt now red my bloody nose sleepin you on your tippy toes creepin around like no one knows think you so criminal bruises on both my knees for you don't say thank you or please i do what i want when i wanting to my soul so cynical"

    # orchestrate() requires the dataset; calling it with no argument raised
    # a TypeError.
    if df is None:
        df = pd.read_pickle("./artist_genre_lyrics_preprocessed_filtered_df.pkl")
    model, sentence_model, words = orchestrate(df)

    # Use the trained embedding size for the zero-vector fallback instead of
    # predict_artist's default.
    X_pred_vect_avg = predict_artist(sample_string,
                                     words,
                                     sentence_model,
                                     vec_size=sentence_model.wv.vector_size)

    # NOTE(review): the Classify_* functions fit on word-vector averages
    # stacked with profanity, politeness and genre columns, while this prompt
    # vector holds only the word-vector part -- the feature counts likely
    # differ; confirm which feature set the returned model expects.
    output = model.predict(X_pred_vect_avg)
    output_proba = model.predict_proba(X_pred_vect_avg)
    print(model.classes_)
    print(output_proba)
    print(output)

test_prompts()

In [None]:
# Run code: load the pickled DataFrame, run the pipeline on it and print the
# elapsed wall-clock time in seconds.
if __name__ == "__main__":
    artist_genre_lyrics_df = pd.read_pickle("./artist_genre_lyrics_preprocessed_filtered_df.pkl")
    start = time.time()
    # NOTE(review): no `main` is defined in this part of the file; `orchestrate`
    # takes the same single argument and returns a matching 3-tuple -- confirm
    # which function is intended here.
    rf_model, lyrics_model, words = main(artist_genre_lyrics_df)
    tot_time = time.time() - start
    print(tot_time)