### Step5 Music title classification to predict music genre

##### Semantic Similarity Classifier

In [1]:
from gensim.models import KeyedVectors

In [2]:
# location of the downloaded binary
filepath='/Users/johannebear/Library/Mobile Documents/com~apple~CloudDocs/APPLIED ANALYTICS/APAN 5430/Assignments/assignment5/GoogleNews-vectors-negative300.bin.gz'

In [3]:
model_word2vec = KeyedVectors.load_word2vec_format(filepath, binary=True)

In [4]:
topic_taxonomy2 = {
    "Music Genres": {
        "Folk": "Represented by albums such as 'Teaser And The Firecat (Remastered)', 'Essential Shawn Mullins', and others.",
        "Rock": "Iconic albums including 'Rock Of Ages: The Definitive Collection [2 CD]' and 'Unplugged' define this category.",
        "Electronic": "Characterized by albums like 'CD+Graphics Karaoke - Ultimate Party Hits' and 'Sam's Town'.",
        "Reggae": "Featuring albums like 'Living Like A Refugee' and 'Facing Future'.",
        "Funk/Soul": "With albums like 'Sweet & Sour Tears - 50th Anniversary Collectors Series', the essence of soul is captured.",
        "Jazz": "Jazz vibes from 'Sundown: A Windam Hill Piano Collection' and others.",
        "Pop": "Hits like 'Going Somewhere' and 'A Thousand Different Ways' are pop essentials.",
        "Hip Hop": "Defined by beats from albums like 'Funkmaster Flex Presents The Mix Tape Volume 1: 60 Minutes Of Funk'.",
        "R&B": "Unique blend showcased in 'Leonard Cohen: I'm Your Man' and others."
    },
    "Movie & Show Soundtracks": {
        "Dramatic Scores": "Noteworthy soundtracks like 'The Phantom of the Opera (2004 Movie Soundtrack)' and 'Shawshank Redemption: Hopes and Dreams'.",
        "Stage & Screen": "Musical hits from 'My Fair Lady (20th Anniversary Broadway Cast)' to 'Hamilton: An American Musical'.",
        "Horrifying Sounds": "Creepy soundtracks from movies like 'A Nightmare On Elm Street I & II'.",
        "Action Packed": "Energetic soundtracks such as 'The Matrix: Music From The Motion Picture'.",
        "Family Bonds": "Emotional scores from 'The Education Of Little Tree: Music From The Motion Picture' and more.",
        "Romantic Melodies": "Love tunes from 'The Notebook: Timeless Love' and 'Pride and Prejudice: First Glances'."
    },
    "Holiday & Thematic": {
        "Spooky Vibes": "Halloween hits like 'Halloween Howls' and 'Spooky Stories: Midnight Chills'.",
        "Christmas Cheers": "Festive tunes from 'Jingle Bells Rock Compilation' and 'Winter Wonderland: Snowy Melodies'.",
        "Summer Grooves": "Vibes from 'Sunny Vibes Playlist' and 'Beach Days: Waves & Tunes'.",
        "Love Tunes": "Romantic hits for Valentine's Day like 'Love Ballads: Hearts in Harmony'.",
        "Spring Renewal": "Easter tunes from 'Spring Awakening: Renewed Hopes' and others."
    }
}


In [5]:
class TaxonomyClassifier:
    def __init__(self, model, taxonomy, topN=3):
        '''
        Constructor for TaxonomyClassifier
        model: word2vec model
        taxonomy: 2-level taxonomy defined as dictinoary-in-dictionary
        '''
        self.model = model
        self.taxonomy = topic_taxonomy2 
        self.topN = topN
        
    def classify(self, text, topN=None):
        '''
        Classfify text to subtopics based on the similarity between the input text and the subtopic keywords 
        text: the input text
        topN: number of subtopics to be shown
        '''
        
        topN = topN if topN else self.topN
        text = str(text).lower()
        scores = dict()
        for topic, taxonomy in self.taxonomy.items():

            subtopic_scores = dict()

            for subtopic, keywords in taxonomy.items():
                subtopic_text = (subtopic + ' ' + keywords).strip().lower()
                subtopic_scores[subtopic] = self.model.n_similarity(text.split(), subtopic_text.split())

            topSubtopic, subtopiScore = sorted(subtopic_scores.items(), key=lambda x: x[1], reverse=True)[0]
            scores[topSubtopic] = subtopiScore

        return sorted(scores.items(), key=lambda x: x[1])[::-1][:topN]

In [6]:
tc2 = TaxonomyClassifier(model=model_word2vec, taxonomy=topic_taxonomy2)

In [7]:
test_titles = [
    "Funkmaster Flex Presents The Mix Tape Volume 1: 60 Minutes Of Funk",
    "Hed Kandi: Disco Kandi (63) / Various",
    "A Nightmare On Elm Street I & II",
    "Meet Joe Black: Original Motion Picture Soundtrac."
]

In [8]:
test_genres = ['Funk',  'Electronic', 'Stage & Scene',  'Jazz']

In [9]:
predicted_genres = []

for title in test_titles:
    prediction = tc2.classify(title, topN=1)
    predicted_genres.append(prediction[0][0])
    
print(predicted_genres)

['Hip Hop', 'Folk', 'Horrifying Sounds', 'Action Packed']


#### Accuracy rate is still 0

In [10]:
correct_predictions = sum(1 for predicted, truth in zip(predicted_genres, test_genres) if predicted == truth)
accuracy = correct_predictions / len(test_genres) * 100

print(test_genres)
print(predicted_genres)
print(f"Accuracy of this model  is: {accuracy:.2f}%")

['Funk', 'Electronic', 'Stage & Scene', 'Jazz']
['Hip Hop', 'Folk', 'Horrifying Sounds', 'Action Packed']
Accuracy of this model  is: 0.00%


##### Naive Bayes Classifier

In [11]:
import pandas as pd

In [12]:
model_df_data = [(subtopic, description) for topic, subtopics in topic_taxonomy2.items() for subtopic, description in subtopics.items()]
df = pd.DataFrame(model_df_data, columns=['genre', 'title'])


In [13]:
df

Unnamed: 0,genre,title
0,Folk,Represented by albums such as 'Teaser And The ...
1,Rock,Iconic albums including 'Rock Of Ages: The Def...
2,Electronic,Characterized by albums like 'CD+Graphics Kara...
3,Reggae,Featuring albums like 'Living Like A Refugee' ...
4,Funk/Soul,With albums like 'Sweet & Sour Tears - 50th An...
5,Jazz,Jazz vibes from 'Sundown: A Windam Hill Piano ...
6,Pop,Hits like 'Going Somewhere' and 'A Thousand Di...
7,Hip Hop,Defined by beats from albums like 'Funkmaster ...
8,R&B,Unique blend showcased in 'Leonard Cohen: I'm ...
9,Dramatic Scores,Noteworthy soundtracks like 'The Phantom of th...


In [14]:
df['genre']

0                  Folk
1                  Rock
2            Electronic
3                Reggae
4             Funk/Soul
5                  Jazz
6                   Pop
7               Hip Hop
8                   R&B
9       Dramatic Scores
10       Stage & Screen
11    Horrifying Sounds
12        Action Packed
13         Family Bonds
14    Romantic Melodies
15         Spooky Vibes
16     Christmas Cheers
17       Summer Grooves
18           Love Tunes
19       Spring Renewal
Name: genre, dtype: object

In [15]:
# Covert text to numerical features
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1,4), min_df=2)
X = vectorizer.fit_transform(df['title'])

In [16]:
from sklearn.model_selection import train_test_split
import numpy as np

In [17]:
# Just to keep a part of dataset as test sample in order to test the accuracy of the model 
test_size = 0.3

In [18]:
labels = df['genre'] # Column genres as labels
indexes = np.arange(X.shape[0])    # Create indexes

In [19]:
X_train, X_test, Y_train, Y_test, train_idx, test_idx = \
train_test_split(X, labels, indexes, test_size=test_size, random_state=244)

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [21]:
nb_classifier = MultinomialNB()

In [22]:
nb_classifier.fit(X_train, Y_train)

In [23]:
Y_pred = nb_classifier.predict(X_test)

In [24]:
print(f'Accuracy {accuracy_score(Y_test, Y_pred):0.3}')

Accuracy 0.0


In [25]:
# Predict with the test sample
for k in test_idx:
    title = df.iloc[k][0]
    genre = df.iloc[k][1]
    pred = nb_classifier.predict(X[k])[0]
    
    print(title)
    print(f'predicted genre: {pred}, actual genre : {genre}')
    print()

Funk/Soul
predicted genre: Hip Hop, actual genre : With albums like 'Sweet & Sour Tears - 50th Anniversary Collectors Series', the essence of soul is captured.

Folk
predicted genre: R&B, actual genre : Represented by albums such as 'Teaser And The Firecat (Remastered)', 'Essential Shawn Mullins', and others.

Spooky Vibes
predicted genre: Reggae, actual genre : Halloween hits like 'Halloween Howls' and 'Spooky Stories: Midnight Chills'.

Rock
predicted genre: Dramatic Scores, actual genre : Iconic albums including 'Rock Of Ages: The Definitive Collection [2 CD]' and 'Unplugged' define this category.

Pop
predicted genre: Reggae, actual genre : Hits like 'Going Somewhere' and 'A Thousand Different Ways' are pop essentials.

Electronic
predicted genre: Reggae, actual genre : Characterized by albums like 'CD+Graphics Karaoke - Ultimate Party Hits' and 'Sam's Town'.



##### Test again on same dataset with same genres

In [26]:
test_title = [
    "Blowin' in the Wind", "A Case of You", "If I Had a Hammer", 
    "Stairway to Heaven", "Bohemian Rhapsody", "Hotel California", 
    "Strobe", "Windowlicker", "Midnight City", 
    "No Woman, No Cry", "Red Red Wine", "Beautiful Girls", 
    "Superstition", "Respect", "September", 
    "Take Five", "So What", "My Favorite Things", 
    "Shape of You", "Billie Jean", "Bad Guy", 
    "Lose Yourself", "Sicko Mode", "Mo Money Mo Problems", 
    "End of the Road", "Love on the Brain", "Adorn", 
    "Imperial March", "Time", "The Godfather Theme", 
    "Over the Rainbow", "Memory", "Circle of Life", 
    "Tubular Bells", "The Jaws Theme", "Halloween Theme", 
    "James Bond Theme", "He's a Pirate", "Mission Impossible Theme", 
    "We Are Family", "Father and Daughter", "Teach Your Children", 
    "Can't Help Falling in Love", "My Heart Will Go On", "Perfect", 
    "Thriller", "Monster Mash", "Somebody's Watching Me", 
    "Last Christmas", "All I Want for Christmas is You", "Jingle Bell Rock", 
    "Hot Fun in the Summertime", "California Gurls", "Summertime", 
    "Endless Love", "Always and Forever", "I Will Always Love You", 
    "Here Comes the Sun", "Springsteen", "April Come She Will"
]

test_genre = [
    "Folk", "Folk", "Folk", 
    "Rock", "Rock", "Rock", 
    "Electronic", "Electronic", "Electronic", 
    "Reggae", "Reggae", "Reggae", 
    "Funk/Soul", "Funk/Soul", "Funk/Soul", 
    "Jazz", "Jazz", "Jazz", 
    "Pop", "Pop", "Pop", 
    "Hip Hop", "Hip Hop", "Hip Hop", 
    "R&B", "R&B", "R&B", 
    "Dramatic Scores", "Dramatic Scores", "Dramatic Scores", 
    "Stage & Screen", "Stage & Screen", "Stage & Screen", 
    "Horrifying Sounds", "Horrifying Sounds", "Horrifying Sounds", 
    "Action Packed", "Action Packed", "Action Packed", 
    "Family Bonds", "Family Bonds", "Family Bonds", 
    "Romantic Melodies", "Romantic Melodies", "Romantic Melodies", 
    "Spooky Vibes", "Spooky Vibes", "Spooky Vibes", 
    "Christmas Cheers", "Christmas Cheers", "Christmas Cheers", 
    "Summer Grooves", "Summer Grooves", "Summer Grooves", 
    "Love Tunes", "Love Tunes", "Love Tunes", 
    "Spring Renewal", "Spring Renewal", "Spring Renewal"
]


In [27]:
len(test_title)

60

In [28]:
len(test_genre)

60

In [29]:
test= pd.DataFrame({
    'test_title':test_title,
    'test_genre':test_genre
})
test

Unnamed: 0,test_title,test_genre
0,Blowin' in the Wind,Folk
1,A Case of You,Folk
2,If I Had a Hammer,Folk
3,Stairway to Heaven,Rock
4,Bohemian Rhapsody,Rock
5,Hotel California,Rock
6,Strobe,Electronic
7,Windowlicker,Electronic
8,Midnight City,Electronic
9,"No Woman, No Cry",Reggae


In [30]:
title_vectorized = vectorizer.transform(test['test_title'])

In [31]:
predictions = nb_classifier.predict(title_vectorized)

In [32]:
test['NB_Predicted_Genre'] = predictions

In [33]:
test

Unnamed: 0,test_title,test_genre,NB_Predicted_Genre
0,Blowin' in the Wind,Folk,Dramatic Scores
1,A Case of You,Folk,Hip Hop
2,If I Had a Hammer,Folk,Action Packed
3,Stairway to Heaven,Rock,Action Packed
4,Bohemian Rhapsody,Rock,Action Packed
5,Hotel California,Rock,Action Packed
6,Strobe,Electronic,Action Packed
7,Windowlicker,Electronic,Action Packed
8,Midnight City,Electronic,Action Packed
9,"No Woman, No Cry",Reggae,Action Packed


In [34]:
test['SS_Predicted_Genre'] = test['test_title'].apply(lambda x: tc2.classify(x, topN=1))

In [35]:
test['SS_Predicted_Genre']= test['SS_Predicted_Genre'].apply(lambda x: x[0][0])

In [36]:
test

Unnamed: 0,test_title,test_genre,NB_Predicted_Genre,SS_Predicted_Genre
0,Blowin' in the Wind,Folk,Dramatic Scores,Family Bonds
1,A Case of You,Folk,Hip Hop,Funk/Soul
2,If I Had a Hammer,Folk,Action Packed,Folk
3,Stairway to Heaven,Rock,Action Packed,Horrifying Sounds
4,Bohemian Rhapsody,Rock,Action Packed,Romantic Melodies
5,Hotel California,Rock,Action Packed,Stage & Screen
6,Strobe,Electronic,Action Packed,Horrifying Sounds
7,Windowlicker,Electronic,Action Packed,Spooky Vibes
8,Midnight City,Electronic,Action Packed,Spooky Vibes
9,"No Woman, No Cry",Reggae,Action Packed,Love Tunes


In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
genres = test['test_genre']
nb_predicted_genres = test['NB_Predicted_Genre']
ss_predicted_genres = test['SS_Predicted_Genre']


In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [40]:
# Calculate classification metrics for Naive Bayes predictions
nb_accuracy = accuracy_score(genres, nb_predicted_genres)
nb_precision = precision_score(genres, nb_predicted_genres, average='weighted')
nb_recall = recall_score(genres, nb_predicted_genres, average='weighted')
nb_f1 = f1_score(genres, nb_predicted_genres, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
# Calculate classification metrics for Semantic Similarity predictions
ss_accuracy = accuracy_score(genres, ss_predicted_genres)
ss_precision = precision_score(genres, ss_predicted_genres, average='weighted')
ss_recall = recall_score(genres, ss_predicted_genres, average='weighted')
ss_f1 = f1_score(genres, ss_predicted_genres, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
print("Naive Bayes Classifier Metrics:")
print("Accuracy:", nb_accuracy)
print("Precision:", nb_precision)
print("Recall:", nb_recall)
print("F1-Score:", nb_f1)

print("\nSemantic Similarity Classifier Metrics:")
print("Accuracy:", ss_accuracy)
print("Precision:", ss_precision)
print("Recall:", ss_recall)
print("F1-Score:", ss_f1)


Naive Bayes Classifier Metrics:
Accuracy: 0.08333333333333333
Precision: 0.05904392764857881
Recall: 0.08333333333333333
F1-Score: 0.03985507246376811

Semantic Similarity Classifier Metrics:
Accuracy: 0.18333333333333332
Precision: 0.14464285714285716
Recall: 0.18333333333333332
F1-Score: 0.13912698412698413


We think the reason that Semantic Simalarity Classifier had 0 accuracy is because how we defined our train data labels, and we still think genre and title have low correlation, with all these reasons combined, the accuracy of both models on genre prediction was very low.

### Update the dataframe with genres (we chose semantic similarity model)

In [43]:
import pandas as pd

In [45]:
df=pd.read_csv('data_w_review_topic.csv')

In [46]:
df

Unnamed: 0,product_title,review_body,summary,review_topic
0,"""2""","If you're a fighter pilot, are with a fighter ...","If you're a fighter pilot, are with a fighter ...",Clear Audio
1,"""DLG (Dark, Latin Groove) - Greatest Hits""",I liked this cd. Its a good compilation of son...,"Its a good compilation of songs, however I wis...",Clear Audio
2,"""If I Go Away"" / ""Man Like Me""",I love his voice!!!!! My boyfriend knows that ...,"Trust me when you hear JP sing his OWN music, ...",Clear Audio
3,"""John Skinner's Second Complete Ballroom Dance""",Good music for ballroom dancing -- both for th...,Good music for ballroom dancing -- both for th...,Energetic Music
4,"""Metal Box 3x 12"""" 45 Rpm Lp in Metal Box""",Indie label 4 men with Beards gets permission ...,many will want to get the turn table out of mo...,Clear Audio
...,...,...,...,...
17100,this is the fire,"its like Tom Petty, remixed by David Lynch. <b...","<br /> <br />1) \\""la news\\"" is a great start...",Balanced Sound
17101,tin cans & car tires,"Very good studio moe. Plane crash, Happy Hour...","Plane crash, Happy Hour Hero, Nebraska are som...",Clear Audio
17102,travelling,"Sorry for the mixed metaphor of a title, but M...",Spear is one of reggae's all-time most importa...,Well-Balanced Mixing
17103,¿La Calle Es Tuya?,Compare to the others Estopa CD's this is not ...,Compare to the others Estopa CD's this is not ...,Clear Audio


In [47]:
df['genre'] = df['product_title'].apply(lambda x: tc2.classify(x, topN=1))

In [48]:
df['genre'] = df['genre'].apply(lambda x: x[0][0])

In [49]:
df

Unnamed: 0,product_title,review_body,summary,review_topic,genre
0,"""2""","If you're a fighter pilot, are with a fighter ...","If you're a fighter pilot, are with a fighter ...",Clear Audio,Spooky Vibes
1,"""DLG (Dark, Latin Groove) - Greatest Hits""",I liked this cd. Its a good compilation of son...,"Its a good compilation of songs, however I wis...",Clear Audio,Folk
2,"""If I Go Away"" / ""Man Like Me""",I love his voice!!!!! My boyfriend knows that ...,"Trust me when you hear JP sing his OWN music, ...",Clear Audio,Horrifying Sounds
3,"""John Skinner's Second Complete Ballroom Dance""",Good music for ballroom dancing -- both for th...,Good music for ballroom dancing -- both for th...,Energetic Music,Stage & Screen
4,"""Metal Box 3x 12"""" 45 Rpm Lp in Metal Box""",Indie label 4 men with Beards gets permission ...,many will want to get the turn table out of mo...,Clear Audio,Hip Hop
...,...,...,...,...,...
17100,this is the fire,"its like Tom Petty, remixed by David Lynch. <b...","<br /> <br />1) \\""la news\\"" is a great start...",Balanced Sound,Funk/Soul
17101,tin cans & car tires,"Very good studio moe. Plane crash, Happy Hour...","Plane crash, Happy Hour Hero, Nebraska are som...",Clear Audio,Funk/Soul
17102,travelling,"Sorry for the mixed metaphor of a title, but M...",Spear is one of reggae's all-time most importa...,Well-Balanced Mixing,Spooky Vibes
17103,¿La Calle Es Tuya?,Compare to the others Estopa CD's this is not ...,Compare to the others Estopa CD's this is not ...,Clear Audio,Folk


In [50]:
df['genre'].unique()

array(['Spooky Vibes', 'Folk', 'Horrifying Sounds', 'Stage & Screen',
       'Hip Hop', 'Rock', 'Funk/Soul', 'Pop', 'Romantic Melodies',
       'Love Tunes', 'Family Bonds', 'Spring Renewal', 'Dramatic Scores',
       'Christmas Cheers', 'Reggae', 'Jazz', 'Electronic',
       'Action Packed', 'R&B', 'Summer Grooves'], dtype=object)

In [51]:
df.to_csv('data.csv',index=False)