In [42]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline
# import ipywidgets
import csv
import json
import spacy
nlp = spacy.load('en_core_web_sm')
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import nltk 
from gensim.corpora import Dictionary
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim import models
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

---
### Books Dataset

In [43]:
data =[]
with open('booksummaries.txt','r', encoding='utf-8') as f:
    reader = csv.reader(f, dialect='excel-tab')
    for row in reader:
        data.append(row)

books = pd.DataFrame.from_records(data, columns=['book_id', 'freebase_id', 'book_title', 'author', 'publication_date', 'genre', 'summary'])

In [44]:
def parse_genre_entry(genre_info):
    if genre_info is None or genre_info == '':
        return []
    genre_dict = json.loads(genre_info)
    genres = list(genre_dict.values())
    return genres

books['genre'] = books['genre'].apply(parse_genre_entry)

In [45]:
books = books.drop(columns=['freebase_id'])

In [46]:
books.head(3)

Unnamed: 0,book_id,book_title,author,publication_date,genre,summary
0,620,Animal Farm,George Orwell,1945-08-17,"[Roman à clef, Satire, Children's literature, ...","Old Major, the old boar on the Manor Farm, ca..."
1,843,A Clockwork Orange,Anthony Burgess,1962,"[Science Fiction, Novella, Speculative fiction...","Alex, a teenager living in near-future Englan..."
2,986,The Plague,Albert Camus,1947,"[Existentialism, Fiction, Absurdist fiction, N...",The text of The Plague is divided into five p...


In [47]:
def clean_ids(x):
    try:
        return int(x)
    except:
        return np.nan

books[''] = books['book_id'].apply(clean_ids)
books = books[books['book_id'].notnull()]

In [48]:
def contains_romance(genres):
    if genres is None:
        return False
    return any('romance' in genre.lower() for genre in genres)

rom_books = books[books['genre'].apply(contains_romance)]

In [49]:
books.shape

(488, 7)

In [50]:
def book_soup(x):
    genres = ""
    for genre in x['genre']:
        genres += genre + " "
    return f"{x['book_title']} {genres} {x['summary']}"

In [51]:
books["DOCUMENT"] = books.apply(book_soup, axis=1)

In [52]:
def preprocess_text(text):
    # lowercasing
    lowercased_text = text.lower()

    # cleaning 
    remove_punctuation = re.sub(r'[^\w\s]', '', lowercased_text)
    remove_white_space = remove_punctuation.strip()

    # Tokenization
    tokenized_text = word_tokenize(remove_white_space)

    # Stop Words
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    stopwords_removed = [word for word in tokenized_text if word not in stopwords]

    # Stemming
    ps = PorterStemmer()
    stemmed_text = [ps.stem(word) for word in stopwords_removed]

    return {
        'DOCUMENT': text,
        'LOWERCASE': lowercased_text,
        'CLEANING': remove_white_space,
        'TOKENIZATION': tokenized_text,
        'STOP-WORDS': stopwords_removed,
        'STEMMING': stemmed_text
    }
 
    # return df

def preprocessing(corpus, corpus_type):
    preprocessed_data = []

    # Running preprocessing one by one
    for doc in corpus['DOCUMENT']:
        # Call the preprocess_text function
        preprocessed_data.append(preprocess_text(doc))

    # Create DataFrame based on corpus_type
    if corpus_type == 'books':
        df = pd.DataFrame({
            'book_id': corpus['book_id'],
            'book_title': corpus['book_title'],
            'author': corpus['author'],
            'publication_date': corpus['publication_date'],
            'genre': corpus['genre'],
            'summary': corpus['summary'],
            'DOCUMENT': [d['DOCUMENT'] for d in preprocessed_data],
            'LOWERCASE': [d['LOWERCASE'] for d in preprocessed_data],
            'CLEANING': [d['CLEANING'] for d in preprocessed_data],
            'TOKENIZATION': [d['TOKENIZATION'] for d in preprocessed_data],
            'STOP-WORDS': [d['STOP-WORDS'] for d in preprocessed_data],
            'STEMMING': [d['STEMMING'] for d in preprocessed_data]
        })
    else:
        df = pd.DataFrame({
            'track_title': corpus['track_title'],
            'album_name': corpus['album_name'],
            'track_n': corpus['track_n'],
            'lyric': corpus['lyric'],
            'DOCUMENT': [d['DOCUMENT'] for d in preprocessed_data],
            'LOWERCASE': [d['LOWERCASE'] for d in preprocessed_data],
            'CLEANING': [d['CLEANING'] for d in preprocessed_data],
            'TOKENIZATION': [d['TOKENIZATION'] for d in preprocessed_data],
            'STOP-WORDS': [d['STOP-WORDS'] for d in preprocessed_data],
            'STEMMING': [d['STEMMING'] for d in preprocessed_data]
        })

    return df


In [53]:
books_result_preprocessing = preprocessing(books, 'books')

In [54]:
def soup_after_lemma(x):
    soups = ""
    for text in x['STEMMING']:
        soups += text + " "
    return soups

In [55]:
books_result_preprocessing["STEMMING"] = books_result_preprocessing.apply(soup_after_lemma, axis=1)

---
### Taylor Swift Songs Dataset

In [56]:
ts_1 = pd.read_csv("01-taylor_swift.csv")
ts_2 = pd.read_csv("02-fearless_taylors_version.csv")
ts_3 = pd.read_csv("03-speak_now_deluxe_package.csv")
ts_4 = pd.read_csv("04-red_deluxe_edition.csv")
ts_5 = pd.read_csv("05-1989_deluxe.csv")
ts_6 = pd.read_csv("06-reputation.csv")
ts_7 = pd.read_csv("07-lover.csv")
ts_8 = pd.read_csv("08-folklore_deluxe_version.csv")
ts_9 = pd.read_csv("09-evermore_deluxe_version.csv")

lyrics = pd.concat([ts_1, ts_2, ts_3, ts_4, ts_5, ts_6, ts_7, ts_8, ts_9])
lyrics.reset_index(drop=True, inplace=True)

In [57]:
lyrics = lyrics.drop(columns=['line'])

lyrics = lyrics.groupby('track_title').agg({
    'album_name': 'first',
    'track_n': 'first',
     'lyric': ' '.join
}).reset_index()

lyrics.drop_duplicates(subset=['lyric'])

Unnamed: 0,track_title,album_name,track_n,lyric
0,...Ready for It?,reputation,1,Knew he was a killer first time that I saw him...
1,22,Red (Deluxe Edition),6,It feels like a perfect night To dress up like...
2,A Perfectly Good Heart,Taylor Swift,14,Why would you wanna break a perfectly good hea...
3,A Place In This World,Taylor Swift,4,"I don't know what I want, so don't ask me 'Cau..."
4,Afterglow,Lover,15,"I blew things out of proportion, now you're bl..."
...,...,...,...,...
158,​r​ight where you left me,evermore (deluxe version),16,"Friends break up, friends get married Stranger..."
159,​the lakes,folklore (deluxe version),17,Is it romantic how all my elegies eulogize me?...
160,​the last great american dynasty,folklore (deluxe version),3,"Rebekah rode up on the afternoon train, it was..."
161,​tolerate it,evermore (deluxe version),5,I sit and watch you reading with your head low...


In [58]:
for index, row in lyrics.iterrows():
    print(index,row['track_title'])

0 ...Ready for It?
1 22
2 A Perfectly Good Heart
3 A Place In This World
4 Afterglow
5 All Too Well
6 All You Had to Do Was Stay
7 Back to December
8 Bad Blood
9 Begin Again
10 Better Than Revenge
11 Blank Space
12 Breathe (Taylor’s Version)
13 Bye Bye Baby (Taylor’s Version) [From the Vault]
14 Call It What You Want
15 Change (Taylor’s Version)
16 Clean
17 Cold As You
18 Come Back... Be Here
19 Come In With the Rain (Taylor’s Version)
20 Cornelia Street
21 Cruel Summer
22 Dancing with Our Hands Tied
23 Daylight
24 Dear John
25 Death by a Thousand Cuts
26 Delicate
27 Don’t Blame Me
28 Don’t You (Taylor’s Version) [From the Vault]
29 Dress
30 Enchanted
31 End Game
32 Everything Has Changed
33 False God
34 Fearless (Taylor’s Version)
35 Fifteen (Taylor’s Version)
36 Forever & Always (Piano Version) [Taylor’s Version]
37 Forever & Always (Taylor’s Version)
38 Getaway Car
39 Girl at Home
40 Gorgeous
41 Haunted
42 Hey Stephen (Taylor’s Version)
43 Holy Ground
44 How You Get the Girl
45 I Al

In [59]:
lyrics = lyrics.drop(index=[36,80,90,97,116])

In [60]:
lyrics = lyrics.reset_index()
for index, row in lyrics.iterrows():
    print(index,row['track_title'])

0 ...Ready for It?
1 22
2 A Perfectly Good Heart
3 A Place In This World
4 Afterglow
5 All Too Well
6 All You Had to Do Was Stay
7 Back to December
8 Bad Blood
9 Begin Again
10 Better Than Revenge
11 Blank Space
12 Breathe (Taylor’s Version)
13 Bye Bye Baby (Taylor’s Version) [From the Vault]
14 Call It What You Want
15 Change (Taylor’s Version)
16 Clean
17 Cold As You
18 Come Back... Be Here
19 Come In With the Rain (Taylor’s Version)
20 Cornelia Street
21 Cruel Summer
22 Dancing with Our Hands Tied
23 Daylight
24 Dear John
25 Death by a Thousand Cuts
26 Delicate
27 Don’t Blame Me
28 Don’t You (Taylor’s Version) [From the Vault]
29 Dress
30 Enchanted
31 End Game
32 Everything Has Changed
33 False God
34 Fearless (Taylor’s Version)
35 Fifteen (Taylor’s Version)
36 Forever & Always (Taylor’s Version)
37 Getaway Car
38 Girl at Home
39 Gorgeous
40 Haunted
41 Hey Stephen (Taylor’s Version)
42 Holy Ground
43 How You Get the Girl
44 I Almost Do
45 I Did Something Bad
46 I Forgot That You Exi

In [61]:
def lyrics_soup(x):
    return x['album_name'] + " " + x['track_title']  + " " + x['lyric']

In [62]:
lyrics["DOCUMENT"] = lyrics.apply(lyrics_soup, axis=1)

In [63]:
lyrics_result_preprocessing = preprocessing(lyrics, 'lyrics')

In [64]:
lyrics_result_preprocessing["STEMMING"] = lyrics_result_preprocessing.apply(soup_after_lemma, axis=1)

---
### Creating Similarity Matrices

In [65]:
lyrics_books_soups = pd.concat([books_result_preprocessing['STEMMING'], lyrics_result_preprocessing['STEMMING']],ignore_index=True)
lyrics_books_soups

0      pride prejudic satir fiction romanc novel nove...
1      sens sensibl children literatur fiction romanc...
2      lord ring adventur novel specul fiction fantas...
3      time machin scienc fiction children literatur ...
4      invis man albino bia scienc fiction children l...
                             ...                        
641    evermor delux version right left friend break ...
642    folklor delux version lake romant elegi eulog ...
643    folklor delux version last great american dyna...
644    evermor delux version toler sit watch read hea...
645    evermor delux version ti damn season want know...
Name: STEMMING, Length: 646, dtype: object

In [66]:
# lyrics_books_count = CountVectorizer()
# lyrics_books_count.fit(lyrics_books_soups)

lyrics_books_count = TfidfVectorizer()
lyrics_books_count.fit(lyrics_books_soups)

lyrics_to_books_matrix = lyrics_books_count.transform(lyrics_result_preprocessing['STEMMING'])
books_to_lyrics_matrix = lyrics_books_count.transform(books_result_preprocessing['STEMMING'])

In [67]:
cosim_result_lyrics_books = cosine_similarity(lyrics_to_books_matrix, books_to_lyrics_matrix)
cosim_result_books_lyrics = cosine_similarity(books_to_lyrics_matrix, lyrics_to_books_matrix)

In [68]:
books_system = books_result_preprocessing.reset_index()
books_indices = pd.Series(books_system.index, index=books_result_preprocessing['book_title'].apply(lambda x: x.lower() if x is not np.nan else "")).drop_duplicates()

# books_system

In [69]:
lyrics_system = lyrics_result_preprocessing.reset_index()
lyrics_indices = pd.Series(lyrics_system.index, index=lyrics_result_preprocessing['track_title'].apply(lambda x: x.lower() if x is not np.nan else "")).drop_duplicates()

# lyrics_system.head(3)

---
### Recommendation System

In [70]:
def get_books_from_lyrics(title):
    idx = lyrics_indices[title.lower()]
    sim_scores = list(enumerate(cosim_result_lyrics_books[idx]))
    sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)

    sim_scores = sim_scores[:3]


    book_indices = [i[0] for i in sim_scores]
    scores = [i[1] for i in sim_scores]
    # print (scores)

    arr = books_system.iloc[book_indices]

    ctr = 0
    results = []
    for idx, book in arr.iterrows():
        results.append([ctr+1, book['book_title'], scores[ctr], book['summary']])
        ctr += 1

    result_df = pd.DataFrame(results, columns=['Index', 'Book Title', 'Cosine Similarity', 'Summary'])

    return result_df

def get_lyrics_from_books(title):
    idx = books_indices[title.lower()]
    sim_scores = list(enumerate(cosim_result_books_lyrics[idx]))
    sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)

    sim_scores = sim_scores[:3]

    #print(sim_scores)

    lyrics_indices = [i[0] for i in sim_scores]

    scores = [i[1] for i in sim_scores]
    # print (scores)

    arr = lyrics_system.iloc[lyrics_indices]

    ctr = 0;
    results = []
    for idx, lyric in arr.iterrows():
        results.append([ctr+1, lyric['track_title'], scores[ctr], lyric['album_name'], lyric['lyric']])
        ctr += 1

    result_df = pd.DataFrame(results, columns=['Index', 'Track Title', 'Cosine Similarity', 'Album Name', 'Lyric'])

    return result_df

---
### Examples

In [71]:
rand = books_system["book_title"].sample(1).values[0]
rand

'Adam Loveday'

In [72]:
get_lyrics_from_books(rand)

Unnamed: 0,Index,Track Title,Cosine Similarity,Album Name,Lyric
0,1,Dear John,0.044316,Speak Now (Deluxe),Long were the nights when My days once revolve...
1,2,The Best Day (Taylor’s Version),0.032755,Fearless (Taylor’s Version),"I'm five years old, it's getting cold, I've go..."
2,3,This Love,0.028813,1989 (Deluxe),Clear blue water High tide came and brought yo...


In [73]:
get_lyrics_from_books("Shannon's Way")  

Unnamed: 0,Index,Track Title,Cosine Similarity,Album Name,Lyric
0,1,You Belong With Me (Taylor’s Version),0.083723,Fearless (Taylor’s Version),"You're on the phone with your girlfriend, she'..."
1,2,Tim McGraw,0.04289,Taylor Swift,He said the way my blue eyes shined Put those ...
2,3,Enchanted,0.032664,Speak Now (Deluxe),"There I was again tonight Forcing laughter, fa..."


In [74]:
get_lyrics_from_books('Romeo and Juliet')

Unnamed: 0,Index,Track Title,Cosine Similarity,Album Name,Lyric
0,1,Love Story (Taylor’s Version),0.423161,Fearless (Taylor’s Version),We were both young when I first saw you I clos...
1,2,Miss Americana & The Heartbreak Prince,0.025691,Lover,"You know I adore you, I'm crazier for you Than..."
2,3,Come In With the Rain (Taylor’s Version),0.021898,Fearless (Taylor’s Version),I could go back to every laugh But I don't wan...


In [75]:
get_lyrics_from_books('Pride and Prejudice')

Unnamed: 0,Index,Track Title,Cosine Similarity,Album Name,Lyric
0,1,Mr. Perfectly Fine (Taylor’s Version) [From th...,0.288197,Fearless (Taylor’s Version),"Mr. ""Perfect face"" Mr. ""Here to stay"" Mr. ""Loo..."
1,2,Cold As You,0.018839,Taylor Swift,You have a way of coming easily to me And when...
2,3,You Are in Love,0.017085,1989 (Deluxe),"One look, dark room Meant just for you Time mo..."


In [76]:
rand = lyrics["track_title"].sample(1).values[0]
rand

'\u200btolerate it'

In [77]:
get_books_from_lyrics(rand)

Unnamed: 0,Index,Book Title,Cosine Similarity,Summary
0,1,Royal Escape,0.077048,Two years after the execution of his father (...
1,2,The Gospel According to Larry,0.064674,The Gospel According to Larry revolves around...
2,3,Unfinished Portrait,0.064462,"In the midst of divorce, bereft of the only p..."


In [78]:
get_books_from_lyrics('Mr. Perfectly Fine (Taylor’s Version) [From the Vault]')

Unnamed: 0,Index,Book Title,Cosine Similarity,Summary
0,1,Pride and Prejudice,0.288197,"The narrative opens with Mr Bingley, a wealth..."
1,2,Nightmare Abbey,0.267108,Nightmare Abbey is a Gothic topical satire in...
2,3,Emma,0.224042,"Emma Woodhouse, aged 20 at the start of the n..."


In [79]:
get_books_from_lyrics('Dancing with Our Hands Tied')

Unnamed: 0,Index,Book Title,Cosine Similarity,Summary
0,1,Alice Adams,0.127189,The novel begins with Virgil Adams confined t...
1,2,Boy Meets Boy,0.11776,Openly gay sophomore Paul lives in a gay-frie...
2,3,Fire in Stubble,0.107932,The book centres on the love life of Rose Mar...


In [80]:
get_books_from_lyrics('Paper Rings')

Unnamed: 0,Index,Book Title,Cosine Similarity,Summary
0,1,The Lord of the Rings,0.088379,"Long before the events of the novel, the Dark..."
1,2,The Gospel According to Larry,0.08527,The Gospel According to Larry revolves around...
2,3,Undead and Unpopular,0.073222,"Betsy Taylor, Queen of Vampires, is celebrati..."


In [81]:
books_system.sample(5)['book_title']

320               Eva Luna
230                Indiana
77     The Kreutzer Sonata
451              Heartless
104               Scarlett
Name: book_title, dtype: object

In [82]:
lyrics_system.sample(5)['track_title']

73                   Our Song
2      A Perfectly Good Heart
52                   Innocent
144         ​invisible string
146                      ​ivy
Name: track_title, dtype: object