In [52]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline
import ipywidgets
import csv
import json
import spacy
nlp = spacy.load('en_core_web_sm')
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import nltk 
import string
from gensim.corpora import Dictionary
from gensim.models import Phrases
from gensim.models.phrases import Phraser

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

---
### Books Dataset

In [58]:
data =[]
with open('booksummaries.txt','r', encoding='utf-8') as f:
    reader = csv.reader(f, dialect='excel-tab')
    for row in reader:
        data.append(row)

books = pd.DataFrame.from_records(data, columns=['book_id', 'freebase_id', 'book_title', 'author', 'publication_date', 'genre', 'summary'])

In [59]:
def parse_genre_entry(genre_info):
    if genre_info == '':
        return []
    genre_dict = json.loads(genre_info)
    genres = list(genre_dict.values())
    return genres

books['genre'] = books['genre'].apply(parse_genre_entry)

In [60]:
books = books.drop(columns=['freebase_id'])

In [61]:
books.head(3)

Unnamed: 0,book_id,book_title,author,publication_date,genre,summary
0,620,Animal Farm,George Orwell,1945-08-17,"[Roman à clef, Satire, Children's literature, ...","Old Major, the old boar on the Manor Farm, ca..."
1,843,A Clockwork Orange,Anthony Burgess,1962,"[Science Fiction, Novella, Speculative fiction...","Alex, a teenager living in near-future Englan..."
2,986,The Plague,Albert Camus,1947,"[Existentialism, Fiction, Absurdist fiction, N...",The text of The Plague is divided into five p...


In [62]:
def clean_ids(x):
    try:
        return int(x)
    except:
        return np.nan

books[''] = books['book_id'].apply(clean_ids)
books = books[books['book_id'].notnull()]

In [64]:
books.shape[0]

16559

In [65]:
books = books.sample(5000)
books.shape[0]

5000

In [66]:
def book_soup(x):
    genres = ""
    for genre in x['genre']:
        genres += genre + " "
    return f"{x['book_title']} {genres} {x['summary']}"

In [67]:
books["soup"] = books.apply(book_soup, axis=1)
# books.sample()

books.iloc[635]

book_id                                                      15290077
book_title                                       Waiting for The Rain
author                                                  Sheila Gordon
publication_date                                                     
genre                                         [Children's literature]
summary              When the book begins, Tengo and Frikkie are t...
                                                             15290077
soup                Waiting for The Rain Children's literature   W...
Name: 11509, dtype: object

In [68]:
def sanitize(x):
    if isinstance(x, list):
        return [str.lower(i.replace('/','')) for i in x]
        return [str.lower(i.replace('(','')) for i in x]
        return [str.lower(i.replace(')','')) for i in x]
        return [str.lower(i.replace("'",'')) for i in x]
        return [str.lower(i.replace("\\",'')) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace('/', ''))
            return str.lower(x.replace('(', ''))
            return str.lower(x.replace(')', ''))
            return str.lower(x.replace("'", ''))
            return str.lower(x.replace('\\', ''))
        else:
            return ''

In [69]:
books["soup"] = books["soup"].apply(sanitize)

books.head(3)

Unnamed: 0,book_id,book_title,author,publication_date,genre,summary,Unnamed: 7,soup
14646,24940694,Anima,,,[Mystery],"Megan hopes to be an actress, and goes to Lon...",24940694,"anima mystery megan hopes to be an actress, ..."
9585,10613695,Knitting Under the Influence,,,[],Sometimes it feels like their weekly knitting...,10613695,knitting under the influence sometimes it fe...
2114,1121236,Crabwalk,Günter Grass,2002.0,[Novel],The narrator of the novella is the journalist...,1121236,crabwalk novel the narrator of the novella i...


In [71]:
stop_words= stopwords.words('english') + list(string.punctuation)
# += list(['fire', 'emblem', 'game','awakening', 'conquest', 'fate', 'birthright', 'revelations', 'echo', 'shadow', 'valentia' ,'three', 'house', 'engage', 'play', 'buy', 'rpg'])
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [73]:
def lemmatization(texts,allowed_postags=['NOUN','ADJ','VERB','ADV']):
    """https://spacy.io/api/annotation"""
    data_words=simple_preprocess(str(texts))
    doc=nlp(' '.join(data_words))
    tokens = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    tokens = [word for word in tokens if word not in stop_words]
    # tokens = [word for word in tokens if word not in positive_words]
    # tokens = [word for word in tokens if word not in negative_words]

    return tokens

In [84]:
def soup_after_lemma(x):
    soups = ""
    for text in x['soup']:
        soups += text + " "
    return soups

In [74]:
books['soup'] = books['soup'].apply(lemmatization)


In [85]:
books["soup"] = books.apply(soup_after_lemma, axis=1)

In [87]:
books

Unnamed: 0,book_id,book_title,author,publication_date,genre,summary,Unnamed: 7,soup
14646,24940694,Anima,,,[Mystery],"Megan hopes to be an actress, and goes to Lon...",24940694,anima mystery megan hope actress go audition t...
9585,10613695,Knitting Under the Influence,,,[],Sometimes it feels like their weekly knitting...,10613695,knit influence sometimes feel weekly knitting ...
2114,1121236,Crabwalk,Günter Grass,2002,[Novel],The narrator of the novella is the journalist...,1121236,crabwalk novel narrator novella journalist bea...
5139,3983954,.hack//AI buster 2,,,[Science Fiction],"Haruka Mizuhara, whose name was misspelled in...",3983954,hackai buster science name misspell translatio...
12200,17370186,Sasquatch,Roland Smith,,[Fantasy],When Dylan accompanies his father to a meetin...,17370186,sasquatch fantasy accompany father meeting big...
...,...,...,...,...,...,...,...,...
13115,19998458,The Unforgiving Wind,John Harris,1963-01-01,[Adventure novel],An expedition is planned by Commander Adams a...,19998458,unforgiving wind adventure novel expedition pl...
11715,16036731,Star Bridge,Jack Williamson,1955,[Science Fiction],The scattered planets are held together by th...,16036731,science fiction scatter planet hold together e...
11733,16081895,Lilith: A Snake in the Grass,Jack L. Chalker,1981,"[Science Fiction, Speculative fiction]","The Confederacy, a massive space empire, dupl...",16081895,grass science fiction speculative fiction conf...
7000,5913404,Skybreaker,Kenneth Oppel,2005-07-20,"[Speculative fiction, Fantasy, Steampunk, Adve...","Matt Cruse has been stationed on the Flotsam,...",5913404,skybreaker speculative fiction fantasy steampu...


---
### Taylor Swift Songs Dataset

In [76]:
ts_1 = pd.read_csv("01-taylor_swift.csv")
ts_2 = pd.read_csv("02-fearless_taylors_version.csv")
ts_3 = pd.read_csv("03-speak_now_deluxe_package.csv")
ts_4 = pd.read_csv("04-red_deluxe_edition.csv")
ts_5 = pd.read_csv("05-1989_deluxe.csv")
ts_6 = pd.read_csv("06-reputation.csv")
ts_7 = pd.read_csv("07-lover.csv")
ts_8 = pd.read_csv("08-folklore_deluxe_version.csv")
ts_9 = pd.read_csv("09-evermore_deluxe_version.csv")

lyrics = pd.concat([ts_1, ts_2, ts_3, ts_4, ts_5, ts_6, ts_7, ts_8, ts_9])
lyrics.reset_index(drop=True, inplace=True)

In [77]:
lyrics.sample(5)

Unnamed: 0,album_name,track_title,track_n,lyric,line
7936,evermore (deluxe version),​ivy,10,And drink my husband's wine,41
660,Fearless (Taylor’s Version),Fifteen (Taylor’s Version),2,This is life before you know who you're gonna be,13
5433,reputation,King of My Heart,10,'Cause all the boys and their expensive cars,8
7146,folklore (deluxe version),​illicit affairs,10,A million little times,26
2366,Speak Now (Deluxe),Innocent,11,And everybody believed in you?,8


In [78]:
lyrics = lyrics.drop(columns=['line'])

lyrics = lyrics.groupby('track_title').agg({
    'album_name': 'first',  # You can use 'first' to retain the first occurrence of the author
    'track_n': 'first',
     'lyric': ' '.join
}).reset_index()

lyrics.drop_duplicates(subset=['lyric'])

lyrics['lyric']

0      Knew he was a killer first time that I saw him...
1      It feels like a perfect night To dress up like...
2      Why would you wanna break a perfectly good hea...
3      I don't know what I want, so don't ask me 'Cau...
4      I blew things out of proportion, now you're bl...
                             ...                        
158    Friends break up, friends get married Stranger...
159    Is it romantic how all my elegies eulogize me?...
160    Rebekah rode up on the afternoon train, it was...
161    I sit and watch you reading with your head low...
162    If I wanted to know who you were hanging with ...
Name: lyric, Length: 163, dtype: object

In [80]:
def lyrics_soup(x):
    return x['album_name'] + " " + x['track_title']  + " " + x['lyric']

In [81]:
lyrics["soup"] = lyrics.apply(lyrics_soup, axis=1)
lyrics["soup"] = lyrics["soup"].apply(sanitize)
lyrics.sample()

Unnamed: 0,track_title,album_name,track_n,lyric,soup
141,​coney island,evermore (deluxe version),9,Break my soul in two looking for you But you'r...,evermore (deluxe version) ​coney island break ...


In [82]:
lyrics['soup'] = lyrics['soup'].apply(lemmatization)

In [88]:
lyrics["soup"] = lyrics.apply(soup_after_lemma, axis=1)

In [89]:
lyrics.head()

Unnamed: 0,track_title,album_name,track_n,lyric,soup
0,...Ready for It?,reputation,1,Knew he was a killer first time that I saw him...,reputation ready know first time see wonder ma...
1,22,Red (Deluxe Edition),6,It feels like a perfect night To dress up like...,red deluxe edition feel perfect night dress hi...
2,A Perfectly Good Heart,Taylor Swift,14,Why would you wanna break a perfectly good hea...,swift perfectly good heart wanna break perfect...
3,A Place In This World,Taylor Swift,4,"I don't know what I want, so don't ask me 'Cau...",swift place world know want ask still try figu...
4,Afterglow,Lover,15,"I blew things out of proportion, now you're bl...",blow thing proportion blue put jail pin hand b...


---
### Creating Similarity Matrices

In [90]:
books_lyrics_soups = pd.concat([books['soup'], lyrics['soup']],ignore_index=True)

In [91]:
books_lyrics_count = CountVectorizer(stop_words = "english")
books_lyrics_count.fit(books_lyrics_soups)

books_to_lyrics_matrix = books_lyrics_count.transform(books['soup'])
lyrics_to_books_matrix = books_lyrics_count.transform(lyrics['soup'])

print(books_to_lyrics_matrix.shape, lyrics_to_books_matrix.shape)

(5000, 43876) (163, 43876)


In [92]:
cosine_sim_books_lyrics = cosine_similarity(books_to_lyrics_matrix, lyrics_to_books_matrix)
cosine_sim_books_lyrics

array([[0.04635508, 0.02847816, 0.00813459, ..., 0.11995427, 0.06236211,
        0.04303619],
       [0.09388312, 0.09155068, 0.04314887, ..., 0.02239114, 0.04811513,
        0.0674463 ],
       [0.03940182, 0.0316424 , 0.00542306, ..., 0.07352036, 0.04157474,
        0.05379524],
       ...,
       [0.00191942, 0.0104817 , 0.03817382, ..., 0.00320447, 0.06197324,
        0.02375991],
       [0.08311239, 0.04713226, 0.01795068, ..., 0.05123307, 0.07568827,
        0.0534197 ],
       [0.05573051, 0.00507229, 0.        , ..., 0.01240562, 0.01332889,
        0.01724679]])

In [93]:
cosine_sim_lyrics_books = cosine_similarity(lyrics_to_books_matrix, books_to_lyrics_matrix)
cosine_sim_lyrics_books

array([[0.04635508, 0.09388312, 0.03940182, ..., 0.00191942, 0.08311239,
        0.05573051],
       [0.02847816, 0.09155068, 0.0316424 , ..., 0.0104817 , 0.04713226,
        0.00507229],
       [0.00813459, 0.04314887, 0.00542306, ..., 0.03817382, 0.01795068,
        0.        ],
       ...,
       [0.11995427, 0.02239114, 0.07352036, ..., 0.00320447, 0.05123307,
        0.01240562],
       [0.06236211, 0.04811513, 0.04157474, ..., 0.06197324, 0.07568827,
        0.01332889],
       [0.04303619, 0.0674463 , 0.05379524, ..., 0.02375991, 0.0534197 ,
        0.01724679]])

In [94]:
books = books.reset_index()
books_indices = pd.Series(books.index, index=books['book_title'].apply(lambda x: x.lower() if x is not np.nan else "")).drop_duplicates()

books.head()

Unnamed: 0,index,book_id,book_title,author,publication_date,genre,summary,Unnamed: 8,soup
0,14646,24940694,Anima,,,[Mystery],"Megan hopes to be an actress, and goes to Lon...",24940694,anima mystery megan hope actress go audition t...
1,9585,10613695,Knitting Under the Influence,,,[],Sometimes it feels like their weekly knitting...,10613695,knit influence sometimes feel weekly knitting ...
2,2114,1121236,Crabwalk,Günter Grass,2002.0,[Novel],The narrator of the novella is the journalist...,1121236,crabwalk novel narrator novella journalist bea...
3,5139,3983954,.hack//AI buster 2,,,[Science Fiction],"Haruka Mizuhara, whose name was misspelled in...",3983954,hackai buster science name misspell translatio...
4,12200,17370186,Sasquatch,Roland Smith,,[Fantasy],When Dylan accompanies his father to a meetin...,17370186,sasquatch fantasy accompany father meeting big...


In [96]:
lyrics = lyrics.reset_index()
lyrics_indices = pd.Series(lyrics.index, index=lyrics['track_title'].apply(lambda x: x.lower() if x is not np.nan else "")).drop_duplicates()

lyrics.head()

Unnamed: 0,level_0,index,track_title,album_name,track_n,lyric,soup
0,0,0,...Ready for It?,reputation,1,Knew he was a killer first time that I saw him...,reputation ready know first time see wonder ma...
1,1,1,22,Red (Deluxe Edition),6,It feels like a perfect night To dress up like...,red deluxe edition feel perfect night dress hi...
2,2,2,A Perfectly Good Heart,Taylor Swift,14,Why would you wanna break a perfectly good hea...,swift perfectly good heart wanna break perfect...
3,3,3,A Place In This World,Taylor Swift,4,"I don't know what I want, so don't ask me 'Cau...",swift place world know want ask still try figu...
4,4,4,Afterglow,Lover,15,"I blew things out of proportion, now you're bl...",blow thing proportion blue put jail pin hand b...


In [97]:
class RecommendationEngine:
    def get_books_from_lyrics(title):
        try:
            idx = lyrics_indices[title.lower()]
            sim_scores = list(enumerate(cosine_sim_lyrics_books[idx]))
            sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)

            sim_scores = sim_scores[:5]

            book_indices = [i[0] for i in sim_scores]

            return books.iloc[book_indices]
        except :
            print("No song found in our database. Please try again.")

    def get_lyrics_from_books(title):
        try:
            idx = books_indices[title.lower()]
            sim_scores = list(enumerate(cosine_sim_books_lyrics[idx]))
            sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)

            sim_scores = sim_scores[:5]

            lyrics_indices = [i[0] for i in sim_scores]

            return lyrics.iloc[lyrics_indices]
        except :
            print("No book found in our database. Please try again.")

In [98]:
rand = books["book_title"].sample(1).values[0]
rand

'The Great Eight'

In [99]:
RecommendationEngine.get_lyrics_from_books(rand)

Unnamed: 0,level_0,index,track_title,album_name,track_n,lyric,soup
27,27,27,Don’t Blame Me,reputation,4,"Don't blame me, love made me crazy If it doesn...",reputation blame blame love make crazy doin ri...
61,61,61,Long Live,Speak Now (Deluxe),14,"I said, remember this moment, in the back of m...",speak long live say remember moment back mind ...
57,57,57,Jump Then Fall (Taylor’s Version),Fearless (Taylor’s Version),14,I like the way you sound in the morning We're ...,version jump fall taylor version way sound mor...
42,42,42,Hey Stephen (Taylor’s Version),Fearless (Taylor’s Version),4,"Mmm-mm, mm-mm Mmm-mm, mm-mm Mmm-mm, mm-mm, yea...",version mmm mmm know look deceive know see lig...
53,53,53,Innocent,Speak Now (Deluxe),11,I guess you really did it this time Left yours...,speak deluxe innocent guess really time leave ...


In [101]:
# RecommendationEngine.get_lyrics_from_books('The Ramsay Scallop')

No book found in our database. Please try again.


In [102]:
rand = lyrics["track_title"].sample(1).values[0]
rand

'I Knew You Were Trouble'

In [103]:
RecommendationEngine.get_books_from_lyrics(rand)

Unnamed: 0,index,book_id,book_title,author,publication_date,genre,summary,Unnamed: 8,soup
1034,11911,16614375,Ivan the Terrible,Anne Fine,2007-06-04,[Children's literature],It is Ivan's first day of school. He can only...,16614375,terrible child literature first day school spe...
3646,15290,27550660,Sugar and Spice,Lauren Katherine Conrad,2010-10-05,[],Lauren Conrad's second L.A. Candy book opens ...,27550660,candy book open red carpet premiere season red...
175,5900,4844614,Henry and Ribsy,Beverly Cleary,,"[Children's literature, Fiction, Novel]","Like most of the Henry Huggins books, the inc...",4844614,ribsy child literature fiction novel huggin bo...
2679,13528,21162721,The White Tiger,,1987,"[Thriller, Fiction]","Lu Hong, a policeman in Beijing finds himself...",21162721,white tiger find trouble death strange circums...
2882,16244,32570419,The Russian Concubine,,2007,[Historical fiction],The story begins in 1917 when a five-year-old...,32570419,russian concubine historical fiction story beg...


In [106]:
RecommendationEngine.get_books_from_lyrics('Love Story (Taylor’s Version)')

Unnamed: 0,index,book_id,book_title,author,publication_date,genre,summary,Unnamed: 8,soup
609,2015,1049941,Black Sun,Edward Abbey,1971,"[Western, Fiction, Romance novel]",The book is divided into three parts: In the ...,1049941,black sun western fiction romance novel book d...
2537,12944,19488801,Lulu Atlantis and the Quest for True Blue Love,,2008-01-08,[],Lulu Lantis lives in Sweet Pea Lane. She has ...,19488801,quest true blue live sweet pea lane baby broth...
4556,3168,2010476,Scarlett,Alexandra Ripley,1991-09,"[Travel, Novel, History, Historical fiction, R...",The book begins where Gone with the Wind left...,2010476,travel novel history historical fiction romanc...
1487,6923,5833227,My Cousin Rachel,Daphne du Maurier,1951,"[Mystery, Novel, Romance novel]","The story of ""My Cousin Rachel"" begins with A...",5833227,cousin mystery novel romance novel story cousi...
3604,4337,3133672,The Scapegoat,Daphne du Maurier,1957,[Crime Fiction],"'I left the car by the side of the cathedral,...",3133672,crime fiction leave car side cathedral walk st...


In [581]:
books.sample(5)['book_title']

5607                               Pamela
10124               The House at Riverton
16398          Debt: The First 5000 Years
3245     The Mystery of the Aztec Warrior
6370               Murder Being Once Done
Name: book_title, dtype: object

In [582]:
lyrics.sample(5)['track_title']

124    You All Over Me (Taylor’s Version) [From the V...
34                           Fearless (Taylor’s Version)
99       That’s When (Taylor’s Version) [From the Vault]
56                         I’m Only Me When I’m with You
22                           Dancing with Our Hands Tied
Name: track_title, dtype: object