# Welcome to "ML4Recsys : Intro to content-based filtering" Notebook

In this notebook we will try recommend list of film based on one film that the user already watch, so the instruction is:

1. Read the data
2. Make the vector representation
3. Calculate the similarity betweenfilm based on the vector representation

## Read the data

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('SongsDataset.csv')

df.Lirik = df.Lirik.astype(str)
df.head()

Unnamed: 0,NIM,Submisike,Artis,Judul,Lirik
0,1301180000.0,1.0,Reality Club,Is It The Answer?,I make you break\nYou move I take\nLove is the...
1,1301180000.0,2.0,Simple Plan,Jet lag,"Whoa, oh, oh\nWhoa, oh, oh\nSo jet-lagged\n\nW..."
2,1301180000.0,3.0,The Script,Superheroes,All the life she has seen\nAll the meaner side...
3,1301180000.0,4.0,The Script,Breakeven,I'm still alive but I'm barely breathing\nJust...
4,1301180000.0,5.0,Green Day,21 Guns,"Do you know what's worth fighting for,\nWhen i..."


## Preprocessing

In [2]:
%%time
import re

# Converting all words to lower case and removing punctuation
df['Lirik'] = [re.sub(r'\d+\S*', '',
                  row.lower().replace('.', ' ').replace('_', '').replace('/', ''))
                  for row in df['Lirik']]

df['Lirik'] = [re.sub(r'(?:^| )\w(?:$| )', '', row)
                  for row in df['Lirik']]

# Removing numbers
df['Lirik'] = [re.sub(r'\d+', '', row) for row in df['Lirik']]

CPU times: user 71.2 ms, sys: 0 ns, total: 71.2 ms
Wall time: 71.5 ms


In [3]:
df.head()

Unnamed: 0,NIM,Submisike,Artis,Judul,Lirik
0,1301180000.0,1.0,Reality Club,Is It The Answer?,make you break\nyou movetake\nlove is the answ...
1,1301180000.0,2.0,Simple Plan,Jet lag,"whoa, oh, oh\nwhoa, oh, oh\nso jet-lagged\n\nw..."
2,1301180000.0,3.0,The Script,Superheroes,all the life she has seen\nall the meaner side...
3,1301180000.0,4.0,The Script,Breakeven,i'm still alive but i'm barely breathing\njust...
4,1301180000.0,5.0,Green Day,21 Guns,"do you know what's worth fighting for,\nwhen i..."


In [4]:
%%time
import nltk
nltk.download("stopwords")

# Tokenizing comments and putting them into a new column
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')  # by blank space
df['tokens'] = df['Lirik'].apply(tokenizer.tokenize)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
CPU times: user 651 ms, sys: 141 ms, total: 793 ms
Wall time: 743 ms


In [5]:
df.head()

Unnamed: 0,NIM,Submisike,Artis,Judul,Lirik,tokens
0,1301180000.0,1.0,Reality Club,Is It The Answer?,make you break\nyou movetake\nlove is the answ...,"[make, you, break, you, movetake, love, is, th..."
1,1301180000.0,2.0,Simple Plan,Jet lag,"whoa, oh, oh\nwhoa, oh, oh\nso jet-lagged\n\nw...","[whoa, oh, oh, whoa, oh, oh, so, jet, lagged, ..."
2,1301180000.0,3.0,The Script,Superheroes,all the life she has seen\nall the meaner side...,"[all, the, life, she, has, seen, all, the, mea..."
3,1301180000.0,4.0,The Script,Breakeven,i'm still alive but i'm barely breathing\njust...,"[i, m, still, alive, but, i, m, barely, breath..."
4,1301180000.0,5.0,Green Day,21 Guns,"do you know what's worth fighting for,\nwhen i...","[do, you, know, what, s, worth, fighting, for,..."


In [6]:
%%time
# Removing Stopwords & Punctuation
from nltk.corpus import stopwords
#stopwords.words('english')

filtered_words = []
for row in df['tokens']:
    filtered_words.append([
        word.lower() for word in row
        if word.lower() not in nltk.corpus.stopwords.words('english')
    ])

df['tokens'] = filtered_words

CPU times: user 16.3 s, sys: 1.84 s, total: 18.1 s
Wall time: 18.2 s


In [7]:
df.head()

Unnamed: 0,NIM,Submisike,Artis,Judul,Lirik,tokens
0,1301180000.0,1.0,Reality Club,Is It The Answer?,make you break\nyou movetake\nlove is the answ...,"[make, break, movetake, love, answer, say, ifw..."
1,1301180000.0,2.0,Simple Plan,Jet lag,"whoa, oh, oh\nwhoa, oh, oh\nso jet-lagged\n\nw...","[whoa, oh, oh, whoa, oh, oh, jet, lagged, time..."
2,1301180000.0,3.0,The Script,Superheroes,all the life she has seen\nall the meaner side...,"[life, seen, meaner, side, took, away, prophet..."
3,1301180000.0,4.0,The Script,Breakeven,i'm still alive but i'm barely breathing\njust...,"[still, alive, barely, breathing, prayin, togo..."
4,1301180000.0,5.0,Green Day,21 Guns,"do you know what's worth fighting for,\nwhen i...","[know, worth, fighting, worth, dying, take, br..."


In [8]:
%%time
# Setting the Lemmatization object
nltk.download("wordnet")
lmtzr = nltk.stem.wordnet.WordNetLemmatizer()

# Looping through the words and appending the lemmatized version to a list
stemmed_words = []
for row in df['tokens']:
    stemmed_words.append([
        # Verbs
        lmtzr.lemmatize(  
            # Adjectives
            lmtzr.lemmatize(  
                # Nouns
                lmtzr.lemmatize(word.lower()), 'a'), 'v')
        for word in row
        if word.lower() not in nltk.corpus.stopwords.words('english')])

# Adding the list as a column in the data frame
df['tokens'] = stemmed_words

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
CPU times: user 10.8 s, sys: 917 ms, total: 11.7 s
Wall time: 11.7 s


In [9]:
df.head()

Unnamed: 0,NIM,Submisike,Artis,Judul,Lirik,tokens
0,1301180000.0,1.0,Reality Club,Is It The Answer?,make you break\nyou movetake\nlove is the answ...,"[make, break, movetake, love, answer, say, ifw..."
1,1301180000.0,2.0,Simple Plan,Jet lag,"whoa, oh, oh\nwhoa, oh, oh\nso jet-lagged\n\nw...","[whoa, oh, oh, whoa, oh, oh, jet, lag, time, m..."
2,1301180000.0,3.0,The Script,Superheroes,all the life she has seen\nall the meaner side...,"[life, see, mean, side, take, away, prophet, d..."
3,1301180000.0,4.0,The Script,Breakeven,i'm still alive but i'm barely breathing\njust...,"[still, alive, barely, breathe, prayin, togod,..."
4,1301180000.0,5.0,Green Day,21 Guns,"do you know what's worth fighting for,\nwhen i...","[know, worth, fight, worth, die, take, breath,..."


In [10]:
# Appends all words to a list in order to find the unique words
allWords = []
for row in stemmed_words:
    for word in row:
        allWords.append(str(word))
            
uniqueWords = np.unique(allWords)

print('Number of unique words:', len(uniqueWords), '\n')
print('Previewing sample of unique words:\n', uniqueWords[1234:1244])

Number of unique words: 6334 

Previewing sample of unique words:
 ['creation' 'creature' 'credit' 'creed' 'creep' 'crest' 'crib' 'crier'
 'crime' 'crimson']


In [11]:
stemmed_sentences = []

# Spacing out the words in the reviews for each restaurant
for row in df['tokens']:
    stemmed_string = ''
    for word in row:
        stemmed_string = stemmed_string + ' ' + word
    stemmed_sentences.append(stemmed_string)
    
df['tokens'] = stemmed_sentences

In [12]:
df.head()

Unnamed: 0,NIM,Submisike,Artis,Judul,Lirik,tokens
0,1301180000.0,1.0,Reality Club,Is It The Answer?,make you break\nyou movetake\nlove is the answ...,make break movetake love answer say ifwent aw...
1,1301180000.0,2.0,Simple Plan,Jet lag,"whoa, oh, oh\nwhoa, oh, oh\nso jet-lagged\n\nw...",whoa oh oh whoa oh oh jet lag time miss anyth...
2,1301180000.0,3.0,The Script,Superheroes,all the life she has seen\nall the meaner side...,life see mean side take away prophet dream fo...
3,1301180000.0,4.0,The Script,Breakeven,i'm still alive but i'm barely breathing\njust...,still alive barely breathe prayin togod thatd...
4,1301180000.0,5.0,Green Day,21 Guns,"do you know what's worth fighting for,\nwhen i...",know worth fight worth die take breath away f...


## TF/IDF

In [13]:
%%time
import sklearn
# Creating the sklearn object
tfidf = sklearn.feature_extraction.text.TfidfVectorizer(smooth_idf=False)

# Transforming our 'tokens' column into a TF-IDF matrix and then a data frame
tfidf_df = pd.DataFrame(tfidf.fit_transform(df['tokens']).toarray(), 
                        columns=tfidf.get_feature_names())

CPU times: user 77 ms, sys: 8.01 ms, total: 85 ms
Wall time: 85.5 ms




In [14]:
print(tfidf_df.shape)
tfidf_df.head()

(533, 6320)


Unnamed: 0,aaliyah,aback,abandon,abide,able,aboutgirlfriend,abouthouse,abouthundred,aboutlife,absence,absolute,absolutely,abuse,aby,ac,acapulco,accent,accept,accessory,accord,account,accurate,accuse,ache,achestill,achilles,achoo,acquaintance,acre,across,acrossfallen,act,actavis,actfool,actin,action,activity,add,addict,addiction,...,yoop,york,youam,youbeliever,youbouquet,youcan,youdon,youi,youlil,youmillion,youmiss,youneed,young,youngblood,youonce,youphantom,youriver,youseat,yousee,youset,yousick,yousmirk,youstory,youth,youthink,youtime,youwhore,youwill,youwon,ypocrites,yuh,zappa,zaytoven,zenzenzense,zero,ziggy,zimmerman,zipper,zone,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076316,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Removing sparse columns
tfidf_df = tfidf_df[tfidf_df.columns[tfidf_df.sum() > 1.5]]

# Removing any remaining digits
tfidf_df = tfidf_df.filter(regex=r'^((?!\d).)*$')

print(tfidf_df.shape)
tfidf_df.head()

(533, 361)


Unnamed: 0,afraid,ah,alive,allneed,almost,alone,along,alright,always,anddon,andknow,andneed,andwill,angel,another,anymore,anything,apart,arm,around,ask,asleep,away,ayy,babe,baby,back,bad,beat,beautiful,become,bed,begin,behind,believe,best,bitch,bleed,blue,body,...,touch,trouble,true,truth,try,tryna,turn,two,uh,understand,use,voice,wait,wake,walk,wanna,want,waste,watch,water,way,well,whoa,whole,wind,wish,without,wonder,word,work,world,worth,would,wrong,ya,yeah,year,yes,yesterday,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062789,0.0,0.0,0.0,0.0,0.066511,0.0,0.0,0.0,0.149498,0.0,0.0,0.07692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.230865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086157,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.095026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029813,0.203048,0.0,0.0,0.0,0.100439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015714,0.051113,0.020102,0.0,0.0,0.0,0.0,0.0,0.020004,0.102226,0.0,0.031428,0.0,0.0,0.0,0.0,0.0,0.0,0.049383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010139,0.0,0.0,0.0,0.019484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.131377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028329,0.0,0.0,0.0,0.016137,0.0,0.0,0.0,0.0,0.0,0.0,0.046915,0.0,0.0,0.0,0.0,0.0,0.008922,0.0,0.016271,0.0,0.0
3,0.0,0.0,0.074079,0.0,0.0,0.0,0.0,0.0,0.077108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057771,0.0,0.0,0.0,0.0,0.0,0.0,0.06317,0.12634,0.0,0.043425,0.0,0.0,...,0.0,0.041512,0.0,0.0,0.0,0.036357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029648,0.0,0.0,0.0,0.0,0.0,0.0,0.097105,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058411,0.0,0.0,0.0,0.357318,0.0,0.0,0.0,0.040563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.082168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Storing the original data frame before the merge in case of changes
df_orig = df.copy()

# Renaming columns that conflict with column names in tfidfCore
df.rename(columns={'title': 'Title', 
                   'genre': 'Genre', 
                   'director': 'Director', 
                   'actors': 'Actors', 
                   'plot': 'Plot', 
                   'tokens': 'Tokens'}, inplace=True)

# Merging the data frames by index
df = pd.merge(df, tfidf_df, how='inner', left_index=True, right_index=True)

df.head()

Unnamed: 0,NIM,Submisike,Artis,Judul,Lirik,Tokens,afraid,ah,alive,allneed,almost,alone,along,alright,always,anddon,andknow,andneed,andwill,angel,another,anymore,anything,apart,arm,around,ask,asleep,away,ayy,babe,baby,back,bad,beat,beautiful,become,bed,begin,behind,...,touch,trouble,true,truth,try,tryna,turn,two,uh,understand,use,voice,wait,wake,walk,wanna,want,waste,watch,water,way,well,whoa,whole,wind,wish,without,wonder,word,work,world,worth,would,wrong,ya,yeah,year,yes,yesterday,young
0,1301180000.0,1.0,Reality Club,Is It The Answer?,make you break\nyou movetake\nlove is the answ...,make break movetake love answer say ifwent aw...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062789,0.0,0.0,0.0,0.0,0.066511,0.0,0.0,0.0,0.149498,0.0,0.0,0.07692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.230865,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086157,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1301180000.0,2.0,Simple Plan,Jet lag,"whoa, oh, oh\nwhoa, oh, oh\nso jet-lagged\n\nw...",whoa oh oh whoa oh oh jet lag time miss anyth...,0.0,0.0,0.0,0.0,0.0,0.095026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029813,0.203048,0.0,0.0,0.0,0.100439,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015714,0.051113,0.020102,0.0,0.0,0.0,0.0,0.0,0.020004,0.102226,0.0,0.031428,0.0,0.0,0.0,0.0,0.0,0.0,0.049383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1301180000.0,3.0,The Script,Superheroes,all the life she has seen\nall the meaner side...,life see mean side take away prophet dream fo...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010139,0.0,0.0,0.0,0.019484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.131377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028329,0.0,0.0,0.0,0.016137,0.0,0.0,0.0,0.0,0.0,0.0,0.046915,0.0,0.0,0.0,0.0,0.0,0.008922,0.0,0.016271,0.0,0.0
3,1301180000.0,4.0,The Script,Breakeven,i'm still alive but i'm barely breathing\njust...,still alive barely breathe prayin togod thatd...,0.0,0.0,0.074079,0.0,0.0,0.0,0.0,0.0,0.077108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057771,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.041512,0.0,0.0,0.0,0.036357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029648,0.0,0.0,0.0,0.0,0.0,0.0,0.097105,0.0,0.0,0.0,0.0
4,1301180000.0,5.0,Green Day,21 Guns,"do you know what's worth fighting for,\nwhen i...",know worth fight worth die take breath away f...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058411,0.0,0.0,0.0,0.357318,0.0,0.0,0.0,0.040563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.082168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Summary stats of TF-IDF
print('Max:', np.max(tfidf_df.max()), '\n',
      'Mean:', np.mean(tfidf_df.mean()), '\n',
      'Standard Deviation:', np.std(tfidf_df.std()))

Max: 1.0 
 Mean: 0.007970174371637 
 Standard Deviation: 0.013498575693393


##TASK: Please Implement the Recommender Systems Using The Songs dataset (created by all class members)

1. Input: Song title (_st = "Is It The Answer?"), number of recomended songs (_nt = 10 )
2. Process: calculate cosine similarity over TFIDF columns
3. Returning: _nt songs which close to _st based on cosine similarity
4. Powerpoint yang menjelaskan tugas TF/IDF dengan Cosine similarity

Dataset Lagu dapat diisi dan dilihat pada: https://docs.google.com/spreadsheets/d/1vjszULKCcS4LPup3VJ9MofYPiYhcaoXTC4zdohLFwpQ/edit?usp=sharing






---



---



In [18]:
# 2. Process: calculate cosine similarity over TFIDF columns
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_df)

In [21]:
print(cosine_sim)

[[1.         0.05542913 0.09609139 ... 0.02036893 0.12652308 0.10990208]
 [0.05542913 1.         0.11687944 ... 0.08925686 0.06899328 0.10974492]
 [0.09609139 0.11687944 1.         ... 0.10675465 0.10322775 0.27561446]
 ...
 [0.02036893 0.08925686 0.10675465 ... 1.         0.0696057  0.07647248]
 [0.12652308 0.06899328 0.10322775 ... 0.0696057  1.         0.28003586]
 [0.10990208 0.10974492 0.27561446 ... 0.07647248 0.28003586 1.        ]]


In [25]:
def get_index_from_title(st):
  return df[df.Judul == st].index.values[0]

def get_title_from_index(index):
   return df[df.index == index]["Judul"].values[0]

In [26]:
# 1. Input: Song title (_st = "Is It The Answer?"), number of recomended songs (_nt = 10 )
songs_st = input("Judul Lagu Yang Disukai: ")
n = input("Banyak Lagu Yang Ingin Di Rekomendasi: ")
songs_nt = int(n)

# 3. Returning: _nt songs which close to _st based on cosine similarity
songs_index = get_index_from_title(songs_st)
similar_songs = list(enumerate(cosine_sim[songs_index]))
sorted_similar_songs = sorted(similar_songs, key=lambda x:x[1], reverse=True)

print("\n Rekomen Lagu Berdasarkan Yang Memiliki Kemiripan Konten/Lirik:")
i=0
for Judul in sorted_similar_songs:
  print(i,get_title_from_index(Judul[0]))
  i=i+1
  if i>songs_nt:
    break

Judul Lagu Yang Disukai: Snowman
Banyak Lagu Yang Ingin Di Rekomendasi: 10

 Rekomen Lagu Berdasarkan Yang Memiliki Kemiripan Konten/Lirik:
0 Snowman
1 Don't Cry
2 Home
3 No Tears Left To Cry
4 No woman no cry
5 Can't Take My Eyes Off You
6 Pro
7 Hold On
8 Love On Top
9 Love yourself
10 Into The Night


Reference yang digunakan: https://towardsdatascience.com/using-cosine-similarity-to-build-a-movie-recommendation-system-ae7f20842599