In [2]:
import pandas as pd, numpy as np
import ast

In [75]:
# Load the scraped song metadata; undecodable bytes are replaced instead of raising.
dataframe = pd.read_csv(
    "../scraping/out.csv",
    encoding="utf-8",
    encoding_errors="replace",
)

In [76]:
# List the column names of the loaded frame.
dataframe.columns

Index(['title', 'singers', 'directors', 'lyricist', 'genre', 'album',
       'download_link', 'poster', 'year'],
      dtype='object')

In [77]:
# (rows, columns) of the raw load, before any filtering.
dataframe.shape

(5610, 9)

In [84]:
# Peek at the last rows; the credit/genre columns hold stringified Python lists.
dataframe.tail()

Unnamed: 0,title,singers,directors,lyricist,genre,album,download_link,poster,year
5605,"Yeh Sama, Sama Hai Yeh Pyaar Ka",['Nikhita Gandhi'],"['Kalyanji - Anandji', ' Nikhita Gandhi']",['Anand Bakshi'],['Pop'],"Yeh Sama, Sama Hai Yeh Pyaar Ka",https://www.youtube.com/watch?v=D7AmTviGVf0,https://is2-ssl.mzstatic.com/image/thumb/Music...,2020
5606,Zaalima,['Sona Mohapatra'],['JAM8'],['Amitabh Bhattacharya'],['Filmi'],Zaalima,https://www.youtube.com/watch?v=sPoS_aYj7Jw,https://is5-ssl.mzstatic.com/image/thumb/Music...,2020
5607,Zara Thehro,"['Armaan Malik', ' Tulsi Kumar']",['Amaal Mallik'],['Rashmi Virag'],['Filmi'],Zara Thehro,https://www.youtube.com/watch?v=5Up8XW-K-Ik,https://is3-ssl.mzstatic.com/image/thumb/Music...,2020
5608,Zeher,['Bharatt - Saurabh'],['Bharatt - Saurabh'],['Bharatt - Saurabh'],"['Hip-hop', ' Filmi']",Zeher,https://www.youtube.com/watch?v=DG5SfLqZeA8,https://is2-ssl.mzstatic.com/image/thumb/Music...,2020
5609,Zindagi Tere Naam,['Raghav Kapoor'],"['Raghav Kapoor', ' Amdad Ali']",['Raghav Kapoor'],['Filmi'],Zindagi Tere Naam,https://www.youtube.com/watch?v=0viILFQ5-ks,https://is3-ssl.mzstatic.com/image/thumb/Music...,2020


In [85]:
# Count the missing entries in the two credit columns.
missing_mask = dataframe.isna()
null_directors = missing_mask['directors'].sum()
null_lyricist = missing_mask['lyricist'].sum()
print(null_directors, null_lyricist)

61 505


In [86]:
# Drop rows whose title marks an alternate rendition (reprise, remix, cover, ...)
# by case-insensitive keyword match on the title.
# NOTE(review): the pattern matches substrings, so e.g. "mix" or "beat" inside a
# longer word also excludes the row — confirm this is intended.
# na=False treats a missing title as "no match"; without it str.contains yields
# NaN for that row and the `~` negation fails.
dataframe = dataframe[~dataframe['title'].str.contains(r'reprise|title|theme|version|track|edit|redux|unplugged|duet|beat|mix|instrumental|cover|tribute|acoustic', case=False, na=False)]
dataframe.shape

(5049, 9)

In [87]:
# Keep only rows that have every required field, then de-duplicate on the video
# link and renumber the rows from 0.
# Titles are deliberately NOT de-duplicated: two different songs can share a name.
# Non-inplace chaining is used because `dataframe` may be a filtered slice of the
# raw load, where inplace mutation can trigger SettingWithCopyWarning.
dataframe = (
    dataframe
    .dropna(subset=['title', 'singers', 'download_link', 'year', 'poster'])
    .drop_duplicates(subset=['download_link'])
    .reset_index(drop=True)
)
dataframe.shape

(3883, 9)

In [11]:
def convert_to_words(words):
    """Turn a stringified list of names into a space-prefixed string of tokens.

    Every list item is collapsed to a single lower-case token by ``one_word``,
    e.g. ``"['Armaan Malik', ' Tulsi Kumar']"`` -> ``" armaanmalik tulsikumar"``.

    Args:
        words: Text holding a Python-literal list of strings. Empty, missing
            (None/NaN) or unparsable values are tolerated.

    Returns:
        str: ``" "`` followed by the space-joined tokens, or a single ``" "``
        when ``words`` is empty, missing or cannot be parsed. The result is
        always a string so it can be concatenated onto other text.
    """
    try:
        if not words:
            # Same fallback as the error path: always hand back a string.
            return " "
        items = ast.literal_eval(words)
        return " " + " ".join([one_word(item) for item in items])
    except (ValueError, TypeError, SyntaxError, AttributeError, MemoryError, RecursionError):
        # Best effort: NaN, malformed literals, non-iterables and non-string
        # items all contribute no tokens.
        return " "
def one_word(s):
    """Lower-case ``s`` and remove all whitespace so it forms one token."""
    return "".join(s.lower().split())

In [88]:
# Build one free-text "tags" string per song: the title as written, then the
# normalised tokens of each list-valued column, then the album name.
new_df = pd.DataFrame()
new_df['tags'] = dataframe['title']
for list_column in ('singers', 'directors', 'lyricist', 'genre'):
    new_df['tags'] += dataframe[list_column].apply(convert_to_words)
# `album` is not part of the dropna subset, so it may be missing; a NaN here
# would turn the whole tags string into NaN.
new_df['tags'] += " " + dataframe['album'].fillna("")

In [13]:
# Show the tags string built for the first song.
new_df['tags'].iloc[0]

'Allah Teri Kya Shaan Hai jaannissarlone kamalkhan jaannissarlone sahilfatehpuri filmi sufi/qawwali 18.11 (A Code Of Secrecy)'

In [89]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectoriser: vocabulary capped at 10000 terms, English stop words removed.
cv = CountVectorizer(max_features=10000,stop_words='english')

In [90]:
# Fit the vocabulary on the tags and materialise the term-count matrix as a dense array.
vector = cv.fit_transform(new_df['tags']).toarray()

In [16]:
# Development sanity check: confirms the tags column is a pandas Series.
type(new_df['tags'])

pandas.core.series.Series

In [91]:
# (songs, vocabulary terms)
vector.shape

(3883, 8120)

In [92]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between every pair of song tag vectors.
similarity = cosine_similarity(vector)

In [93]:
# Square matrix: one row and one column per song.
similarity.shape

(3883, 3883)

In [94]:
# For every song, store the row positions of its most similar songs, best first.
# The song's own position is removed explicitly instead of skipping the first
# entry: when another song ties with it (e.g. identical tags) the song itself is
# not guaranteed to sort first, so a blind [1:] could keep it and drop a real
# neighbour. A stable sort keeps the ordering of ties deterministic.
N_NEIGHBOURS = 50
neighbour_rows = []
for position, row in enumerate(similarity):
    ranked = np.argsort(-row, kind="stable")
    neighbour_rows.append(ranked[ranked != position][:N_NEIGHBOURS])
sorted_indices = np.array(neighbour_rows)
np.save('similarity.npy', sorted_indices)

In [None]:
# Reload the saved neighbour table and show the neighbours stored for the first song.
arr = np.load('similarity.npy')
list(arr[0])

In [None]:
def recommend(song, top_n=50):
    """Return the songs most similar to ``song``, best match first.

    Reads the notebook-level ``dataframe`` (song metadata) and ``similarity``
    (square song-by-song score matrix); row ``i`` of ``similarity`` is taken to
    describe the song at position ``i`` of ``dataframe``.

    Args:
        song: Exact title to look up. If several rows share the title, the
            first one is used.
        top_n: Maximum number of recommendations to return (default 50).

    Returns:
        list[dict]: One ``row.to_dict()`` per recommended song. The queried
        song itself is never included.

    Raises:
        IndexError: If no row of ``dataframe`` has the given title.
    """
    matches = np.flatnonzero((dataframe['title'] == song).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no song titled {song!r} in dataframe")
    # Positional lookup, so this does not depend on the frame's index labels.
    position = int(matches[0])
    ranked = np.argsort(-similarity[position], kind="stable")
    # Drop the song itself explicitly: with tied scores it need not rank first.
    ranked = ranked[ranked != position][:top_n]
    return [dataframe.iloc[i].to_dict() for i in ranked]

In [None]:
# Smoke test: fetch recommendations for the first song in the frame.
r = recommend(dataframe.iloc[0].title)

In [95]:
# Columns to persist as the song database. .copy() detaches the result from
# `dataframe`, so later changes to either frame do not affect the other.
save_df = dataframe[['title', 'download_link', 'year']].copy()

In [96]:
# Check the last rows of the table that will be persisted.
save_df.tail()

Unnamed: 0,title,download_link,year
3878,"Yeh Sama, Sama Hai Yeh Pyaar Ka",https://www.youtube.com/watch?v=D7AmTviGVf0,2020
3879,Zaalima,https://www.youtube.com/watch?v=sPoS_aYj7Jw,2020
3880,Zara Thehro,https://www.youtube.com/watch?v=5Up8XW-K-Ik,2020
3881,Zeher,https://www.youtube.com/watch?v=DG5SfLqZeA8,2020
3882,Zindagi Tere Naam,https://www.youtube.com/watch?v=0viILFQ5-ks,2020


In [97]:
# Persist the song table in pandas' pickle format.
save_df.to_pickle('songs.pkl')

In [98]:
# Round-trip check: reload the pickled song table and display it.
# NOTE: unpickling can execute arbitrary code; only load pickle files you produced yourself.
df = pd.read_pickle('songs.pkl')
df

Unnamed: 0,title,download_link,year
0,Allah Teri Kya Shaan Hai,https://www.youtube.com/watch?v=rF7JYlu2mhM,2014
1,Yeh Zamin Yeh Aasman Roshni Se Nahaane Lage,https://www.youtube.com/watch?v=6av6kNVx-0I,2014
2,Mera Yaar Thanedaar,https://www.youtube.com/watch?v=djznDrQ05wA,2014
3,Eagle Sa Ego Hai,https://www.youtube.com/watch?v=mI5qs83q6Z8,2014
4,Aandhi Jaisi Raftar Chal Chala Chal (Chal Chal...,https://www.youtube.com/watch?v=8R1rs5dvqyM,2014
...,...,...,...
3878,"Yeh Sama, Sama Hai Yeh Pyaar Ka",https://www.youtube.com/watch?v=D7AmTviGVf0,2020
3879,Zaalima,https://www.youtube.com/watch?v=sPoS_aYj7Jw,2020
3880,Zara Thehro,https://www.youtube.com/watch?v=5Up8XW-K-Ik,2020
3881,Zeher,https://www.youtube.com/watch?v=DG5SfLqZeA8,2020


### User search recommendations

In [None]:
def join_words(words):
    """Turn a stringified list of names into one space-prefixed string.

    Unlike a normalising conversion, the items are joined exactly as written
    (case and inner/leading whitespace preserved), e.g.
    ``"['Armaan Malik', ' Tulsi Kumar']"`` -> ``" Armaan Malik  Tulsi Kumar"``.

    Args:
        words: Text holding a Python-literal list of strings. Empty, missing
            (None/NaN) or unparsable values are tolerated.

    Returns:
        str: ``" "`` followed by the space-joined items, or a single ``" "``
        when ``words`` is empty, missing or cannot be parsed. The result is
        always a string so it can be concatenated onto other text.
    """
    try:
        if not words:
            # Same fallback as the error path: always hand back a string.
            return " "
        items = ast.literal_eval(words)
        return " " + " ".join(items)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Best effort: NaN, malformed literals, non-iterables and non-string
        # items all contribute nothing.
        return " "

In [None]:
user_search = 'aaj na jaana'

# Searchable text per song: title, credits as written, album and year.
# The frame is created under the same name the following lines use
# (`song_titles`); previously it was bound to `song_title`, so this cell raised
# NameError on a fresh kernel.
song_titles = pd.DataFrame()
song_titles['title'] = dataframe['title']
song_titles['tags'] = dataframe['title'] + dataframe['singers'].apply(join_words) + dataframe['directors'].apply(join_words) + dataframe['lyricist'].apply(join_words)
# `album` is not part of the dropna subset; a NaN here would turn the whole
# tags string into NaN.
song_titles['tags'] = song_titles['tags'] + " " + dataframe['album'].fillna("")
song_titles['tags'] = song_titles['tags'] + " " + dataframe['year'].astype(str)
print(song_titles.shape)
song_titles.tail()

In [None]:
# Save the per-song search text as a NumPy array.
# NOTE(review): despite the file name, this array holds the tag strings, not
# similarity scores. Presumably it is an object-dtype array, which np.load
# only reads back with allow_pickle=True — confirm.
search_words = np.array(song_titles['tags'])
print(len(search_words))
np.save('search_similarity.npy', search_words)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Fit a TF-IDF vocabulary on the search text and vectorise every song.
vectorizer = TfidfVectorizer()
all_transform = vectorizer.fit_transform(song_titles['tags'])

In [None]:
# Inspect the fitted song matrix.
all_transform

In [None]:
# Project the user's query into the vocabulary fitted on the songs.
query_transform = vectorizer.transform([user_search])

In [None]:
# Cosine similarity of the query against every song.
similar_songs = cosine_similarity(query_transform, all_transform)

In [None]:
# Row positions of the 50 best matches for the query, best first.
songs = np.argsort(-similar_songs[0]).tolist()[:50]

### Homepage songs (50 most recent by year)

In [None]:
# The 50 rows with the largest `year` values (ascending sort, then the last 50).
# NOTE(review): this rebinds `songs`, which the search section uses for a list
# of row positions, to a DataFrame.
songs = dataframe.sort_values(by = "year").iloc[-50:]

In [None]:
# Display the homepage selection.
songs