In [40]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# nltk processing
import nltk
from nltk.tag import pos_tag
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
import re
import breakinghits_script as bh

In [41]:
bh.show_tables()

bh_invited_user
bh_login_activity
bh_messenger
bh_mgenre
bh_music_views
bh_pages_activity
bh_search_activity
bh_shared_activity
bh_social_link_activity
cities
countries
regions
user_activities
user_following
user_music_album
user_music_comments
user_music_votes
user_musics
user_saved
user_saves
user_spotlight
users


In [42]:
bh.pull_dataframe("select * from user_musics");

In [43]:
df = bh.pull_dataframe("select * from user_music_comments").drop(['id', 'date_added'],axis=1)

In [44]:
df.shape

(52, 3)

In [45]:
df.head(5)
temp = df.drop(['user_music_id', 'comment'],axis=1)
temp['user_music_id'] = df['user_music_id']
temp['comment'] = df['comment']

In [46]:
df = temp.copy()
df.head(5)

Unnamed: 0,user_id,user_music_id,comment
0,1,1,nice music!
1,1,2,i wanted to hear more.. just like this
2,1,1,great song..
3,1,1,awesome... !
4,458,45,Nice


In [47]:
rate_df = bh.pull_dataframe("select * from user_music_votes").drop(['id','date_added'],axis=1);

In [48]:
rate_df.head(5)

Unnamed: 0,user_id,user_music_id,rating
0,298,15,4
1,298,16,4
2,298,7,5
3,298,17,4
4,296,7,5


In [49]:
reviews = df.merge(rate_df, how='left')

In [50]:
df.drop(['user_id'], inplace=True,axis=1)

In [51]:
#bh.pull_dataframe('select id, genre from bh_mgenre')

In [52]:
music = bh.pull_dataframe('select id,genre from user_musics')
genre = bh.pull_dataframe('select id,genre from bh_mgenre')

In [53]:
x = music.merge(genre, left_on='genre', right_on='id').drop(['genre_x', 'id_y'],axis=1)
x.columns = ['user_music_id', 'comment']

In [54]:
descriptions = pd.concat([df,x], axis=0)

In [55]:
review_docs = descriptions.groupby(['user_music_id'])['comment'].apply(lambda x: ''.join(x)).reset_index()

In [56]:
review_docs

Unnamed: 0,user_music_id,comment
0,0,Love the Song.
1,1,nice music!great song.. awesome... !this is co...
2,2,i wanted to hear more.. just like thisyow just...
3,6,meehhh!Love this song!!!!Country
4,7,Song has a great bass line!Bass line is great!...
5,8,Country
6,9,R&B
7,10,R&B
8,11,i would add to the bridgei would add to the br...
9,12,R&B


In [57]:
stopwrds = stopwords.words('english')
# aux function to clean up text
def cleaning_text(sentence):
    sentence = str(sentence)
    sentence = sentence.lower()
    sentence = re.sub('[^\w\s]',' ', sentence)
    sentence = re.sub('_',' ', sentence)
    sentence = re.sub('\d+',' ', sentence)
    cleaned = ' '.join([w for w in sentence.split() if not w in stopwrds])
    cleaned = ' '.join([w for w in cleaned.split() if not len(w)<=2 ])
    cleaned = cleaned.strip()
    return cleaned

In [58]:
#nltk.download()

In [59]:
review_docs['textClean'] = review_docs.apply(lambda row: cleaning_text(row['comment'].encode("utf8")), axis=1)

In [109]:
review_docs.sort_values('user_music_id');

In [61]:
corpus = list(review_docs.textClean)
vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
# tfidf matrix
tfidf_matrix = vectorizer.fit_transform(corpus)
idf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print(tfidf_matrix.todense())

[[ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.43296522  0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]


In [62]:
tfidf_matrix = tfidf_matrix.todense()

In [110]:
# recommendation based on tfidf
from sklearn.metrics.pairwise import cosine_similarity
def find_similar(tfidf_matrix, index, top_n = 5):
    cosine_similarities = cosine_similarity(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities if i != index]
    return([(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n])

In [114]:
find_similar(tfidf_matrix,10,len(review_docs))

[(51, 0.21986553316205731),
 (27, 0.18250251321555191),
 (57, 0.0),
 (14, 0.0),
 (25, 0.0),
 (24, 0.0),
 (23, 0.0),
 (22, 0.0),
 (21, 0.0),
 (20, 0.0),
 (19, 0.0),
 (18, 0.0),
 (17, 0.0),
 (16, 0.0),
 (15, 0.0),
 (12, 0.0),
 (13, 0.0),
 (11, 0.0),
 (9, 0.0),
 (8, 0.0),
 (7, 0.0),
 (6, 0.0),
 (5, 0.0),
 (4, 0.0),
 (3, 0.0),
 (2, 0.0),
 (1, 0.0),
 (26, 0.0),
 (28, 0.0),
 (56, 0.0),
 (29, 0.0),
 (55, 0.0),
 (54, 0.0),
 (53, 0.0),
 (52, 0.0),
 (50, 0.0),
 (49, 0.0),
 (48, 0.0),
 (47, 0.0),
 (46, 0.0),
 (45, 0.0),
 (44, 0.0),
 (43, 0.0),
 (42, 0.0),
 (41, 0.0),
 (40, 0.0),
 (39, 0.0),
 (38, 0.0),
 (37, 0.0),
 (36, 0.0),
 (35, 0.0),
 (34, 0.0),
 (33, 0.0),
 (32, 0.0),
 (31, 0.0),
 (30, 0.0),
 (0, 0.0)]