# Youtube Search Term Exploration
#### By: Tim Santos 
<ztdsantos@globe.com.ph>
<timothyisrael.santos@thinkbiganalytics.com>

#### Background:
- Present to AMP and identify potential use case and interest
- To explore search term data and apply basic NLP techniques 
- To explore search term categorization

Accommpanying presentation: https://goo.gl/cbC57D

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVR
from sklearn import svm, metrics, linear_model
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import io
import requests

### Load the CSV File of AMP-Labelled Data  to Pandas Dataframe

In [2]:
adata=pd.read_csv('/data/raw_data/zted0040/amp_data.csv')

In [3]:
adata.head()

Unnamed: 0,searchTerm,videoID,categoryID,category
0,jeremi planez,--5u48IaR4M,10.0,Music
1,kundiman cho,--B791t__ok,22.0,People_and_Blogs
2,art garfunkel the sound of silence,--DbgPXwLlM,10.0,Music
3,sowmoy,--GmYWoFyJ4,23.0,Comedy
4,20r,--XVqynW3-M,22.0,People_and_Blogs


### Load the CSV File of AMP-Labelled Data  to Pandas Dataframe

In [4]:
summary_data=pd.read_csv('/data/raw_data/zted0040/summary.csv')

In [5]:
summary_data.head()

Unnamed: 0,cluster_id,count
0,Adult,374
1,Autos_and_Vehicles,141
2,Comedy,427
3,Education,786
4,Entertainment,3775


### Display the search term categories defined by AMP

In [6]:
for cat in summary_data.cluster_id:
    print(cat)

Adult
Autos_and_Vehicles
Comedy
Education
Entertainment
Film_and_Animation
Gaming
Howto_and_Style
Movies
Music
News_and_Politics
No_Category_Found
Nonprofits_and_Activism
People_and_Blogs
Pets_and_Animals
Science_and_Technology
Shows
Sports
Trailers
Travel_and_Events


## Processing and Feature Extraction

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

### Do some processing here

- cleaning
- parsing
- stemming
- lemmatization
- POS-tagging
- chunking
- sentenizer

In [8]:
adata_parsed = adata 

In [9]:
# Some basic parsing (removed known html tags for youtube)
adata_parsed.searchTerm=adata_parsed.searchTerm.str.replace('gl', '').str.replace('en', '').str.replace('hl', '')
# adata_parsed=adata_parsed.searchTerm.dropna()

In [10]:
adata_parsed.head()

Unnamed: 0,searchTerm,videoID,categoryID,category
0,jeremi planez,--5u48IaR4M,10.0,Music
1,kundiman cho,--B791t__ok,22.0,People_and_Blogs
2,art garfunkel the sound of silce,--DbgPXwLlM,10.0,Music
3,sowmoy,--GmYWoFyJ4,23.0,Comedy
4,20r,--XVqynW3-M,22.0,People_and_Blogs


## Vectorize and Show top Unigram terms

In [38]:
# Search only Music category
adata_parsed_temp=adata[adata.category=='Music'].searchTerm
tvec = TfidfVectorizer(min_df=.0001, max_df=0.95,  stop_words='english', ngram_range=[1,1])
tvec_weights = tvec.fit_transform(adata_parsed_temp)
weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
temp_weights = weights_df.sort_values(by='weight', ascending=False).head(200)
temp_weights.head(10)

Unnamed: 0,term,weight
2359,love,0.018035
2400,lyrics,0.014737
3547,sm,0.009634
3581,song,0.008842
2290,like,0.0064
1249,ed,0.006292
1160,don,0.006055
2264,let,0.005519
3582,songs,0.005322
3451,sheeran,0.005234


### Export the top Unigram to CSV (for visualization purposes)

In [41]:
for cat in summary_data.cluster_id:
    
    adata_parsed_temp=adata[adata.category==cat].searchTerm
    tvec = TfidfVectorizer(min_df=.0001, max_df=0.95,  stop_words='english', ngram_range=[1,1])
    tvec_weights = tvec.fit_transform(adata_parsed_temp)

    weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()
    weights_df = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
    temp_weights = weights_df.sort_values(by='weight', ascending=False).head(200)
    temp_weights.weight=temp_weights.weight/temp_weights.weight.max()    
    np_df = temp_weights.as_matrix()
    print(cat+'.csv')

    with open(cat+'_cnt.csv', 'wb') as f:
        fs = ['cluster_id','type','word','count']
        writer = csv.writer(f)
        writer.writerow(fs)
        for i in np_df:
            writer.writerow([cat,'unigram',i[0],round(i[1]*100)])
    print(cat+'_cnt.csv SAVED!')


Adult.csv
Adult.csv SAVED!
Autos_and_Vehicles.csv
Autos_and_Vehicles.csv SAVED!
Comedy.csv
Comedy.csv SAVED!
Education.csv
Education.csv SAVED!
Entertainment.csv
Entertainment.csv SAVED!
Film_and_Animation.csv
Film_and_Animation.csv SAVED!
Gaming.csv
Gaming.csv SAVED!
Howto_and_Style.csv
Howto_and_Style.csv SAVED!
Movies.csv
Movies.csv SAVED!
Music.csv
Music.csv SAVED!
News_and_Politics.csv
News_and_Politics.csv SAVED!
No_Category_Found.csv
No_Category_Found.csv SAVED!
Nonprofits_and_Activism.csv
Nonprofits_and_Activism.csv SAVED!
People_and_Blogs.csv
People_and_Blogs.csv SAVED!
Pets_and_Animals.csv
Pets_and_Animals.csv SAVED!
Science_and_Technology.csv
Science_and_Technology.csv SAVED!
Shows.csv
Shows.csv SAVED!
Sports.csv
Sports.csv SAVED!
Trailers.csv
Trailers.csv SAVED!
Travel_and_Events.csv
Travel_and_Events.csv SAVED!


Export the actual search terms to csv files

In [42]:
with open(cat+'_details.csv', 'wb') as f:
    fs = ['cluster_id','paragraph']
    writer = csv.writer(f)
    writer.writerow(fs)
    for i in adata_parsed_temp:
        writer.writerow([cat,i])


### Export the top Bigram to CSV (for visualization purposes)


In [43]:
for cat in summary_data.cluster_id:
    
    adata_parsed_temp=adata[adata.category==cat].searchTerm
    tvec = TfidfVectorizer(min_df=.0001, max_df=0.95,  stop_words='english', ngram_range=[2,2])
    tvec_weights = tvec.fit_transform(adata_parsed_temp)

    weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()
    weights_df = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
    temp_weights = weights_df.sort_values(by='weight', ascending=False).head(200)
    temp_weights.weight=temp_weights.weight/temp_weights.weight.max()    
    np_df = temp_weights.as_matrix()
    print(cat+'.csv')
    with open(cat+'_cnt.csv', 'a') as f:
        fs = ['cluster_id','type','word','count']
        writer = csv.writer(f)
        writer.writerow(fs)
        for i in np_df:
            writer.writerow([cat,'bigram',i[0],round(i[1]*100)])
    print(cat+'_cnt.csv SAVED!')

Adult.csv
Adult_cnt.csv SAVED!
Autos_and_Vehicles.csv
Autos_and_Vehicles_cnt.csv SAVED!
Comedy.csv
Comedy_cnt.csv SAVED!
Education.csv
Education_cnt.csv SAVED!
Entertainment.csv
Entertainment_cnt.csv SAVED!
Film_and_Animation.csv
Film_and_Animation_cnt.csv SAVED!
Gaming.csv
Gaming_cnt.csv SAVED!
Howto_and_Style.csv
Howto_and_Style_cnt.csv SAVED!
Movies.csv
Movies_cnt.csv SAVED!
Music.csv
Music_cnt.csv SAVED!
News_and_Politics.csv
News_and_Politics_cnt.csv SAVED!
No_Category_Found.csv
No_Category_Found_cnt.csv SAVED!
Nonprofits_and_Activism.csv
Nonprofits_and_Activism_cnt.csv SAVED!
People_and_Blogs.csv
People_and_Blogs_cnt.csv SAVED!
Pets_and_Animals.csv
Pets_and_Animals_cnt.csv SAVED!
Science_and_Technology.csv
Science_and_Technology_cnt.csv SAVED!
Shows.csv
Shows_cnt.csv SAVED!
Sports.csv
Sports_cnt.csv SAVED!
Trailers.csv
Trailers_cnt.csv SAVED!
Travel_and_Events.csv
Travel_and_Events_cnt.csv SAVED!


### Export the top Trigram to CSV (for visualization purposes)

In [45]:
for cat in summary_data.cluster_id:
    
    adata_parsed_temp=adata[adata.category==cat].searchTerm
    tvec = TfidfVectorizer(min_df=.0001, max_df=0.95,  stop_words='english', ngram_range=[3,3])
    tvec_weights = tvec.fit_transform(adata_parsed_temp)

    weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()
    weights_df = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
    temp_weights = weights_df.sort_values(by='weight', ascending=False).head(200)
    temp_weights.weight=temp_weights.weight/temp_weights.weight.max()    
    np_df = temp_weights.as_matrix()
    print(cat+'.csv')
    with open(cat+'_cnt.csv', 'a') as f:
        fs = ['cluster_id','type','word','count']
        writer = csv.writer(f)
        writer.writerow(fs)
        for i in np_df:
            writer.writerow([cat,'trigram',i[0],round(i[1]*100)])
    print(cat+'_cnt.csv SAVED!')

Adult.csv
Adult_cnt.csv SAVED!
Autos_and_Vehicles.csv
Autos_and_Vehicles_cnt.csv SAVED!
Comedy.csv
Comedy_cnt.csv SAVED!
Education.csv
Education_cnt.csv SAVED!
Entertainment.csv
Entertainment_cnt.csv SAVED!
Film_and_Animation.csv
Film_and_Animation_cnt.csv SAVED!
Gaming.csv
Gaming_cnt.csv SAVED!
Howto_and_Style.csv
Howto_and_Style_cnt.csv SAVED!
Movies.csv
Movies_cnt.csv SAVED!
Music.csv
Music_cnt.csv SAVED!
News_and_Politics.csv
News_and_Politics_cnt.csv SAVED!
No_Category_Found.csv
No_Category_Found_cnt.csv SAVED!
Nonprofits_and_Activism.csv
Nonprofits_and_Activism_cnt.csv SAVED!
People_and_Blogs.csv
People_and_Blogs_cnt.csv SAVED!
Pets_and_Animals.csv
Pets_and_Animals_cnt.csv SAVED!
Science_and_Technology.csv
Science_and_Technology_cnt.csv SAVED!
Shows.csv
Shows_cnt.csv SAVED!
Sports.csv
Sports_cnt.csv SAVED!
Trailers.csv
Trailers_cnt.csv SAVED!
Travel_and_Events.csv
Travel_and_Events_cnt.csv SAVED!


Select Only Music Data

In [52]:
adata_parsed_music=adata

In [53]:
adata_parsed_music.category[adata_parsed_music.category!='Music']=0
adata_parsed_music.category[adata_parsed_music.category=='Music']=1
# adata_parsed_music.category=adata_parsed_music.category.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [54]:
adata_parsed_music.category=adata_parsed_music.category.astype(int)

In [55]:
len(adata_parsed_music[adata_parsed_music.category==1])

17365

In [56]:
len(adata_parsed_music[adata_parsed_music.category==0])

16609

In [57]:
y=adata_parsed_music.category

In [None]:
## Split Train-Test
train, validate, test = np.split(adata_parsed_music.sample(frac=1), 
                                 [int(.6*len(adata_parsed_music)), 
                                  int(.8*len(adata_parsed_music))])
## Extract Features:Term Frequency times Inverse Document Frequency (tf-idf)
tfidf_transformer = TfidfVectorizer(min_df=.0001, max_df=0.95,  stop_words='english', ngram_range=[1,3])

# Use transform() method to transform count-matrix to 'tf-idf' representation
x_train_tfidf = tfidf_transformer.fit_transform(train['searchTerm'])

## Train SVM/SGD Classifier


In [69]:
clf = linear_model.SGDClassifier(loss='log')
SG = clf.fit(x_train_tfidf.toarray(), train['category'])

## Prediction on test data
# Tokenizing test phrase
# x_test_counts_ft_sel = count_vector_ft_sel.transform(test['searchTerm'])
# Use transform() method to transform test count-matrix to 'tf-idf' representation
# x_test_tfidf_ft_sel = tfidf_transformer_ft_sel.transform(x_test_counts_ft_sel)
x_test_tfidf = tfidf_transformer.transform(test['searchTerm'])

predicted=SG.predict(x_test_tfidf)

    
test['predicted']=predicted    

In [67]:
import nltk
from nltk.metrics import ConfusionMatrix
from sklearn.metrics import classification_report

### Compute Confusion Matrix for Prediction

In [70]:
cm = nltk.ConfusionMatrix(test['category'], predicted)

print(cm)

print(classification_report(test['category'], predicted))

  |    0    1    2 |
--+----------------+
0 |<1629>1712    . |
1 |  365<3089>   . |
2 |    .    .   <.>|
--+----------------+
(row = reference; col = test)

             precision    recall  f1-score   support

          0       0.82      0.49      0.61      3341
          1       0.64      0.89      0.75      3454

avg / total       0.73      0.69      0.68      6795



## Topic Modelling

Perform theme extraction uusing LDA to discover underlying themes

In [74]:
import gensim
from gensim import corpora
from sklearn.feature_extraction.text import  CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

### Extract Features using Count Vectorizer
For LDA and NMF, frequency counts will suffice

In [80]:
adata_parsed=adata
# Some basic parsing (removed known html tags for youtube)
adata_parsed.searchTerm=adata_parsed.searchTerm.str.replace('gl', '').str.replace('en', '').str.replace('hl', '')
# adata_parsed=adata_parsed.searchTerm.dropna()

In [None]:
tfvec = CountVectorizer(min_df=.0001, max_df=0.95,  stop_words='english', ngram_range=[2,2])
tfvec_weights = tfvec.fit_transform(adata_parsed.searchTerm)

In [88]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=100000, stop_words='english', ngram_range=[2,2])
tfidf = tfidf_vectorizer.fit_transform(adata_parsed.searchTerm)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=100000, stop_words='english', ngram_range=[2,2])
tf = tf_vectorizer.fit_transform(adata_parsed.searchTerm)
tf_feature_names = tf_vectorizer.get_feature_names()

In [89]:
no_topics = 20

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tvec_weights)

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tfvec_weights)

In [93]:
### Helper Function to display topics

In [90]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print "Topic %d:" % (topic_idx)
        print " ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]])

### Display LDA Topics and Corresponding Top Tokens


In [92]:
no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
2015 sm 42 maryzark 33 miles 30 sm 30 seconds 80 e0 80 e6 20 sorry 20 scarlet 20 sm
Topic 1:
81 d9 83 88 83 87 2016 best 20 sa 20 sarah 20 scarlet 20 sm 20 soapdish 83 ad
Topic 2:
83 81 81 97 21 guns 25 2017 80 99t 83 a1 83 ad 20 soapdish 20 sarah 20 scarlet
Topic 3:
81 e3 2016 best 83 8a 83 ad 20 soapdish 20 sabihin 20 sarah 20 scarlet 20 sm 20 sorry
Topic 4:
22 2015 70 80 83 ad 20 steve 20 sarah 20 scarlet 20 sm 20 soapdish 20 sorry 20 sub
Topic 5:
83 96 2x faster 20 curse 60 70 83 ad 20 steve 20 scarlet 20 sm 20 soapdish 20 sorry
Topic 6:
82 92 21 21 20 tj 20 tanging 20 tagalog 20 tadhana 20 sud 20 sub 20 steve 20 sorry
Topic 7:
81 84 20 yohan 16 latest 20 scarlet 20 sm 20 soapdish 20 sorry 20 steve 20 sub 20 sud
Topic 8:
80 eb 83 83 83 ad 20 sarah 20 scarlet 20 sm 20 soapdish 20 sorry 20 steve 20 sa
Topic 9:
20 sarah 2013 sm 80 93 83 ad 20 sabihin 20 tj 20 tanging 20 tagalog 20 tadhana 20 sud
Topic 10:
16 sm 81 8a 83 ad 20 sabihin 20 tj 20 tanging 20 tagalog 20 tadhana 20 

### Display NMF Topics and Corresponding Top Tokens

In [91]:
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
2015 sm 42 maryzark 33 miles 30 sm 30 seconds 80 e0 80 e6 20 sorry 20 scarlet 20 sm
Topic 1:
81 d9 83 88 83 87 2016 best 20 sa 20 sarah 20 scarlet 20 sm 20 soapdish 83 ad
Topic 2:
83 81 81 97 21 guns 25 2017 80 99t 83 a1 83 ad 20 soapdish 20 sarah 20 scarlet
Topic 3:
81 e3 2016 best 83 8a 83 ad 20 soapdish 20 sabihin 20 sarah 20 scarlet 20 sm 20 sorry
Topic 4:
22 2015 70 80 83 ad 20 steve 20 sarah 20 scarlet 20 sm 20 soapdish 20 sorry 20 sub
Topic 5:
83 96 2x faster 20 curse 60 70 83 ad 20 steve 20 scarlet 20 sm 20 soapdish 20 sorry
Topic 6:
82 92 21 21 20 tj 20 tanging 20 tagalog 20 tadhana 20 sud 20 sub 20 steve 20 sorry
Topic 7:
81 84 20 yohan 16 latest 20 scarlet 20 sm 20 soapdish 20 sorry 20 steve 20 sub 20 sud
Topic 8:
80 eb 83 83 83 ad 20 sarah 20 scarlet 20 sm 20 soapdish 20 sorry 20 steve 20 sa
Topic 9:
20 sarah 2013 sm 80 93 83 ad 20 sabihin 20 tj 20 tanging 20 tagalog 20 tadhana 20 sud
Topic 10:
16 sm 81 8a 83 ad 20 sabihin 20 tj 20 tanging 20 tagalog 20 tadhana 20 