Kannan: performing decomposition and representing the textual features (Title, Content & Keywords) as n-component vectors using LDA, NMF and LSI.

In [3]:
#Author: kannan / extracting LDA, NMF & LSI features
import pandas as pd
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

from datetime import datetime
import joblib

#### Reading the extracted content file and the keywords file

In [4]:
# Workbooks read by this notebook (article content, keyword list) and the workbook it writes.
in_filepath = '../data/output/SS_Extracted_content.xlsx'
in_filepath_kw = '../data/output/2_keywords_list.xlsx'
out_filepath = '../data/output/2_NLP_LDA_NMF_LSI.xlsx'

# Load both input sheets in one pass: `df` holds the articles, `df_kw` the keywords per Id.
df, df_kw = (pd.read_excel(source) for source in (in_filepath, in_filepath_kw))

df.head()

Unnamed: 0,Id,url,title,html,content
0,1,http://mashable.com/2014/09/01/americans-held-...,Americans Held in North Korea Ask U.S. for Ass...,<!DOCTYPE html>\n<html data-env='production' l...,"PYONGYANG, North Korea — North Korea gave fore..."
1,2,http://mashable.com/2014/09/01/apple-visa-mast...,"Apple Partners With Visa, MasterCard, AmEx for...",<!DOCTYPE html>\n<html data-env='production' l...,According to new reports from Bloomberg and ot...
2,3,http://mashable.com/2014/09/01/aussie-football...,Aussie Football Players Dress Up as Rolf Harri...,<!DOCTYPE html>\n<html data-env='production' l...,SYDNEY — Two Australian football players are a...
3,4,http://mashable.com/2014/09/01/australia-gover...,Australia Government Buys Bombproof BMW Fleet ...,<!DOCTYPE html>\n<html data-env='production' l...,SYDNEY — The Australia Prime Minister has orde...
4,5,http://mashable.com/2014/09/01/australia-jane-...,Author Compares Women in Traditional Marriages...,<!DOCTYPE html>\n<html data-env='production' l...,"SYDNEY — On Australian panel show Q&A, author ..."


In [5]:
df_kw.head()

Unnamed: 0,Id,keywords
0,1,"north-korea, uncategorized, us-world, world, d..."
1,2,"apple, visa, mobile-payments, american-express..."
2,3,"australia, uncategorized, us-world, sports, ma..."
3,4,"australia, bmw, uncategorized, us-world, tony-..."
4,5,"q-a, australia, uncategorized, tv, us-world, j..."


In [6]:
# Add the keyword list to each article; articles with no row in df_kw are kept (left join on Id).
df = df.merge(df_kw, how='left', on='Id')

In [7]:
df.head()

Unnamed: 0,Id,url,title,html,content,keywords
0,1,http://mashable.com/2014/09/01/americans-held-...,Americans Held in North Korea Ask U.S. for Ass...,<!DOCTYPE html>\n<html data-env='production' l...,"PYONGYANG, North Korea — North Korea gave fore...","north-korea, uncategorized, us-world, world, d..."
1,2,http://mashable.com/2014/09/01/apple-visa-mast...,"Apple Partners With Visa, MasterCard, AmEx for...",<!DOCTYPE html>\n<html data-env='production' l...,According to new reports from Bloomberg and ot...,"apple, visa, mobile-payments, american-express..."
2,3,http://mashable.com/2014/09/01/aussie-football...,Aussie Football Players Dress Up as Rolf Harri...,<!DOCTYPE html>\n<html data-env='production' l...,SYDNEY — Two Australian football players are a...,"australia, uncategorized, us-world, sports, ma..."
3,4,http://mashable.com/2014/09/01/australia-gover...,Australia Government Buys Bombproof BMW Fleet ...,<!DOCTYPE html>\n<html data-env='production' l...,SYDNEY — The Australia Prime Minister has orde...,"australia, bmw, uncategorized, us-world, tony-..."
4,5,http://mashable.com/2014/09/01/australia-jane-...,Author Compares Women in Traditional Marriages...,<!DOCTYPE html>\n<html data-env='production' l...,"SYDNEY — On Australian panel show Q&A, author ...","q-a, australia, uncategorized, tv, us-world, j..."


In [8]:
df.shape

(7795, 6)

#### feature extraction - decomposition

In [9]:
# Number of components requested from every decomposition below (LDA, NMF, TruncatedSVD).
# The same value sets how many feature columns are generated per model when the results are added to df.
NUM_TOPICS=5

In [10]:
df.iloc[0]['title']
#df.iloc[0]['content']

'Americans Held in North Korea Ask U.S. for Assistance'

In [11]:
# Article titles as plain text, one entry per row of `df`, in the same order.
# Missing titles become '' rather than the literal string 'nan' (which is what
# `.values.astype('U')` produces and which would then be tokenised as a word).
# Nothing is filtered out: the decomposition output is attached back to `df`
# row by row later on, so this list has to stay aligned with `df`.
titles = df['title'].fillna('').astype(str).tolist()
titles[0:5]

['Americans Held in North Korea Ask U.S. for Assistance',
 'Apple Partners With Visa, MasterCard, AmEx for iPhone 6 Payments: Reports',
 'Aussie Football Players Dress Up as Rolf Harris and Victim',
 'Australia Government Buys Bombproof BMW Fleet for G20 Summit',
 'Author Compares Women in Traditional Marriages to Prostitutes on TV Panel Show']

In [12]:
# Article body text as plain text, one entry per row of `df`, in the same order.
# Missing content becomes '' rather than the literal string 'nan' (which is what
# `.values.astype('U')` produces and which would then be tokenised as a word).
# Nothing is filtered out, so the list stays aligned with the rows of `df`.
content = df['content'].fillna('').astype(str).tolist()
#content[0:2]

In [13]:
# Keyword string of every article, one entry per row of `df`, in the same order.
# The keywords come from a left join, so articles without a keyword row hold a
# missing value here; it becomes '' rather than the literal string 'nan' (which
# is what `.values.astype('U')` produces and which would be tokenised as a word).
# Nothing is filtered out, so the list stays aligned with the rows of `df`.
keywords = df['keywords'].fillna('').astype(str).tolist()
keywords[0:5]

['north-korea, uncategorized, us-world, world, detainees, matthew-miller, jeffrey-fowle',
 'apple, visa, mobile-payments, american-express, mastercard, uncategorized, business, apps-software, mobile',
 'australia, uncategorized, us-world, sports, mad-monday, afl',
 'australia, bmw, uncategorized, us-world, tony-abbott',
 'q-a, australia, uncategorized, tv, us-world, jane-caro']

In [14]:
%%time
#count vectorizing the data
#reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
vectorizerT = CountVectorizer(analyzer='word',min_df=5, max_df=0.95, 
                             stop_words='english', lowercase=True, encoding='utf-8',
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
vectorizerC = CountVectorizer(analyzer='word',min_df=5, max_df=0.95, 
                             stop_words='english', lowercase=True, encoding='utf-8',
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
vectorizerK = CountVectorizer(analyzer='word',min_df=5, max_df=0.95, 
                             stop_words='english', lowercase=True, encoding='utf-8',
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')

title_vectorized = vectorizerT.fit_transform(titles)
content_vectorized = vectorizerC.fit_transform(content)
keyword_vectorized = vectorizerK.fit_transform(keywords)

Wall time: 5.76 s


In [15]:
# Persist the three fitted vectorizers (title, content, keyword) for later reuse.
vectorizer_targets = [
    (vectorizerT, '../data/output/models/vectorizerT.pkl'),
    (vectorizerC, '../data/output/models/vectorizerC.pkl'),
    (vectorizerK, '../data/output/models/vectorizerK.pkl'),
]
for fitted, target in vectorizer_targets:
    written = joblib.dump(fitted, target)
# Show what the last dump returned, as the cell did before.
written

['../data/output/models/vectorizerK.pkl']

In [16]:
%%time
# Build a Latent Dirichlet Allocation Model
lda_modelT = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_modelC = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_modelK = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_title = lda_modelT.fit_transform(title_vectorized)
lda_content = lda_modelC.fit_transform(content_vectorized)
lda_keyword = lda_modelK.fit_transform(keyword_vectorized)
print(lda_title.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(7795, 5)
Wall time: 2min 23s


In [17]:
# Persist the three fitted LDA models (title, content, keyword) for later reuse.
lda_targets = [
    (lda_modelT, '../data/output/models/lda_modelT.pkl'),
    (lda_modelC, '../data/output/models/lda_modelC.pkl'),
    (lda_modelK, '../data/output/models/lda_modelK.pkl'),
]
for fitted, target in lda_targets:
    written = joblib.dump(fitted, target)
# Show what the last dump returned, as the cell did before.
written

['../data/output/models/lda_modelK.pkl']

In [18]:
%%time
# Build a Non-Negative Matrix Factorization Model
nmf_modelT = NMF(n_components=NUM_TOPICS)
nmf_modelC = NMF(n_components=NUM_TOPICS)
nmf_modelK = NMF(n_components=NUM_TOPICS)
nmf_title = nmf_modelT.fit_transform(title_vectorized)
nmf_content = nmf_modelC.fit_transform(content_vectorized)
nmf_keyword = nmf_modelK.fit_transform(keyword_vectorized)
print(nmf_title.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(7795, 5)
Wall time: 5.89 s


In [19]:
# Persist the three fitted NMF models (title, content, keyword) for later reuse.
nmf_targets = [
    (nmf_modelT, '../data/output/models/nmf_modelT.pkl'),
    (nmf_modelC, '../data/output/models/nmf_modelC.pkl'),
    (nmf_modelK, '../data/output/models/nmf_modelK.pkl'),
]
for fitted, target in nmf_targets:
    written = joblib.dump(fitted, target)
# Show what the last dump returned, as the cell did before.
written

['../data/output/models/nmf_modelK.pkl']

In [20]:
%%time
# Build a Latent Semantic Indexing Model
lsi_modelT = TruncatedSVD(n_components=NUM_TOPICS)
lsi_modelC = TruncatedSVD(n_components=NUM_TOPICS)
lsi_modelK = TruncatedSVD(n_components=NUM_TOPICS)
lsi_title = lsi_modelT.fit_transform(title_vectorized)
lsi_content = lsi_modelC.fit_transform(content_vectorized)
lsi_keyword = lsi_modelK.fit_transform(keyword_vectorized)
print(lsi_title.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(7795, 5)
Wall time: 1.27 s


In [21]:
# Persist the three fitted LSI (TruncatedSVD) models (title, content, keyword) for later reuse.
lsi_targets = [
    (lsi_modelT, '../data/output/models/lsi_modelT.pkl'),
    (lsi_modelC, '../data/output/models/lsi_modelC.pkl'),
    (lsi_modelK, '../data/output/models/lsi_modelK.pkl'),
]
for fitted, target in lsi_targets:
    written = joblib.dump(fitted, target)
# Show what the last dump returned, as the cell did before.
written

['../data/output/models/lsi_modelK.pkl']

In [22]:
# (label, document-by-component matrix) pairs, grouped by text field.
# The first five characters of each label become the column prefix of the exported features.
models = [
    ('LDA_Title', lda_title),
    ('NMF_Title', nmf_title),
    ('LSI_Title', lsi_title),
    ('LDA_Content', lda_content),
    ('NMF_Content', nmf_content),
    ('LSI_Content', lsi_content),
    ('LDA_Keyword', lda_keyword),
    ('NMF_Keyword', nmf_keyword),
    ('LSI_Keyword', lsi_keyword),
]

In [23]:
# Attach every decomposition matrix to `df` as NUM_TOPICS feature columns per model.
# The matrices are aligned with `df` by row position (through df's index) instead of
# fabricating Ids as "row number + 1", which silently attaches features to the wrong
# article whenever the Ids are not exactly 1..N in row order.
feature_frames = []
for name, matrix in models:
    if matrix.shape[0] != len(df):
        raise ValueError(
            name + ' has ' + str(matrix.shape[0]) + ' rows but df has ' + str(len(df))
            + '; the text lists must contain exactly one entry per row of df'
        )
    # Column prefix is the first five characters of the label, e.g. 'LDA_Title' -> 'LDA_T0'.
    cols = [name[:5] + str(i) for i in range(NUM_TOPICS)]
    feature_frames.append(pd.DataFrame(matrix, columns=cols, index=df.index))

# Drop feature columns left over from an earlier run of this cell so that re-running
# it replaces them instead of producing duplicates, then add all new columns at once.
feature_cols = [col for frame in feature_frames for col in frame.columns]
df = pd.concat([df.drop(columns=feature_cols, errors='ignore')] + feature_frames, axis=1)

In [24]:
df.head(2)

Unnamed: 0,Id,url,title,html,content,keywords,LDA_T0,LDA_T1,LDA_T2,LDA_T3,...,NMF_K0,NMF_K1,NMF_K2,NMF_K3,NMF_K4,LSI_K0,LSI_K1,LSI_K2,LSI_K3,LSI_K4
0,1,http://mashable.com/2014/09/01/americans-held-...,Americans Held in North Korea Ask U.S. for Ass...,<!DOCTYPE html>\n<html data-env='production' l...,"PYONGYANG, North Korea — North Korea gave fore...","north-korea, uncategorized, us-world, world, d...",0.033334,0.033334,0.866665,0.033334,...,0.174287,0.0,0.0,0.000245,0.0,1.337091,-0.312889,-0.075335,0.014325,-0.025997
1,2,http://mashable.com/2014/09/01/apple-visa-mast...,"Apple Partners With Visa, MasterCard, AmEx for...",<!DOCTYPE html>\n<html data-env='production' l...,According to new reports from Bloomberg and ot...,"apple, visa, mobile-payments, american-express...",0.033334,0.033334,0.866664,0.033334,...,0.0,0.0,0.141952,0.0,0.16111,0.108035,0.157401,1.083466,-0.0854,0.860167


In [25]:
df.columns

Index(['Id', 'url', 'title', 'html', 'content', 'keywords', 'LDA_T0', 'LDA_T1',
       'LDA_T2', 'LDA_T3', 'LDA_T4', 'NMF_T0', 'NMF_T1', 'NMF_T2', 'NMF_T3',
       'NMF_T4', 'LSI_T0', 'LSI_T1', 'LSI_T2', 'LSI_T3', 'LSI_T4', 'LDA_C0',
       'LDA_C1', 'LDA_C2', 'LDA_C3', 'LDA_C4', 'NMF_C0', 'NMF_C1', 'NMF_C2',
       'NMF_C3', 'NMF_C4', 'LSI_C0', 'LSI_C1', 'LSI_C2', 'LSI_C3', 'LSI_C4',
       'LDA_K0', 'LDA_K1', 'LDA_K2', 'LDA_K3', 'LDA_K4', 'NMF_K0', 'NMF_K1',
       'NMF_K2', 'NMF_K3', 'NMF_K4', 'LSI_K0', 'LSI_K1', 'LSI_K2', 'LSI_K3',
       'LSI_K4'],
      dtype='object')

In [26]:
#writing the output
#kannan: we can remove url, title, content. just keep id and calculated columns. later we can join using 'id'
#writing the output
#kannan: we can remove url, title, content. just keep id and calculated columns. later we can join using 'id'
# Rebind instead of dropping in place, and ignore columns that are already gone:
# the in-place drop raised a KeyError when this cell was run a second time.
df = df.drop(columns=['url', 'title', 'content', 'html', 'keywords'], errors='ignore')
df.to_excel(out_filepath, index=False)

In [27]:
"""
#Just visualizing the n_components representation of rows/observations
print(lda_title[0])
print(nmf_title[0])
print(lsi_title[0])
"""

'\n#Just visualizing the n_components representation of rows/observations\nprint(lda_title[0])\nprint(nmf_title[0])\nprint(lsi_title[0])\n'

In [28]:
"""
#Informational (how to extract topics)
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)
 
print("NMF Model:")
print_topics(nmf_model, vectorizer)
print("=" * 20)
 
print("LSI Model:")
print_topics(lsi_model, vectorizer)
print("=" * 20)
"""

'\n#Informational (how to extract topics)\ndef print_topics(model, vectorizer, top_n=10):\n    for idx, topic in enumerate(model.components_):\n        print("Topic %d:" % (idx))\n        print([(vectorizer.get_feature_names()[i], topic[i])\n                        for i in topic.argsort()[:-top_n - 1:-1]])\nprint("LDA Model:")\nprint_topics(lda_model, vectorizer)\nprint("=" * 20)\n \nprint("NMF Model:")\nprint_topics(nmf_model, vectorizer)\nprint("=" * 20)\n \nprint("LSI Model:")\nprint_topics(lsi_model, vectorizer)\nprint("=" * 20)\n'

In [29]:
"""
def infer_components(model,vectorizer,text):
    c=model.transform(vectorizer.transform([text]))[0]
    return c.tolist()

def infer_components_all(models,vectorizer,text):
    components=[]
    for _,model in models:
        components.extend(infer_components(model,vectorizer,text))
    return components
""" 

'\ndef infer_components(model,vectorizer,text):\n    c=model.transform(vectorizer.transform([text]))[0]\n    return c.tolist()\n\ndef infer_components_all(models,vectorizer,text):\n    components=[]\n    for _,model in models:\n        components.extend(infer_components(model,vectorizer,text))\n    return components\n'

In [30]:
"""
text = "Kannan is a good boy. An example of new sentence"
x = lda_model.transform(vectorizer.transform([text]))[0]
print(x)
"""

'\ntext = "Kannan is a good boy. An example of new sentence"\nx = lda_model.transform(vectorizer.transform([text]))[0]\nprint(x)\n'

In [31]:
"""
infer_components(lda_model,vectorizer,text)
"""

'\ninfer_components(lda_model,vectorizer,text)\n'