Kannan: decomposing the textual features (title, content and keywords) and representing each of them in n components using LDA, NMF and LSI.

In [1]:
#Author: kannan / extracting LDA, NMF & LSI features
import pandas as pd
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

from datetime import datetime

#### Reading from extracted content file

In [2]:
# Workbooks read by this notebook (article content, per-article keywords)
# and the workbook the computed features are written to.
in_filepath = '../data/output/SS_Extracted_content.xlsx'
in_filepath_kw = '../data/output/2_keywords_list.xlsx'
out_filepath = '../data/output/2_NLP_LDA_NMF_LSI.xlsx'

# Load both input workbooks into DataFrames.
df, df_kw = (pd.read_excel(path) for path in (in_filepath, in_filepath_kw))

df.head()

Unnamed: 0,Id,url,title,html,content
0,1,http://mashable.com/2014/09/01/americans-held-...,Americans Held in North Korea Ask U.S. for Ass...,<!DOCTYPE html>\n<html data-env='production' l...,"PYONGYANG, North Korea — North Korea gave fore..."
1,2,http://mashable.com/2014/09/01/apple-visa-mast...,"Apple Partners With Visa, MasterCard, AmEx for...",<!DOCTYPE html>\n<html data-env='production' l...,According to new reports from Bloomberg and ot...
2,3,http://mashable.com/2014/09/01/aussie-football...,Aussie Football Players Dress Up as Rolf Harri...,<!DOCTYPE html>\n<html data-env='production' l...,SYDNEY — Two Australian football players are a...
3,4,http://mashable.com/2014/09/01/australia-gover...,Australia Government Buys Bombproof BMW Fleet ...,<!DOCTYPE html>\n<html data-env='production' l...,SYDNEY — The Australia Prime Minister has orde...
4,5,http://mashable.com/2014/09/01/australia-jane-...,Author Compares Women in Traditional Marriages...,<!DOCTYPE html>\n<html data-env='production' l...,"SYDNEY — On Australian panel show Q&A, author ..."


In [3]:
# Preview the first rows of the keywords table loaded from in_filepath_kw.
df_kw.head()

Unnamed: 0,Id,keywords
0,1,"north-korea, uncategorized, us-world, world, d..."
1,2,"apple, visa, mobile-payments, american-express..."
2,3,"australia, uncategorized, us-world, sports, ma..."
3,4,"australia, bmw, uncategorized, us-world, tony-..."
4,5,"q-a, australia, uncategorized, tv, us-world, j..."


In [4]:
# Left join on 'Id': every article row is kept and gains the columns of df_kw.
df = df.merge(df_kw, on='Id', how='left')

In [5]:
# Preview the merged frame (article columns plus the 'keywords' column).
df.head()

Unnamed: 0,Id,url,title,html,content,keywords
0,1,http://mashable.com/2014/09/01/americans-held-...,Americans Held in North Korea Ask U.S. for Ass...,<!DOCTYPE html>\n<html data-env='production' l...,"PYONGYANG, North Korea — North Korea gave fore...","north-korea, uncategorized, us-world, world, d..."
1,2,http://mashable.com/2014/09/01/apple-visa-mast...,"Apple Partners With Visa, MasterCard, AmEx for...",<!DOCTYPE html>\n<html data-env='production' l...,According to new reports from Bloomberg and ot...,"apple, visa, mobile-payments, american-express..."
2,3,http://mashable.com/2014/09/01/aussie-football...,Aussie Football Players Dress Up as Rolf Harri...,<!DOCTYPE html>\n<html data-env='production' l...,SYDNEY — Two Australian football players are a...,"australia, uncategorized, us-world, sports, ma..."
3,4,http://mashable.com/2014/09/01/australia-gover...,Australia Government Buys Bombproof BMW Fleet ...,<!DOCTYPE html>\n<html data-env='production' l...,SYDNEY — The Australia Prime Minister has orde...,"australia, bmw, uncategorized, us-world, tony-..."
4,5,http://mashable.com/2014/09/01/australia-jane-...,Author Compares Women in Traditional Marriages...,<!DOCTYPE html>\n<html data-env='production' l...,"SYDNEY — On Australian panel show Q&A, author ...","q-a, australia, uncategorized, tv, us-world, j..."


In [6]:
# (rows, columns) of the merged frame.
df.shape

(7795, 6)

#### feature extraction - decomposition

In [7]:
# Number of components requested from each decomposition (passed as n_components
# to LDA, NMF and TruncatedSVD) and the number of feature columns created per
# model/field combination.
NUM_TOPICS=10

In [8]:
# Title of the first row (swap 'title' for 'content' to look at the article body instead).
df['title'].iloc[0]

'Americans Held in North Korea Ask U.S. for Assistance'

In [9]:
# Build exactly one string per row of df, in row order.
# - Missing titles become '' instead of the literal token 'nan' that
#   .astype('U') produces for NaN.
# - Nothing is filtered out: dropping entries here would shift every later row
#   relative to df and attach topic features to the wrong Id further down.
titles = df['title'].fillna('').astype(str).tolist()
titles[0:5]

['Americans Held in North Korea Ask U.S. for Assistance',
 'Apple Partners With Visa, MasterCard, AmEx for iPhone 6 Payments: Reports',
 'Aussie Football Players Dress Up as Rolf Harris and Victim',
 'Australia Government Buys Bombproof BMW Fleet for G20 Summit',
 'Author Compares Women in Traditional Marriages to Prostitutes on TV Panel Show']

In [10]:
# Build exactly one string per row of df, in row order.
# - Missing content becomes '' instead of the literal token 'nan' that
#   .astype('U') produces for NaN.
# - Nothing is filtered out: dropping entries here would shift every later row
#   relative to df and attach topic features to the wrong Id further down.
content = df['content'].fillna('').astype(str).tolist()
# content[0:2] shows the first two article bodies (long output, so not displayed).

In [11]:
# Build exactly one string per row of df, in row order.
# - Rows without keywords (NaN after the left merge) become '' instead of the
#   literal token 'nan' that .astype('U') produces for NaN.
# - Nothing is filtered out: dropping entries here would shift every later row
#   relative to df and attach topic features to the wrong Id further down.
keywords = df['keywords'].fillna('').astype(str).tolist()
keywords[0:5]

['north-korea, uncategorized, us-world, world, detainees, matthew-miller, jeffrey-fowle',
 'apple, visa, mobile-payments, american-express, mastercard, uncategorized, business, apps-software, mobile',
 'australia, uncategorized, us-world, sports, mad-monday, afl',
 'australia, bmw, uncategorized, us-world, tony-abbott',
 'q-a, australia, uncategorized, tv, us-world, jane-caro']

In [12]:
%%time
#count vectorizing the data
#reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
def _make_vectorizer():
    """Return an unfitted CountVectorizer with the settings shared by all three text fields.

    Tokens are runs of at least three letters/hyphens; English stop words are
    removed; terms in fewer than 5 documents or in more than 95% of documents
    are ignored.
    """
    return CountVectorizer(analyzer='word', min_df=5, max_df=0.95,
                           stop_words='english', lowercase=True, encoding='utf-8',
                           # raw string: '\-' is an invalid escape in a normal string literal
                           token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')

# One vectorizer per field: fit_transform() replaces the learned vocabulary, so
# re-fitting a single instance would leave only the keyword vocabulary available
# for looking up topic words or transforming new text.
title_vectorizer = _make_vectorizer()
content_vectorizer = _make_vectorizer()
keyword_vectorizer = _make_vectorizer()

title_vectorized = title_vectorizer.fit_transform(titles)
content_vectorized = content_vectorizer.fit_transform(content)
keyword_vectorized = keyword_vectorizer.fit_transform(keywords)

# Backward compatibility: later cells refer to `vectorizer`, which previously
# ended up fitted on the keywords (the last fit_transform call).
vectorizer = keyword_vectorizer

Wall time: 2.81 s


In [13]:
%%time
# Build Latent Dirichlet Allocation models
def _make_lda():
    """Return an unfitted LDA estimator with NUM_TOPICS components.

    random_state is fixed so that Restart & Run All reproduces the same topics.
    """
    return LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10,
                                     learning_method='online', random_state=42)

# One estimator per field: fit_transform() overwrites components_, so re-fitting a
# single instance would keep only the keyword topics.
lda_title_model = _make_lda()
lda_content_model = _make_lda()
lda_keyword_model = _make_lda()

lda_title = lda_title_model.fit_transform(title_vectorized)
lda_content = lda_content_model.fit_transform(content_vectorized)
lda_keyword = lda_keyword_model.fit_transform(keyword_vectorized)

# Backward compatibility: later cells use `lda_model`, which previously ended up
# fitted on the keyword matrix (the last fit_transform call).
lda_model = lda_keyword_model
print(lda_title.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(7795, 10)
Wall time: 1min 7s


In [14]:
%%time
# Build Non-Negative Matrix Factorization models
def _make_nmf():
    """Return an unfitted NMF estimator with NUM_TOPICS components.

    random_state is fixed so that Restart & Run All reproduces the same factors.
    """
    return NMF(n_components=NUM_TOPICS, random_state=42)

# One estimator per field: fit_transform() overwrites components_, so re-fitting a
# single instance would keep only the keyword factorization.
nmf_title_model = _make_nmf()
nmf_content_model = _make_nmf()
nmf_keyword_model = _make_nmf()

nmf_title = nmf_title_model.fit_transform(title_vectorized)
nmf_content = nmf_content_model.fit_transform(content_vectorized)
nmf_keyword = nmf_keyword_model.fit_transform(keyword_vectorized)

# Backward compatibility: later cells use `nmf_model`, which previously ended up
# fitted on the keyword matrix (the last fit_transform call).
nmf_model = nmf_keyword_model
print(nmf_title.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(7795, 10)
Wall time: 3.87 s


In [15]:
%%time
# Build Latent Semantic Indexing models (truncated SVD of the count matrices)
def _make_lsi():
    """Return an unfitted TruncatedSVD estimator with NUM_TOPICS components.

    random_state is fixed so that Restart & Run All reproduces the same components.
    """
    return TruncatedSVD(n_components=NUM_TOPICS, random_state=42)

# One estimator per field: fit_transform() overwrites components_, so re-fitting a
# single instance would keep only the keyword decomposition.
lsi_title_model = _make_lsi()
lsi_content_model = _make_lsi()
lsi_keyword_model = _make_lsi()

lsi_title = lsi_title_model.fit_transform(title_vectorized)
lsi_content = lsi_content_model.fit_transform(content_vectorized)
lsi_keyword = lsi_keyword_model.fit_transform(keyword_vectorized)

# Backward compatibility: later cells use `lsi_model`, which previously ended up
# fitted on the keyword matrix (the last fit_transform call).
lsi_model = lsi_keyword_model
print(lsi_title.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(7795, 10)
Wall time: 661 ms


In [16]:
# (label, feature matrix) pairs. Despite the name, the second element is the
# document-by-component array returned by fit_transform, not a fitted estimator.
# The first five characters of each label become the output column prefix.
models = [
    ('LDA_Title', lda_title),
    ('NMF_Title', nmf_title),
    ('LSI_Title', lsi_title),
    ('LDA_Content', lda_content),
    ('NMF_Content', nmf_content),
    ('LSI_Content', lsi_content),
    ('LDA_Keyword', lda_keyword),
    ('NMF_Keyword', nmf_keyword),
    ('LSI_Keyword', lsi_keyword),
]

In [17]:
# Attach every decomposition matrix to df as NUM_TOPICS numeric columns.
#
# The matrices were computed from df's rows in order, so they are aligned to df
# by row position. Rebuilding 'Id' as "row position + 1" and merging on it only
# works when the Ids happen to be exactly 1..N in file order.
feature_frames = []
for name, model in models:
    # Column prefix is the first five characters of the label: 'LDA_T', 'NMF_C', 'LSI_K', ...
    cols = [name[:5] + str(i) for i in range(NUM_TOPICS)]
    if model.shape[0] != len(df):
        # A length mismatch means rows were dropped upstream; fail loudly rather
        # than attach features to the wrong articles.
        raise ValueError(
            "%s has %d rows but df has %d; features cannot be aligned"
            % (name, model.shape[0], len(df))
        )
    feature_frames.append(pd.DataFrame(model, columns=cols, index=df.index))

features = pd.concat(feature_frames, axis=1)
# Drop any feature columns left from a previous run of this cell so that
# re-running it replaces them instead of producing duplicate columns.
df = pd.concat([df.drop(columns=features.columns, errors='ignore'), features], axis=1)

In [18]:
# Preview the frame with the decomposition feature columns appended.
df.head()

Unnamed: 0,Id,url,title,html,content,keywords,LDA_T0,LDA_T1,LDA_T2,LDA_T3,...,LSI_K0,LSI_K1,LSI_K2,LSI_K3,LSI_K4,LSI_K5,LSI_K6,LSI_K7,LSI_K8,LSI_K9
0,1,http://mashable.com/2014/09/01/americans-held-...,Americans Held in North Korea Ask U.S. for Ass...,<!DOCTYPE html>\n<html data-env='production' l...,"PYONGYANG, North Korea — North Korea gave fore...","north-korea, uncategorized, us-world, world, d...",0.016667,0.85,0.016667,0.016667,...,1.337092,-0.31289,-0.075334,0.01434,-0.026191,-0.018709,0.148285,-0.247328,0.032311,0.013983
1,2,http://mashable.com/2014/09/01/apple-visa-mast...,"Apple Partners With Visa, MasterCard, AmEx for...",<!DOCTYPE html>\n<html data-env='production' l...,According to new reports from Bloomberg and ot...,"apple, visa, mobile-payments, american-express...",0.016667,0.016667,0.016667,0.85,...,0.108039,0.157399,1.083487,-0.085303,0.859357,-0.107519,-0.243842,-0.126799,-0.1846,-0.059431
2,3,http://mashable.com/2014/09/01/aussie-football...,Aussie Football Players Dress Up as Rolf Harri...,<!DOCTYPE html>\n<html data-env='production' l...,SYDNEY — Two Australian football players are a...,"australia, uncategorized, us-world, sports, ma...",0.016667,0.016667,0.016667,0.016667,...,0.866041,-0.128184,-0.062708,0.121394,-0.041246,-0.083225,-0.559973,0.589085,-0.031191,-0.294628
3,4,http://mashable.com/2014/09/01/australia-gover...,Australia Government Buys Bombproof BMW Fleet ...,<!DOCTYPE html>\n<html data-env='production' l...,SYDNEY — The Australia Prime Minister has orde...,"australia, bmw, uncategorized, us-world, tony-...",0.02,0.62,0.02,0.02,...,0.845023,-0.20562,-0.058375,-0.006827,-0.049797,-0.086334,-0.558895,0.561352,-0.051666,0.00198
4,5,http://mashable.com/2014/09/01/australia-jane-...,Author Compares Women in Traditional Marriages...,<!DOCTYPE html>\n<html data-env='production' l...,"SYDNEY — On Australian panel show Q&A, author ...","q-a, australia, uncategorized, tv, us-world, j...",0.033333,0.033333,0.7,0.033333,...,0.841904,-0.205212,-0.057965,-0.006457,-0.049351,-0.085909,-0.551905,0.554571,-0.051252,0.002833


In [19]:
#writing the output
#kannan: we can remove url, title, content. just keep id and calculated columns. later we can join using 'id'
text_columns = ['url', 'title','content','html','keywords']
# Keep a full copy only while the text columns are still present, so re-running
# this cell does not overwrite the backup with the already-reduced frame.
if set(text_columns) & set(df.columns):
    df_bak = df.copy()
# errors='ignore' makes the cell re-runnable: an in-place drop raised KeyError
# on the second run because the columns were already gone.
df = df.drop(columns=text_columns, errors='ignore')
df.to_excel(out_filepath, index=False)

In [178]:
# First title, for comparison with its component vectors printed in the next cell.
# `data` is not defined anywhere in this notebook (it raised NameError on a fresh
# kernel); the value shown is the first entry of `titles`.
titles[0]

'Americans Held in North Korea Ask U.S. for Assistance'

In [179]:
#Just visualizing the n_components representation of rows/observations
# First row of the title features from LDA, NMF and LSI, in that order.
for components in (lda_title, nmf_title, lsi_title):
    print(components[0])

[0.01666667 0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
 0.01666667 0.01666667 0.85       0.01666667]
[0.         0.         0.00034471 0.00541244 0.         0.
 0.03228402 0.00084339 0.         0.00647047]
[ 0.00744652  0.00841603  0.01554511  0.07514149  0.03949661  0.01724751
  0.13662781 -0.03699941 -0.06677146  0.14270257]


In [180]:
#Informational (how to extract topics)
def print_topics(model, vectorizer, top_n=10):
    """Print the top_n highest-weighted terms of every component of a fitted model.

    Parameters
    ----------
    model : fitted decomposition estimator exposing ``components_``
        (one row of term weights per component).
    vectorizer : fitted vectorizer whose vocabulary was used to build the matrix
        the model was fitted on. Must provide ``get_feature_names_out()`` or the
        older ``get_feature_names()``.
    top_n : int, number of terms printed per component.

    Raises
    ------
    ValueError
        If the vectorizer's vocabulary size differs from a component's length,
        i.e. the vectorizer does not belong to this model.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
    # and fall back for older versions. Fetched once instead of once per term.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        if len(topic) != len(feature_names):
            raise ValueError(
                "vectorizer has %d terms but topic %d has %d weights; "
                "pass the vectorizer the model was fitted with"
                % (len(feature_names), idx, len(topic))
            )
        print("Topic %d:" % (idx))
        # argsort is ascending, so walk it backwards to get the largest weights first
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])
# Print the top terms of each fitted model, followed by a separator line.
for heading, fitted_model in (
    ("LDA Model:", lda_model),
    ("NMF Model:", nmf_model),
    ("LSI Model:", lsi_model),
):
    print(heading)
    print_topics(fitted_model, vectorizer)
    print("=" * 20)


LDA Model:
Topic 0:
[('said', 7668.455564861905), ('ebola', 2068.384128882067), ('people', 1988.0129055458444), ('government', 1800.7081231436775), ('state', 1724.271322225878), ('according', 1640.177380226889), ('president', 1611.75055422183), ('told', 1386.8573065940584), ('new', 1315.8544347861605), ('obama', 1295.7054856725022)]
Topic 1:
[('company', 3414.7197332322576), ('people', 3363.0815403441798), ('new', 3226.545453096199), ('like', 2987.4984721971814), ('time', 2797.6119190493864), ('said', 2517.130044903668), ('make', 2328.4810049249354), ('says', 2262.236895724618), ('just', 2249.382635741243), ('work', 2213.3649747501954)]
Topic 2:
[('mashable', 3181.5885044763495), ('com', 2484.432637360426), ('https', 2311.978687743325), ('video', 1807.0453678507201), ('youtube', 1535.0940393989092), ('music', 1413.9407911664064), ('file', 1280.6440293306855), ('aws', 1191.6966989047423), ('vdist', 1189.9620846358719), ('null', 1164.2373367265245)]
Topic 3:
[('fail', 2609.5887839988195)

[('images', 0.4311959536639149), ('getty', 0.40052456613911963), ('mashable', 0.23051172740123893), ('https', 0.19219659914653042), ('com', 0.14672532213326772), ('new', 0.1362833842295376), ('file', 0.12726212613096938), ('aws', 0.1271379768740413), ('vdist', 0.1268867678107286), ('null', 0.12373193077531718)]
Topic 8:
[('police', 0.3004195777844155), ('hong', 0.22111829599383037), ('kong', 0.2172126204731776), ('protesters', 0.20047836087031934), ('press', 0.19904839316854223), ('associated', 0.1914877288116558), ('com', 0.12158778860348063), ('https', 0.11300940916914255), ('ferguson', 0.11106334369732355), ('iphone', 0.10395009550516253)]
Topic 9:
[('police', 0.23511765770998302), ('christina', 0.17988949067537707), ('ascani', 0.17879481007848647), ('hong', 0.17369774584573408), ('kong', 0.1704758247967365), ('protesters', 0.16995749227559706), ('twitter', 0.14207359170368328), ('october', 0.13064096638011274), ('children', 0.12004829753487772), ('job', 0.1197097943685944)]


In [181]:
def infer_components(model,vectorizer,text):
    """Return the component weights of a single text as a plain Python list.

    The text is vectorized with ``vectorizer.transform`` and then projected with
    ``model.transform``; the single resulting row is converted with ``tolist()``.
    """
    doc_term = vectorizer.transform([text])
    projected = model.transform(doc_term)
    return projected[0].tolist()

def infer_components_all(models,vectorizer,text):
    """Concatenate the component weights of one text across several models.

    ``models`` is iterated as (label, model) pairs; the label is ignored and each
    model's weights from infer_components are appended in iteration order.
    The same ``vectorizer`` is used for every model.
    """
    return [
        weight
        for _label, fitted in models
        for weight in infer_components(fitted, vectorizer, text)
    ]
    

In [182]:
# Project an unseen sentence onto the LDA components: vectorize it first, then
# transform the single-row count matrix and take that row.
text = "Kannan is a good boy. An example of new sentence"
text_counts = vectorizer.transform([text])
x = lda_model.transform(text_counts)[0]
print(x)

[0.20151986 0.3781287  0.01666953 0.01666905 0.30366269 0.01667154
 0.01666724 0.0166676  0.01667179 0.01667201]


In [183]:
# Same projection as the previous cell, via the helper; the weights come back as a plain list.
infer_components(lda_model,vectorizer,text)

[0.2015198577492688,
 0.3781287021439153,
 0.016669531964861407,
 0.016669050844705623,
 0.30366269018905556,
 0.01667153993211631,
 0.016667237159911853,
 0.01666759513747562,
 0.016671787216513707,
 0.016672007662175886]

In [185]:
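# Left disabled: `models` (defined above) holds (label, feature-matrix) pairs, not
# fitted estimators, so model.transform() inside infer_components would fail on them.
# Pass a list of (label, fitted estimator) pairs instead before enabling this call.
#infer_components_all(models,vectorizer,text)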