# Example: news data 

> Download the dataset from https://www.kaggle.com/keitazoumana/abcnewsdata and save it as `data/news-data.csv` (the path the cells below read).

## LDA Topic Modelling With Gensim

In [1]:
import pandas as pd
import gensim
from sklearn.feature_extraction.text import CountVectorizer

# Load the news dataset from the local CSV; later cells read its
# `headline_text` column.
documents = pd.read_csv('data/news-data.csv')
 
# Preview the first rows to confirm the file loaded.
documents.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/news-data.csv'

In [None]:
# Build a bag-of-words vectorizer (CountVectorizer) that:
#   * keeps only tokens made of at least three word characters,
#   * drops English stop words,
#   * drops tokens that appear in fewer than 20 documents,
#   * drops tokens that appear in more than 20% of the documents.
vect = CountVectorizer(
    token_pattern='(?u)\\b\\w\\w\\w+\\b',
    stop_words='english',
    min_df=20,
    max_df=0.2,
)

In [None]:
# Learn the vocabulary from the headlines and build the term-count matrix.
X = vect.fit_transform(documents["headline_text"])

In [None]:
# Inspect the matrix produced by the vectorizer in the previous cell.
X

In [None]:
# Convert the sparse matrix to a gensim corpus.
# documents_columns=False tells gensim that documents are NOT stored as the
# columns of X (presumably one row per headline — confirm against X's shape).
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

In [None]:
# Invert the vectorizer's word -> id vocabulary into an id -> word lookup
# (passed to the LDA model through its `id2word` parameter).
id_map = {word_id: word for word, word_id in vect.vocabulary_.items()}

In [None]:
# Estimate the LDA model parameters on the corpus with gensim's
# LdaMulticore constructor and keep the result in `ldamodel`.
ldamodel = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id_map,
    num_topics=10,
    passes=2,
    workers=3,
    random_state=5,
)

In [None]:
# Print each topic id with its word summary (-1 is passed as the number of topics).
for topic_id, topic_words in ldamodel.print_topics(num_topics=-1):
    print("Topic: {} \nWords: {}".format(topic_id, topic_words))
    print("\n")

In [None]:
# Pick the headline stored under row label 17 as a sample document.
my_document = documents.loc[17, "headline_text"]
my_document

In [None]:
def topic_distribution(string_input):
    """Return the LDA model's output for a single piece of text.

    The text is vectorized with the module-level `vect`, wrapped as a
    one-document gensim corpus, and passed through the module-level
    `ldamodel`.

    Parameters
    ----------
    string_input : str
        Raw text of the document to score.

    Returns
    -------
    The first (and only) item produced by `ldamodel[corpus]` for the
    one-document corpus.
    """
    # Transform only (no fitting): assumes `vect` is the already-fitted
    # vectorizer that `ldamodel` was trained with.
    doc_term_matrix = vect.transform([string_input])

    # Wrap the single-row matrix as a gensim corpus.
    single_doc_corpus = gensim.matutils.Sparse2Corpus(
        doc_term_matrix, documents_columns=False
    )

    per_document_output = list(ldamodel[single_doc_corpus])
    return per_document_output[0]

# Topic distribution of the sample headline selected in the previous cell.
topic_distribution(my_document)

In [None]:
# Infer topics for an ad-hoc query. Tokens missing from the fitted vocabulary
# are silently dropped by the vectorizer, so the query must be spelled correctly.
topic_distribution("elections start tomorrow")

## Topic Modelling With NMF

In [None]:
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
 
# Reads the same CSV path as the LDA section above; this rebinds `documents`.
documents = pd.read_csv('data/news-data.csv')
documents.head()

In [None]:
# TF-IDF vectorizer: English stop words are removed and tokens that appear
# in fewer than 50 documents are discarded.
# NOTE: this rebinds `vect` (read as a global by `topic_distribution`) and `X`
# from the LDA section; re-run the LDA cells before reusing them.
vect = TfidfVectorizer(stop_words='english', min_df=50)

# Learn the vocabulary and build the TF-IDF matrix in one step.
X = vect.fit_transform(documents["headline_text"])

In [None]:
# NMF instance with 10 components; each component is treated as one topic.
model = NMF(n_components=10, init="nndsvd", random_state=5)

# Learn the components from the TF-IDF matrix.
model.fit(X)

# Map every document onto the learned components: nmf_features
nmf_features = model.transform(X)

In [None]:
# TF-IDF dimensions: (number of documents, number of vocabulary terms).
X.shape

In [None]:
# NMF feature dimensions: (number of documents, number of topics).
nmf_features.shape

In [None]:
# Component dimensions: (number of topics, number of vocabulary terms).
model.components_.shape

In [None]:
# Tabulate the NMF components: one row per topic, one column per vocabulary term.
components_df = pd.DataFrame(
    data=model.components_,
    columns=vect.get_feature_names_out(),
)
components_df

In [None]:
# For every topic row, list the ten terms with the largest weights.
for topic, (row_label, term_weights) in enumerate(components_df.iterrows()):
    print(f'For topic {topic+1} the words with the highest value are:')
    print(term_weights.nlargest(10))
    print('\n')

In [None]:
# Pick the headline stored under row label 55; the next cells look up the
# NMF features of this same row.
my_document = documents.loc[55, "headline_text"]
my_document

In [None]:
# Topic weights of row 55 (the headline shown in the previous cell).
pd.DataFrame(nmf_features).loc[55]

In [None]:
# Column label (topic index) holding the largest weight for row 55.
pd.DataFrame(nmf_features).loc[55].idxmax()

In [None]:
# Dominant topic of every row, then the number of rows per dominant topic.
pd.DataFrame(nmf_features).idxmax(axis=1).value_counts()

In [None]:
# New, unseen text to score with the fitted NMF model.
my_news = """15-year-old girl stabbed to death in grocery store during fight with 4 younger girls
Authorities said they gathered lots of evidence from videos on social media"""
 
# Vectorize the new text with the already-fitted vectorizer (transform only).
# NOTE: this rebinds `X`, replacing the corpus-level matrix.
X = vect.transform([my_news])

# Map the new text onto the NMF components.
# NOTE: this rebinds `nmf_features`; earlier cells that index it (e.g. `.loc[55]`)
# will no longer see the corpus-level features if re-run after this cell.
nmf_features = model.transform(X)
 
pd.DataFrame(nmf_features)

In [None]:
# Column label (topic index) with the largest weight for the new text.
pd.DataFrame(nmf_features).idxmax(axis=1)