In [1]:
# https://www.kaggle.com/datasets/thedevastator/new-dataset-for-text-classification-ag-news?resource=download

In [2]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import spacy
from pprint import pprint
import pandas as pd

from tqdm import tqdm

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from','subject','re','edu','use'])



In [3]:
df = pd.read_csv("data/corpus_agnews_train.csv")
df.head()

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


In [4]:
set(df["label"].tolist())

{0, 1, 2, 3}

In [5]:
def sent_to_words(sentences):
    for sentence in tqdm(sentences):
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [6]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [7]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in tqdm(texts):
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc])
    return texts_out

In [8]:
data = df["text"].tolist()
data_words = list(sent_to_words(data))

# Remove Stop Words
print("Start removing stop words")
data_words_nostops = remove_stopwords(data_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# conda install -c conda-forge spacy-model-en_core_web_sm
print("Installing spacy")
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
print("Start lemmatizing words")
#data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

100%|████████████████████████████████████████████████████████████████████████| 120000/120000 [00:14<00:00, 8130.09it/s]


Start removing stop words
Installing spacy


  0%|                                                                             | 19/120000 [00:00<11:19, 176.51it/s]

Start lemmatizing words


100%|█████████████████████████████████████████████████████████████████████████| 120000/120000 [06:30<00:00, 306.92it/s]


In [9]:
print(df["text"].tolist()[0])
print(data_lemmatized[0])

Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
['wall', 'st', 'bear', 'claw', 'back', 'black', 'reuter', 'reuter', 'short', 'seller', 'wall', 'street', 'dwindle', 'band', 'ultra', 'cynic', 'see', 'green']


In [10]:
data_lemmatized_min_length = []

for sublist in tqdm(data_lemmatized):
    # Use a list comprehension to filter out strings with less than two characters
    sublist = [word for word in sublist if len(word) > 2]
    data_lemmatized_min_length.append(sublist)

100%|██████████████████████████████████████████████████████████████████████| 120000/120000 [00:00<00:00, 243830.90it/s]


In [11]:
print(df["text"].tolist()[0])
print(data_lemmatized[0])
print(data_lemmatized_min_length[0])

Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
['wall', 'st', 'bear', 'claw', 'back', 'black', 'reuter', 'reuter', 'short', 'seller', 'wall', 'street', 'dwindle', 'band', 'ultra', 'cynic', 'see', 'green']
['wall', 'bear', 'claw', 'back', 'black', 'reuter', 'reuter', 'short', 'seller', 'wall', 'street', 'dwindle', 'band', 'ultra', 'cynic', 'see', 'green']


In [15]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized_min_length)

# Create Corpus
texts = data_lemmatized_min_length

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View 
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2)]]


### Topic Models

#### Vector Space Model (VSM)

In [16]:
from scipy.sparse import dok_matrix

# Define function to convert Gensim corpus to a sparse pandas DataFrame
def corpus_to_sparse_dataframe(corpus):
    word_freq = dok_matrix((len(corpus), len(id2word)), dtype=int)

    for i, doc in enumerate(corpus):
        for word_id, freq in doc:
            word_freq[i, word_id] = freq

    dataframe = pd.DataFrame.sparse.from_spmatrix(word_freq)
    dataframe.columns = [id2word[word_id] for word_id in range(len(id2word))]
    return dataframe

In [17]:
VSM = corpus_to_sparse_dataframe(corpus)

#### Vector Space Model & tf-idf (VSM & tf-idf)

In [18]:
from gensim.models import TfidfModel

model = TfidfModel(corpus)  # fit model
tfidf_corpus = model[corpus]

#### Latent Semantic Indexing (LSI)

In [19]:
from gensim.models import LsiModel

K = 8
lsi_model = LsiModel(corpus, id2word=id2word, num_topics=K)
lsi_model.print_topics(num_topics=K, num_words=10)

[(0,
  '0.404*"reuter" + 0.378*"say" + 0.284*"new" + 0.174*"stock" + 0.163*"fullquote" + 0.139*"year" + 0.117*"york" + 0.113*"oil" + 0.111*"com" + 0.110*"company"'),
 (1,
  '-0.365*"reuter" + 0.338*"say" + -0.325*"fullquote" + -0.262*"stock" + -0.184*"investor" + -0.173*"http" + -0.172*"href" + -0.172*"www" + -0.171*"ticker" + 0.168*"quot"'),
 (2,
  '0.624*"new" + -0.504*"say" + -0.226*"reuter" + 0.189*"york" + 0.132*"oil" + -0.132*"kill" + 0.117*"price" + -0.110*"iraq" + 0.104*"game" + 0.082*"high"'),
 (3,
  '0.559*"oil" + 0.408*"price" + -0.337*"new" + -0.240*"quot" + 0.193*"high" + 0.136*"rise" + -0.128*"fullquote" + 0.119*"year" + 0.112*"crude" + 0.098*"fall"'),
 (4,
  '-0.361*"quot" + -0.351*"say" + -0.332*"new" + 0.251*"year" + -0.218*"oil" + 0.215*"game" + 0.205*"first" + 0.177*"win" + 0.175*"two" + 0.172*"reuter"'),
 (5,
  '0.667*"quot" + -0.396*"new" + -0.214*"reuter" + 0.152*"year" + -0.150*"say" + -0.145*"kill" + 0.128*"company" + -0.119*"york" + 0.114*"microsoft" + -0.108*"

#### Latent Semantic Indexing & tf-idf (LSI & tf-idf)

In [20]:
from gensim.models import LsiModel

K = 8
tfidf_lsi_model = LsiModel(tfidf_corpus, id2word=id2word, num_topics=K)
tfidf_lsi_model.print_topics(num_topics=K, num_words=10)

[(0,
  '0.201*"reuter" + 0.176*"oil" + 0.151*"stock" + 0.149*"new" + 0.147*"say" + 0.142*"price" + 0.108*"year" + 0.104*"high" + 0.096*"fullquote" + 0.095*"rise"'),
 (1,
  '-0.289*"oil" + -0.280*"stock" + -0.225*"fullquote" + -0.218*"price" + 0.156*"kill" + -0.147*"investor" + 0.138*"iraq" + -0.138*"reuter" + -0.119*"profit" + -0.114*"rise"'),
 (2,
  '0.449*"oil" + 0.269*"price" + -0.197*"microsoft" + -0.179*"fullquote" + 0.156*"iraq" + 0.143*"crude" + 0.136*"kill" + -0.122*"com" + 0.117*"barrel" + -0.115*"software"'),
 (3,
  '-0.303*"fullquote" + -0.250*"kill" + -0.210*"reuter" + 0.196*"oil" + -0.181*"iraq" + -0.152*"quickinfo" + -0.152*"aspx" + -0.151*"href" + -0.151*"http" + -0.150*"baghdad"'),
 (4,
  '-0.320*"microsoft" + 0.210*"game" + 0.157*"sox" + 0.155*"win" + -0.152*"search" + 0.143*"red" + -0.137*"window" + -0.136*"software" + -0.128*"google" + 0.114*"series"'),
 (5,
  '-0.237*"olympic" + -0.195*"athen" + 0.190*"bush" + -0.187*"microsoft" + -0.186*"gold" + -0.160*"medal" + -0

#### Non-Negative Matrix Factorization (NMF)

In [21]:
# https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/nmf_tutorial.ipynb
from gensim.models.nmf import Nmf

K = 8
nmf_model = Nmf(corpus, id2word=id2word, num_topics=K)
nmf_model.show_topics(num_topics=K, num_words=10)

[(0,
  '0.027*"year" + 0.010*"two" + 0.009*"one" + 0.008*"first" + 0.007*"last" + 0.006*"kill" + 0.006*"three" + 0.005*"week" + 0.005*"time" + 0.005*"report"'),
 (1,
  '0.046*"quot" + 0.041*"game" + 0.013*"state" + 0.011*"reuter" + 0.008*"team" + 0.007*"united" + 0.007*"olympic" + 0.007*"world" + 0.006*"athen" + 0.005*"play"'),
 (2,
  '0.065*"oil" + 0.050*"price" + 0.026*"high" + 0.023*"stock" + 0.017*"rise" + 0.013*"crude" + 0.013*"fall" + 0.010*"barrel" + 0.010*"record" + 0.010*"low"'),
 (3,
  '0.092*"new" + 0.025*"york" + 0.004*"search" + 0.004*"yankee" + 0.004*"service" + 0.004*"announce" + 0.003*"launch" + 0.003*"system" + 0.003*"microsoft" + 0.003*"technology"'),
 (4,
  '0.042*"say" + 0.015*"company" + 0.013*"reuter" + 0.009*"microsoft" + 0.009*"inc" + 0.008*"corp" + 0.007*"plan" + 0.007*"million" + 0.007*"service" + 0.006*"software"'),
 (5,
  '0.034*"iraq" + 0.034*"say" + 0.018*"kill" + 0.015*"iraqi" + 0.012*"baghdad" + 0.010*"quot" + 0.009*"official" + 0.008*"afp" + 0.008*"forc

#### Non-Negative Matrix Factorization & tf-idf (NMF & tf-idf)

In [22]:
# https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/nmf_tutorial.ipynb
from gensim.models.nmf import Nmf

K = 8
tfidf_nmf_model = Nmf(tfidf_corpus, id2word=id2word, num_topics=K)
tfidf_nmf_model.show_topics(num_topics=K, num_words=10)

[(0,
  '0.048*"google" + 0.035*"search" + 0.016*"share" + 0.013*"engine" + 0.011*"inc" + 0.010*"public" + 0.010*"yahoo" + 0.010*"web" + 0.009*"internet" + 0.008*"desktop"'),
 (1,
  '0.036*"fullquote" + 0.028*"reuter" + 0.025*"stock" + 0.019*"investor" + 0.019*"http" + 0.019*"href" + 0.018*"www" + 0.018*"aspx" + 0.018*"quickinfo" + 0.018*"ticker"'),
 (2,
  '0.057*"oil" + 0.038*"price" + 0.018*"crude" + 0.017*"high" + 0.015*"barrel" + 0.014*"stock" + 0.012*"supply" + 0.010*"rise" + 0.010*"fall" + 0.010*"record"'),
 (3,
  '0.021*"iraq" + 0.019*"kill" + 0.015*"iraqi" + 0.014*"baghdad" + 0.008*"bomb" + 0.008*"najaf" + 0.008*"militant" + 0.007*"attack" + 0.007*"troop" + 0.007*"hostage"'),
 (4,
  '0.035*"gold" + 0.023*"olympic" + 0.023*"medal" + 0.016*"athen" + 0.009*"woman" + 0.008*"hamm" + 0.008*"phelp" + 0.007*"gaza" + 0.006*"man" + 0.006*"palestinian"'),
 (5,
  '0.013*"game" + 0.007*"sox" + 0.007*"red" + 0.006*"win" + 0.006*"series" + 0.005*"team" + 0.005*"first" + 0.005*"league" + 0.005*

#### Latent Dirichlet Allocation (LDA)

In [23]:
# Build LDA model
K = 8
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=K,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=400,
                                           passes=30,
                                           alpha='auto',
                                           per_word_topics=True)

# Print the Keywords in the 3 topics
pprint(lda_model.print_topics())

KeyboardInterrupt: 