In [1]:
import numpy as np
import json
import glob
import pandas as pd

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim

import warnings

import nltk

# Hide DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Fetch the NLTK stop-word corpus used to build `stoplist` below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# English stop words as a set, for fast membership tests during filtering.
english_stopwords = stopwords.words("english")
stoplist = set(english_stopwords)

In [3]:
# Show the stop-word set for a quick visual check (a set, so order is arbitrary).
print(stoplist)

{'again', "you'd", 'until', 'was', 'of', 'shouldn', 'themselves', 'do', 'he', 'o', 'on', 'into', 'any', 'not', 'don', 'to', 'these', 'wouldn', 'up', 'their', 'that', 've', 'been', 'such', 'mustn', 'should', 'under', 'isn', 'against', 'hasn', 'more', 'where', 'by', 'too', "it's", "she's", 'ain', 'himself', 'll', 'nor', 'yourselves', 'when', "you've", 'those', 'doesn', 'who', "shan't", 'she', 'an', 'am', 'most', 'there', 'each', 'haven', 'yours', "haven't", "mightn't", "wouldn't", 'they', 'are', 'a', 'below', 'me', 't', 'its', 'having', 'for', 'but', 'this', 'once', 'very', 'whom', 'while', 'some', 'or', 'own', "couldn't", 'aren', 'then', 'mightn', 'ma', 'you', 'further', 'no', "mustn't", 'if', 'above', 're', 'doing', 'as', "shouldn't", 'him', 'them', 'down', 'have', "weren't", 'other', "didn't", 'which', 'now', 'had', 'yourself', "aren't", 'didn', 'from', "wasn't", 'it', 'after', 'the', 'about', 'will', 'out', 'weren', "isn't", 'our', 'my', 'has', 'does', 's', 'at', 'wasn', 'shan', 'thr

In [4]:
# Load the headline dataset; the path is relative to the notebook's working directory.
headlines_csv = 'data/abcnews-date-text.csv'
df = pd.read_csv(headlines_csv)
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [5]:
# Only the headline text is used downstream, so discard the date column.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
df = df.drop(columns='publish_date', errors='ignore')
df.head()

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [6]:

# Fetch the WordNet corpus needed by WordNetLemmatizer in the next cell.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS1\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:

from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

# STOP-WORD REMOVAL
def _strip_stopwords(headline):
    """Return `headline` with every whitespace-separated token found in `stoplist` removed."""
    kept = [token for token in headline.split() if token not in stoplist]
    return ' '.join(kept)

df['without_stopwords'] = df['headline_text'].apply(_strip_stopwords)

# TOKENIZATION
tt = TweetTokenizer()
df['tokenised_tweet'] = df['without_stopwords'].apply(tt.tokenize)

# LEMMATIZATION
lmtzr = WordNetLemmatizer()

def _lemmatize_tokens(tokens):
    """Return a new list with each token passed through the WordNet lemmatizer."""
    return [lmtzr.lemmatize(token) for token in tokens]

df['lemmatize'] = df['tokenised_tweet'].apply(_lemmatize_tokens)
print(df['lemmatize'])

0           [aba, decides, community, broadcasting, licence]
1              [act, fire, witness, must, aware, defamation]
2              [g, call, infrastructure, protection, summit]
3                  [air, nz, staff, aust, strike, pay, rise]
4           [air, nz, strike, affect, australian, traveller]
                                 ...                        
1244179    [two, aged, care, resident, die, state, record...
1244180    [victoria, record, 5, ;, 919, new, case, seven...
1244181    [wa, delay, adopting, new, close, contact, def...
1244182    [western, ringtail, possum, found, badly, dehy...
1244183             [make, close, covid, contact, new, rule]
Name: lemmatize, Length: 1244184, dtype: object


In [8]:
#BIGRAMS AND TRIGRAMS
lemmatized_docs = df['lemmatize']

# Learn bigram statistics from the lemmatized documents, then learn a second
# phrase model on top of the bigram-transformed stream.
bigram_phrases = gensim.models.Phrases(lemmatized_docs, min_count=5, threshold=100)
trigram_phrases = gensim.models.Phrases(bigram_phrases[lemmatized_docs], threshold=100)

# Phraser wrappers around the trained models; these are what the helper
# functions below apply to each document.
bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Apply the module-level `bigram` phraser to every document in `texts`.

    Returns a list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply `bigram` and then `trigram` (both module-level) to every document in `texts`.

    Returns a list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram[tokens]
        merged_docs.append(trigram[with_bigrams])
    return merged_docs

data_bigrams = make_bigrams(df['lemmatize'])
# make_trigrams() applies the bigram phraser itself before the trigram one, so
# it is given the lemmatized documents rather than data_bigrams; that way each
# document goes through the bigram phraser only once.
data_bigrams_trigrams = make_trigrams(df['lemmatize'])

# Spot-check the first document.
print (data_bigrams_trigrams[0])

['aba', 'decides', 'community', 'broadcasting', 'licence']


In [9]:
#TF-IDF REMOVAL
from gensim.models import TfidfModel

# Vocabulary and bag-of-words corpus built from the phrase-merged documents.
id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03             # terms whose TF-IDF weight is below this are removed
words = []                   # surface form of every removed term, in removal order
words_missing_in_tfidf = []
for i, bow in enumerate(corpus):
    weights = tfidf[bow]  # evaluate the TF-IDF transform once per document
    tfidf_ids = {term_id for term_id, _ in weights}

    low_value_words = [term_id for term_id, value in weights if value < low_value]
    # The words with tf-idf score 0 will be missing from the transformed vector.
    # This is computed BEFORE `drops` so that the ids recorded in `words`
    # belong to the current document, not to the previous iteration.
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in tfidf_ids]

    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    # Replacing the element at the current index does not change the list
    # length, so it is safe while enumerating `corpus`.
    drop_ids = set(drops)
    corpus[i] = [entry for entry in bow if entry[0] not in drop_ids]

In [13]:
# NOTE(review): this cell rebinds `id2word` and `corpus`, replacing the
# versions built from `data_bigrams_trigrams` and TF-IDF-filtered earlier in
# the notebook. Everything below therefore works on plain lemmatized tokens --
# confirm that this is intended.
id2word = corpora.Dictionary(df['lemmatize'])

# One bag-of-words vector per lemmatized headline.
corpus = [id2word.doc2bow(tokens) for tokens in df['lemmatize']]

print (corpus[0][0:20])

# Look up the token stored under id 0 as a sanity check on the mapping.
word = id2word[0]
print (word)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]
aba


In [None]:
# Training settings collected in one place so they are easy to read and tweak.
lda_settings = dict(
    num_topics=20,
    random_state=100,      # fixed seed so repeated runs are comparable
    update_every=1,
    chunksize=100,
    passes=1,
    alpha='auto',
    per_word_topics=True,
)

# Fit the topic model on the current `corpus` / `id2word` pair.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           **lda_settings)

In [None]:
# Use the last document of the corpus as a quick smoke test.
test_doc = corpus[-1]

# The model was created with per_word_topics=True, so `lda_model[test_doc]`
# returns a 3-tuple (document topics, word topics, phi values) and cannot be
# sorted as a list. get_document_topics() with its default arguments returns
# only the list of (topic_id, probability) pairs.
vector = lda_model.get_document_topics(test_doc)
print (vector)

def Sort(sub_li):
    """Reorder `sub_li` in place by the second item of each element, largest first.

    The same list object is returned. Elements whose second items compare
    equal end up in the reverse of their original relative order.
    """
    ascending = sorted(sub_li, key=lambda pair: pair[1])
    sub_li[:] = ascending[::-1]
    return sub_li
# Sort() reorders `vector` in place and returns it, so `new_vector` and
# `vector` refer to the same list after this call.
new_vector = Sort(vector)
print (new_vector)

In [None]:
import os

# Create the target directory first so that a missing "models/" folder does
# not make the save fail after training has already finished.
model_path = "models/test_model.model"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
lda_model.save(model_path)


## Visualizing the data


In [None]:
# Switch pyLDAvis to inline notebook rendering, then build the visualisation
# from the trained model together with the corpus and dictionary it uses.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)
vis