# NLP Project
## Topic extraction
The purpose of this project is to extract topics from news articles.

### Step-by-step Process
1. Find a suitable NLP model to use for topic extraction: LDA
2. Preprocess the data
3. Get results
4. Documentation

In [1]:
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text
import gensim  # necessary?
from gensim import corpora
from gensim.models import Word2Vec, TfidfModel, LdaMulticore, CoherenceModel
import spacy
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import pyLDAvis
import pyLDAvis.gensim_models

### Data Pre-processing

In [2]:
# Large dataset model:
# from sklearn.model_selection import train_test_split

# # read in data
# df_1 = pd.read_csv('Data/articles1.csv')['content'].to_frame()  # only get content-column
# df_2 = pd.read_csv('Data/articles2.csv')['content'].to_frame()
# df_3 = pd.read_csv('Data/articles3.csv')['content'].to_frame()
# df = df_1.append(df_2).append(df_3)
# print('\nData set, shape:', df.shape)
# print(df.head(5))

# # check for missing data
# print(df.isna().sum())  # shows no null values in content-column

# # split data into ~67% training and ~33% testing
# train, test = train_test_split(df, test_size=0.33, random_state=1)
# print('\nTraining data set, shape:', train.shape)
# print('Testing data set, shape:', test.shape)

# # reset indices
# train = train.reset_index(drop=True)
# test = test.reset_index(drop=True)
# print(train.head(5))
# print('\n', test.head(5))

# Small dataset model:
# read in data
df = pd.read_csv('Data/articles1.csv')['content'].to_frame()
# df.drop(df.index[0:49900],0,inplace=True)  # drop a few rows to make dataset smaller and more manageable
print('\nData set, shape:', df.shape)
# check for missing data
print(df.isna().sum())  # shows no null values in content-column

# pd.set_option('display.max_colwidth', None)
print(df.head(20))


Data set, shape: (50000, 1)
content    0
dtype: int64
                                              content
0   WASHINGTON  —   Congressional Republicans have...
1   After the bullet shells get counted, the blood...
2   When Walt Disney’s “Bambi” opened in 1942, cri...
3   Death may be the great equalizer, but it isn’t...
4   SEOUL, South Korea  —   North Korea’s leader, ...
5   LONDON  —   Queen Elizabeth II, who has been b...
6   BEIJING  —   President Tsai   of Taiwan sharpl...
7   Danny Cahill stood, slightly dazed, in a blizz...
8   Just how   is Hillary Kerr, the    founder of ...
9   Angels are everywhere in the Muñiz family’s ap...
10  With Donald J. Trump about to take control of ...
11  THOMPSONS, Tex.  —   Can one of the most promi...
12  WEST PALM BEACH, Fla.  —   When   Donald J. Tr...
13  This article is part of a series aimed at help...
14  It’s the season for family travel and photos  ...
15  Finally. The Second Avenue subway opened in Ne...
16    pages into the   jour

### Pre-process the Data
Normalisation, cleaning, etc.

In [3]:
# load spacy nlp pre-processing pipeline to use for lemmatization
nlp = spacy.load('en_core_web_sm')

In [4]:
# create filter for gensim nlp pre-processing pipeline to include all steps except stemmatization
CUSTOM_FILTERS = [lambda x: x.lower(),  # lowercase
                  strip_tags,
                  strip_punctuation,  # replace punctuation with whitespace
                  strip_multiple_whitespaces,  # remove repeating whitespaces
                  strip_numeric,  # remove numbers
                  remove_stopwords,  # remove stopwords
                  strip_short,  # remove words with less than 3 characters
                  #  stem_text  # return porter-stemmed text,
                 ]

In [5]:
sample = "Hello, my name is something you'll never guess, Kim! ...But I wrote my signature. Right! My parents called me this, what can I say?"
print(sample)

Hello, my name is something you'll never guess, Kim! ...But I wrote my signature. Right! My parents called me this, what can I say?


In [6]:
# test sample string without filtered pipeline, i.e., with stemmatizer
test_a = preprocess_string(sample)
print(test_a)

['hello', 'guess', 'kim', 'wrote', 'signatur', 'right', 'parent', 'call']


In [7]:
# test sample string with filtered pipeline and lemmatizer
test_b = ' '.join(preprocess_string(sample, CUSTOM_FILTERS))  # pre-process without stemmatizing
lem = [token.lemma_ for token in nlp(test_b)]  # lemmatize
print(lem)

['hello', 'guess', 'kim', 'write', 'signature', 'right', 'parent', 'call']


In [8]:
def preprocess_articles(x):
    prep = ' '.join(preprocess_string(x, CUSTOM_FILTERS))
    return [token.lemma_ for token in nlp(prep)]

In [9]:
# apply final pipeline to all data
df['preprocessed'] = df['content'].apply(preprocess_articles)
print(df['preprocessed'].head(20))

0     [washington, congressional, republicans, new, ...
1     [bullet, shell, count, blood, dry, votive, can...
2     [walt, disney, ’s, ", bambi, ", open, critic, ...
3     [death, great, equalizer, be, n’t, necessarily...
4     [seoul, south, korea, north, korea, ’s, leader...
5     [london, queen, elizabeth, battling, cold, wee...
6     [beijing, president, tsai, taiwan, sharply, cr...
7     [danny, cahill, stand, slightly, daze, blizzar...
8     [hillary, kerr, founder, digital, medium, comp...
9     [angels, muñiz, family, ’s, apartment, bronx, ...
10    [donald, trump, control, white, house, dark, t...
11    [thompson, tex, promise, troubled, technology,...
12    [west, palm, beach, fla, donald, trump, rang, ...
13    [article, series, aim, helping, navigate, life...
14    [it, ’s, season, family, travel, photo, enlarg...
15    [finally, second, avenue, subway, open, new, y...
16    [pages, journal, dylann, roof, ’s, car, assert...
17    [mumbai, india, bold, risky, gamble, prime

### Prepare pre-processed articles for topic extraction models

In [10]:
# create corpus: article contents embedded according to BoW-model using dictionary
article_dict = corpora.Dictionary(df['preprocessed'])
article_dict.filter_extremes(no_below=3)
bow_corpus = [article_dict.doc2bow(line) for line in df['preprocessed']]
tfidf = TfidfModel(bow_corpus)  # take term-frequency into account in bow-model, fit
tfidf_corpus = tfidf[bow_corpus]  # transform

### LDA - Topic Modelling

In [11]:
np.random.seed(123456)
lda_model = LdaMulticore(corpus=tfidf_corpus,
                         id2word=article_dict,
                         random_state=100,
                         num_topics=5)

# tune the number of topics!

In [12]:
# save the model
lda_model.save("LDA_model/LDA_model")

# load the model
# lda_model = LdaModel.load("LDA_model/LDA_model")

In [13]:
# keywords per topic
lda_model.print_topics(-1)

doc_lda = lda_model[tfidf_corpus]

# print(lda_model.print_topic(0))
# print('\n', lda_model.print_topic(1))
# print('\n', lda_model.print_topic(2))

In [14]:
# Compute Coherence Score
coherence_model = CoherenceModel(model=lda_model, texts=df['preprocessed'], dictionary=article_dict, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print('Coherence Score: ', coherence_score)

# change num of topics to increase coherence score

Coherence Score:  0.3570446379596891


In [21]:
# print topic(s) per article
# for topic in doc_lda:
#     print(topic)

# doc_lda[4]

# print dominant topic(s) of the 20 first articles
for i in range(20):
    print(doc_lda[i])

[(2, 0.49211383), (5, 0.44548208)]
[(0, 0.49016407), (2, 0.45146358)]
[(1, 0.55731976), (3, 0.37922713)]
[(0, 0.20885485), (1, 0.10370536), (3, 0.4638048), (4, 0.19817282)]
[(0, 0.0113159055), (1, 0.0113154035), (2, 0.011315905), (3, 0.011315546), (4, 0.011315584), (5, 0.011316446), (6, 0.011315698), (7, 0.011315322), (8, 0.011315481), (9, 0.8981587)]
[(0, 0.015239792), (1, 0.01523938), (2, 0.015243181), (3, 0.01524188), (4, 0.015243668), (5, 0.015239708), (6, 0.86283445), (7, 0.015239701), (8, 0.01523936), (9, 0.015238871)]
[(0, 0.011638821), (1, 0.01163872), (2, 0.011639425), (3, 0.0116392635), (4, 0.011638596), (5, 0.011639334), (6, 0.011639948), (7, 0.011638022), (8, 0.011638247), (9, 0.8952496)]
[(5, 0.9229942)]
[(3, 0.30607757), (4, 0.6146623)]
[(0, 0.010584784), (1, 0.0105838515), (2, 0.0105847325), (3, 0.01058546), (4, 0.010583611), (5, 0.010584187), (6, 0.010583544), (7, 0.01058711), (8, 0.9047397), (9, 0.010583062)]
[(2, 0.93649244)]
[(2, 0.92083514)]
[(6, 0.9459105)]
[(0, 0.

In [16]:
vis = pyLDAvis.gensim_models.prepare(topic_model=lda_model, corpus=tfidf_corpus, dictionary=article_dict)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)