# NLP Project
## Topic extraction
The purpose of this project is to extract topics from news articles.

### Step-by-step Process
1. Find a suitable NLP model to use for topic extraction: LDA
2. Preprocess the data
3. Get results
4. Documentation

In [42]:
# import dependencies
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text
import gensim  # necessary?
from gensim import corpora
from gensim.models import Word2Vec, TfidfModel, LdaMulticore, LsiModel
import spacy

import warnings
warnings.filterwarnings('ignore')

### Data Pre-processing

In [43]:
# Large dataset model:
# from sklearn.model_selection import train_test_split

# # read in data
# df_1 = pd.read_csv('Data/articles1.csv')['content'].to_frame()  # only get content-column
# df_2 = pd.read_csv('Data/articles2.csv')['content'].to_frame()
# df_3 = pd.read_csv('Data/articles3.csv')['content'].to_frame()
# df = df_1.append(df_2).append(df_3)
# print('\nData set, shape:', df.shape)
# print(df.head(5))

# # check for missing data
# print(df.isna().sum())  # shows no null values in content-column

# # split data into ~67% training and ~33% testing
# train, test = train_test_split(df, test_size=0.33, random_state=1)
# print('\nTraining data set, shape:', train.shape)
# print('Testing data set, shape:', test.shape)

# # reset indices
# train = train.reset_index(drop=True)
# test = test.reset_index(drop=True)
# print(train.head(5))
# print('\n', test.head(5))

# Small dataset model:
# read in data
df = pd.read_csv('Data/articles1.csv')['content'].to_frame()
df.drop(df.index[0:49900],0,inplace=True)  # drop a few rows to make dataset smaller and more manageable
print('\nData set, shape:', df.shape)
# check for missing data
print(df.isna().sum())  # shows no null values in content-column

pd.set_option('display.max_colwidth', None)
print(df.head(2))

num_articles = df.shape[0]


Data set, shape: (100, 1)
content    0
dtype: int64
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

### Pre-process the Data

In [44]:
# load spacy nlp pre-processing pipeline to use for lemmatization
nlp = spacy.load('en_core_web_sm')

In [45]:
# create filter for gensim nlp pre-processing pipeline to include all steps except stemmatization
CUSTOM_FILTERS = [lambda x: x.lower(),  # lowercase
                  strip_tags,
                  strip_punctuation,  # replace punctuation with whitespace
                  strip_multiple_whitespaces,  # remove repeating whitespaces
                  strip_numeric,  # remove numbers
                  remove_stopwords,  # remove stopwords
                  strip_short,  # remove words with less than 3 characters
                  #  stem_text  # return porter-stemmed text,
                 ]

In [46]:
sample = "Hello, my name is something you'll never guess, Kim! ...But I wrote my signature. Right! My parents called me this, what can I say?"
print(sample)

Hello, my name is something you'll never guess, Kim! ...But I wrote my signature. Right! My parents called me this, what can I say?


In [47]:
# test sample string without filtered pipeline, i.e., with stemmatizer
test_a = preprocess_string(sample)
print(test_a)

['hello', 'guess', 'kim', 'wrote', 'signatur', 'right', 'parent', 'call']


In [48]:
# test sample string with filtered pipeline and lemmatizer
test_b = ' '.join(preprocess_string(sample, CUSTOM_FILTERS))  # pre-process without stemmatizing
lem = [token.lemma_ for token in nlp(test_b)]  # lemmatize
print(lem)

['hello', 'guess', 'kim', 'write', 'signature', 'right', 'parent', 'call']


In [49]:
def preprocess_articles(x):
    prep = ' '.join(preprocess_string(x, CUSTOM_FILTERS))
    return [token.lemma_ for token in nlp(prep)]

In [50]:
# apply final pipeline to all data
df['preprocessed'] = df['content'].apply(preprocess_articles)

In [61]:
# print head of preprocessed df
print(df['preprocessed'].head(2))

49900                                                                                               [", let, timid, paul, ryan, exhort, members, house, tuesday, moment, republicans, fashion, speaker, gird, party, call, ", the, opportunity, lifetime, ", chance, remake, washington, donald, trump, unify, republican, control, government, ryan, need, n’t, worried, member, ’, gumption, commit, act, political, overreach, congress, swear, night, majority, house, republicans, vote, closed, door, gut, office, congressional, ethic, independent, body, arisen, era, gop, scandal, act, well, judgment, leader, include, ryan, accurately, warn, undermine, party, ’s, effort, open, new, congress, mood, unity, optimism, predictable, uproar, ensue, modest, push, trump, republican, lawmaker, hastily, reconvene, rescind, amendment, barely, minute, congress, gavele, session, gop, ’, ethic, disaster, episode, ...]
49901    [congress, session, thing, look, usual, percent, member, identify, christians, proportion

### Prepare pre-processed articles for topic extraction models

In [52]:
# create corpus: article contents embedded according to BoW-model using dictionary
article_dict = corpora.Dictionary(df['preprocessed'])
bow_corpus = [article_dict.doc2bow(line) for line in df['preprocessed']]
tfidf = TfidfModel(bow_corpus)  # take term-frequency into account in bow-model, fit
tfidf_corpus = tfidf[bow_corpus]  # transform

In [53]:
# model = Word2Vec(df['preprocessed'], workers=4)
# print(model)
# print(model.wv.most_similar(positive=['president'], topn=5))
# model.save('model.bin')
# model = Word2Vec.load('model.bin')

### LDA - Topic Modelling

In [54]:
lda_model = LdaMulticore(corpus=tfidf_corpus,
                         id2word=article_dict,
                         random_state=100,
                         num_topics=num_articles)

# save the model
lda_model.save("LDA_model/LDA_model")
# lda_model = LdaModel.load("LDA_model/LDA_model")

In [59]:
# topics per article
# lda_model.print_topics(-1)

# topics for first and second article
print(lda_model.print_topics(-1)[0])
print('\n', lda_model.print_topics(-1)[1])

(0, '0.002*"siddiqui" + 0.001*"ethic" + 0.001*"hearing" + 0.001*"marine" + 0.001*"elaine" + 0.001*"confirmation" + 0.001*"daily" + 0.001*"tax" + 0.001*"tuesday" + 0.001*"booker"')

 (1, '0.000*"runway" + 0.000*"sandy" + 0.000*"ring" + 0.000*"ringleader" + 0.000*"riot" + 0.000*"roar" + 0.000*"rooftop" + 0.000*"rumple" + 0.000*"rhode" + 0.000*"sand"')


### LSA - Topic Modelling

In [56]:
LSI_model = LsiModel(corpus=tfidf_corpus,
                     id2word=article_dict,
                     num_topics=num_articles) 

# save model


In [60]:
# topics per article
# LSI_model.print_topics(-1)

# topics for first and second article
print(LSI_model.print_topics(-1)[0])
print('\n', LSI_model.print_topics(-1)[1])

(0, '0.176*"trump" + 0.152*"intelligence" + 0.115*"nominee" + 0.110*"senate" + 0.110*"ethic" + 0.099*"hearing" + 0.095*"russian" + 0.093*"confirmation" + 0.093*"russia" + 0.093*"student"')

 (1, '-0.706*"rafsanjani" + -0.190*"iran" + -0.177*"khamenei" + -0.142*"khomeini" + -0.138*"iranian" + 0.133*"nominee" + 0.122*"ethic" + -0.103*"ahmadinejad" + 0.102*"senate" + 0.095*"hearing"')


#### Future improvement: optimise number of topics