# NLP Project
## Topic extraction
The purpose of this project is to extract topics from news articles.

### Step-by-step Process
1. Find a suitable NLP model to use for topic extraction: LDA
2. Preprocess the data
3. Get results
4. Documentation

In [15]:
# import dependencies
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text
import gensim  # necessary?
from gensim import corpora
from gensim.models import Word2Vec, TfidfModel, LdaMulticore
import spacy

### Data Pre-processing

In [16]:
# Large dataset model:
# from sklearn.model_selection import train_test_split

# # read in data
# df_1 = pd.read_csv('Data/articles1.csv')['content'].to_frame()  # only get content-column
# df_2 = pd.read_csv('Data/articles2.csv')['content'].to_frame()
# df_3 = pd.read_csv('Data/articles3.csv')['content'].to_frame()
# df = df_1.append(df_2).append(df_3)
# print('\nData set, shape:', df.shape)
# print(df.head(5))

# # check for missing data
# print(df.isna().sum())  # shows no null values in content-column

# # split data into ~67% training and ~33% testing
# train, test = train_test_split(df, test_size=0.33, random_state=1)
# print('\nTraining data set, shape:', train.shape)
# print('Testing data set, shape:', test.shape)

# # reset indices
# train = train.reset_index(drop=True)
# test = test.reset_index(drop=True)
# print(train.head(5))
# print('\n', test.head(5))

# Small dataset model:
# read in data
df = pd.read_csv('Data/articles1.csv')['content'].to_frame()
df.drop(df.index[0:49900],0,inplace=True)  # drop a few rows to make dataset smaller and more manageable
print('\nData set, shape:', df.shape)
# check for missing data
print(df.isna().sum())  # shows no null values in content-column
print(df.head(5))


Data set, shape: (100, 1)
content    0
dtype: int64
                                                 content
49900  “Let us not be timid,” Paul Ryan exhorted memb...
49901  The 115th Congress is back in session, and at ...
49902  Don t Miss, Pop Culture Is Having a Metaphysic...
49903  It was only eight years ago that the United St...
49904     Toward the end of Bright Lights: Starring D...


### Pre-process the Data

In [17]:
# load spacy nlp pre-processing pipeline to use for lemmatization
nlp = spacy.load('en_core_web_sm')

In [18]:
# create filter for gensim nlp pre-processing pipeline to include all steps except stemmatization
CUSTOM_FILTERS = [lambda x: x.lower(),  # lowercase
                  strip_tags,
                  strip_punctuation,  # replace punctuation with whitespace
                  strip_multiple_whitespaces,  # remove repeating whitespaces
                  strip_numeric,  # remove numbers
                  remove_stopwords,  # remove stopwords
                  strip_short,  # remove words with less than 3 characters
                  #  stem_text  # return porter-stemmed text,
                 ]

In [19]:
sample = "Hello, my name is something you'll never guess, Kim! ...But I wrote my signature. Right! My parents called me this, what can I say?"
print(sample)

Hello, my name is something you'll never guess, Kim! ...But I wrote my signature. Right! My parents called me this, what can I say?


In [20]:
# test sample string without filtered pipeline, i.e., with stemmatizer
test_a = preprocess_string(sample)
print(test_a)

['hello', 'guess', 'kim', 'wrote', 'signatur', 'right', 'parent', 'call']


In [21]:
# test sample string with filtered pipeline and lemmatizer
test_b = ' '.join(preprocess_string(sample, CUSTOM_FILTERS))  # pre-process without stemmatizing
lem = [token.lemma_ for token in nlp(test_b)]  # lemmatize
print(lem)

['hello', 'guess', 'kim', 'write', 'signature', 'right', 'parent', 'call']


In [22]:
def preprocess_articles(x):
    prep = ' '.join(preprocess_string(x, CUSTOM_FILTERS))
    return [token.lemma_ for token in nlp(prep)]

In [23]:
# apply final pipeline to all data
df['preprocessed'] = df['content'].apply(preprocess_articles)

In [24]:
# print head of preprocessed df
print(df['preprocessed'].head(5))

49900    [", let, timid, paul, ryan, exhort, members, h...
49901    [congress, session, thing, look, usual, percen...
49902    [miss, pop, culture, have, metaphysical, momen...
49903    [year, ago, united, states, president, time, d...
49904    [end, bright, light, star, debbie, reynolds, c...
Name: preprocessed, dtype: object


### Prepare pre-processed articles for LDA model
#### Embed strings

In [25]:
# create corpus: article contents embedded according to BoW-model using dictionary
article_dict = corpora.Dictionary(df['preprocessed'])
bow_corpus = [article_dict.doc2bow(line) for line in df['preprocessed']]
tfidf = TfidfModel(bow_corpus)  # take term-frequency into account in bow-model
tfidf_corpus = tfidf[bow_corpus]

In [26]:
# model = Word2Vec(df['preprocessed'], workers=4)
# print(model)
# print(model.wv.most_similar(positive=['president'], topn=5))
# model.save('model.bin')
# model = Word2Vec.load('model.bin')

### LDA - Topic Modelling

In [27]:
lda_model = LdaMulticore(corpus=tfidf_corpus,
                         id2word=article_dict,
                         random_state=100,
                         num_topics=100,
                         passes=10)

# save the model
lda_model.save("Data/LDA_model/LDA_model")
# lda_model = LdaModel.load("Data/LDA_model/LDA_model")

In [28]:
lda_model.print_topics(-1)

[(0,
  '0.000*"siddiqui" + 0.000*"marine" + 0.000*"elaine" + 0.000*"ethic" + 0.000*"hearing" + 0.000*"confirmation" + 0.000*"daily" + 0.000*"tax" + 0.000*"tuesday" + 0.000*"booker"'),
 (1,
  '0.000*"runway" + 0.000*"sandy" + 0.000*"ring" + 0.000*"ringleader" + 0.000*"riot" + 0.000*"roar" + 0.000*"rooftop" + 0.000*"rumple" + 0.000*"rhode" + 0.000*"sand"'),
 (2,
  '0.000*"runway" + 0.000*"sandy" + 0.000*"ring" + 0.000*"ringleader" + 0.000*"riot" + 0.000*"roar" + 0.000*"rooftop" + 0.000*"rumple" + 0.000*"rhode" + 0.000*"sand"'),
 (3,
  '0.007*"amoc" + 0.003*"climate" + 0.002*"model" + 0.002*"ocean" + 0.001*"collapse" + 0.000*"warming" + 0.000*"greenland" + 0.000*"freshwater" + 0.000*"warm" + 0.000*"stable"'),
 (4,
  '0.000*"runway" + 0.000*"sandy" + 0.000*"ring" + 0.000*"ringleader" + 0.000*"riot" + 0.000*"roar" + 0.000*"rooftop" + 0.000*"rumple" + 0.000*"rhode" + 0.000*"sand"'),
 (5,
  '0.010*"trump" + 0.006*"conflict" + 0.005*"kunze" + 0.005*"business" + 0.004*"digital" + 0.004*"trust" 