# NLP Project
## Topic extraction
The purpose of this project is to extract topics from news articles.

### Step-by-step Process
1. Find a suitable NLP model to use for topic extraction: Latent Dirichlet Allocation (LDA)
2. Preprocess the data
3. Get results
4. Documentation

In [None]:
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text
import gensim  # necessary?
from gensim import corpora
from gensim.models import Word2Vec, TfidfModel, LdaMulticore, CoherenceModel
import spacy
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import pyLDAvis
import pyLDAvis.gensim_models

### Load the Data

In [None]:
# Large dataset model (disabled): combine all three article files and hold out
# a test split. Uncomment this section to use it instead of the small model below.
# from sklearn.model_selection import train_test_split

# # read in data
# df_1 = pd.read_csv('Data/articles1.csv', usecols=['content'])  # only get content-column
# df_2 = pd.read_csv('Data/articles2.csv', usecols=['content'])
# df_3 = pd.read_csv('Data/articles3.csv', usecols=['content'])
# df = pd.concat([df_1, df_2, df_3])  # DataFrame.append was removed in pandas 2.0
# print('\nData set, shape:', df.shape)
# print(df.head(5))

# # check for missing data
# print(df.isna().sum())  # shows no null values in content-column

# # split data into ~67% training and ~33% testing
# train, test = train_test_split(df, test_size=0.33, random_state=1)
# print('\nTraining data set, shape:', train.shape)
# print('Testing data set, shape:', test.shape)

# # reset indices
# train = train.reset_index(drop=True)
# test = test.reset_index(drop=True)
# print(train.head(5))
# print('\n', test.head(5))

# Small dataset model:
# read in data; only the 'content' column is parsed, so the unused columns
# of the CSV are never loaded into memory
df = pd.read_csv('Data/articles1.csv', usecols=['content'])
# df.drop(df.index[0:49900], axis=0, inplace=True)  # drop a few rows to make dataset smaller and more manageable
print('\nData set, shape:', df.shape)
# check for missing data
print(df.isna().sum())  # count of missing values per column (author's note: none in the content-column)

# pd.set_option('display.max_colwidth', None)
print(df.head(20))

### Pre-process the Data
Normalisation, cleaning and lemmatization of the article text.

In [None]:
# load spacy nlp pre-processing pipeline to use for lemmatization.
# The dependency parser and named-entity recognizer are switched off because
# the code in this notebook only reads token lemmas from the pipeline; skipping
# them makes processing every article considerably cheaper.
# NOTE(review): re-enable them if any other code relies on doc.ents / doc.sents.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
# filter list for gensim's preprocess_string: every cleaning step except
# stemming (lemmatization with spaCy is used instead). Filters run in order.
CUSTOM_FILTERS = [
    str.lower,                   # lowercase
    strip_tags,                  # remove markup tags
    strip_punctuation,           # replace punctuation with whitespace
    strip_multiple_whitespaces,  # remove repeating whitespaces
    strip_numeric,               # remove numbers
    remove_stopwords,            # remove stopwords
    strip_short,                 # remove words with less than 3 characters
    # stem_text is intentionally left out (it would return porter-stemmed text)
]

In [None]:
# short example text used to compare the pre-processing variants below
sample = (
    "Hello, my name is something you'll never guess, Kim! "
    "...But I wrote my signature. Right! "
    "My parents called me this, what can I say?"
)
print(sample)

In [None]:
# baseline: run the sample through preprocess_string without passing a filter
# list, i.e. with gensim's default pipeline, which includes the stemmer
test_a = preprocess_string(sample)
print(test_a)

In [None]:
# same sample, but cleaned with CUSTOM_FILTERS (no stemming) and then
# lemmatized with spaCy
filtered_tokens = preprocess_string(sample, CUSTOM_FILTERS)
test_b = ' '.join(filtered_tokens)  # re-join the tokens into one string for spaCy
lem = [tok.lemma_ for tok in nlp(test_b)]
print(lem)

In [None]:
def preprocess_articles(x):
    """Clean one article and return its lemmas.

    The text is passed through gensim's ``preprocess_string`` with
    ``CUSTOM_FILTERS``, the resulting tokens are re-joined into a single
    string, and that string is lemmatized with the spaCy pipeline ``nlp``.

    Args:
        x: Raw article text. A missing value (``None`` / ``NaN``) is accepted.

    Returns:
        list: One lemma per spaCy token; an empty list when the text is
        missing.
    """
    # a missing cell arrives as None/NaN rather than text, so skip it instead
    # of failing inside the text filters
    if pd.isna(x):
        return []
    prep = ' '.join(preprocess_string(x, CUSTOM_FILTERS))
    return [token.lemma_ for token in nlp(prep)]

In [None]:
# run every article through the final pre-processing pipeline and keep the
# resulting lemma lists in a new column
df['preprocessed'] = df['content'].map(preprocess_articles)
print(df['preprocessed'].head(20))

### Prepare pre-processed articles for topic extraction models

In [None]:
# vocabulary: map every token of the pre-processed articles to an id, then
# drop tokens found in fewer than 3 articles (the remaining filter_extremes
# defaults still apply)
article_dict = corpora.Dictionary(df['preprocessed'])
article_dict.filter_extremes(no_below=3)

# bag-of-words corpus: one doc2bow vector per article
bow_corpus = list(map(article_dict.doc2bow, df['preprocessed']))

# TF-IDF: fit the weighting on the bag-of-words corpus ...
tfidf = TfidfModel(bow_corpus)
# ... and apply it to that same corpus
tfidf_corpus = tfidf[bow_corpus]

### LDA - Topic Modelling

In [None]:
# seed NumPy's global RNG; the model additionally gets its own random_state
np.random.seed(123456)

# train the LDA topic model on the TF-IDF weighted corpus
# NOTE(review): LDA is usually fitted on raw bag-of-words counts - confirm that
# training on TF-IDF weights (and using 500 topics) is intended.
lda_model = LdaMulticore(
    corpus=tfidf_corpus,
    num_topics=500,
    id2word=article_dict,
    random_state=100,
)

In [None]:
# save the model; the target folder is created first because saving into a
# folder that does not exist yet fails (e.g. on a fresh checkout)
import os

os.makedirs("LDA_model", exist_ok=True)
lda_model.save("LDA_model/LDA_model")

# load the model (with the same class that was used for training)
# lda_model = LdaMulticore.load("LDA_model/LDA_model")

In [None]:
# keywords per topic: print_topics() only returns the (topic_id, keywords)
# pairs, and a bare expression that is not the last line of a cell displays
# nothing, so the pairs are printed explicitly (-1 selects all topics)
for topic_id, keywords in lda_model.print_topics(-1):
    print(topic_id, keywords)

# apply the trained model to the corpus to get the topic(s) of each article
doc_lda = lda_model[tfidf_corpus]

# print(lda_model.print_topic(0))
# print('\n', lda_model.print_topic(1))
# print('\n', lda_model.print_topic(2))

In [None]:
# c_v coherence of the trained topics, computed against the pre-processed
# articles and the dictionary the model was built with
coherence_model = CoherenceModel(
    model=lda_model,
    texts=df['preprocessed'],
    dictionary=article_dict,
    coherence='c_v',
)
coherence_score = coherence_model.get_coherence()
print('Coherence Score: ', coherence_score)

# change num of topics to increase coherence score

In [None]:
# print topic(s) per article
# for topic in doc_lda:
#     print(topic)

# doc_lda[4]

# print the topic list of the first 20 articles; islice simply stops early
# instead of raising IndexError when the corpus holds fewer than 20 articles
from itertools import islice

for article_topics in islice(doc_lda, 20):
    print(article_topics)

In [None]:
# interactive pyLDAvis view of the model, rendered inline in the notebook;
# display() stays the last statement so the cell shows its result
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=tfidf_corpus,
    dictionary=article_dict,
)
pyLDAvis.display(vis)