# NLP Project
## Topic extraction
The purpose of this project is to extract topics from news articles.

### Step-by-step Process
1. Find a suitable NLP model to use for topic extraction: LDA
2. Preprocess the data
3. Get results
4. Documentation

In [1]:
# import dependencies
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text
import gensim  # necessary?
from gensim import corpora
from gensim.models import Word2Vec, TfidfModel, LdaMulticore, LsiModel
import spacy

import warnings
warnings.filterwarnings('ignore')

### Data Pre-processing

In [2]:
# Large dataset model:
# from sklearn.model_selection import train_test_split

# # read in data
# df_1 = pd.read_csv('Data/articles1.csv')['content'].to_frame()  # only get content-column
# df_2 = pd.read_csv('Data/articles2.csv')['content'].to_frame()
# df_3 = pd.read_csv('Data/articles3.csv')['content'].to_frame()
# df = df_1.append(df_2).append(df_3)
# print('\nData set, shape:', df.shape)
# print(df.head(5))

# # check for missing data
# print(df.isna().sum())  # shows no null values in content-column

# # split data into ~67% training and ~33% testing
# train, test = train_test_split(df, test_size=0.33, random_state=1)
# print('\nTraining data set, shape:', train.shape)
# print('Testing data set, shape:', test.shape)

# # reset indices
# train = train.reset_index(drop=True)
# test = test.reset_index(drop=True)
# print(train.head(5))
# print('\n', test.head(5))

# Small dataset model:
# read in data
df = pd.read_csv('Data/articles1.csv')['content'].to_frame()
df.drop(df.index[0:40000],0,inplace=True)  # drop a few rows to make dataset smaller and more manageable
print('\nData set, shape:', df.shape)
# check for missing data
print(df.isna().sum())  # shows no null values in content-column

pd.set_option('display.max_colwidth', None)
print(df.head(1))

num_articles = df.shape[0]


Data set, shape: (10000, 1)
content    0
dtype: int64
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

### Pre-process the Data

In [3]:
# load spacy nlp pre-processing pipeline to use for lemmatization
nlp = spacy.load('en_core_web_sm')

In [4]:
# create filter for gensim nlp pre-processing pipeline to include all steps except stemmatization
CUSTOM_FILTERS = [lambda x: x.lower(),  # lowercase
                  strip_tags,
                  strip_punctuation,  # replace punctuation with whitespace
                  strip_multiple_whitespaces,  # remove repeating whitespaces
                  strip_numeric,  # remove numbers
                  remove_stopwords,  # remove stopwords
                  strip_short,  # remove words with less than 3 characters
                  #  stem_text  # return porter-stemmed text,
                 ]

In [5]:
sample = "Hello, my name is something you'll never guess, Kim! ...But I wrote my signature. Right! My parents called me this, what can I say?"
print(sample)

Hello, my name is something you'll never guess, Kim! ...But I wrote my signature. Right! My parents called me this, what can I say?


In [6]:
# test sample string without filtered pipeline, i.e., with stemmatizer
test_a = preprocess_string(sample)
print(test_a)

['hello', 'guess', 'kim', 'wrote', 'signatur', 'right', 'parent', 'call']


In [7]:
# test sample string with filtered pipeline and lemmatizer
test_b = ' '.join(preprocess_string(sample, CUSTOM_FILTERS))  # pre-process without stemmatizing
lem = [token.lemma_ for token in nlp(test_b)]  # lemmatize
print(lem)

['hello', 'guess', 'kim', 'write', 'signature', 'right', 'parent', 'call']


In [8]:
def preprocess_articles(x):
    prep = ' '.join(preprocess_string(x, CUSTOM_FILTERS))
    return [token.lemma_ for token in nlp(prep)]

In [9]:
# apply final pipeline to all data
df['preprocessed'] = df['content'].apply(preprocess_articles)

In [10]:
# print head of preprocessed df
print(df['preprocessed'].head(1))

40000    [cnn, shin, horrific, description, time, north, korean, prison, camp, book, key, witness, united, nations, grab, headline, world, north, korean, defector, win, human, right, award, inspire, documentary, memoir, translate, language, publisher, book, author, shin, claim, bear, escape, north, korean, prison, camp, reveal, part, story, tell, be, n’t, true, shin, spend, life, north, korea, different, prison, camp, total, control, zone, form, title, biography, shin, ’s, account, time, gulag, widely, report, interview, medium, include, cnn, write, opinion, piece, describe, experience, cnn, digital, do, n’t, discount, korean, story, defector, ’s, recant, advocate, blaine, harden, author, book, ", escape, camp, say, statement, website, weekend, shin, ...]
Name: preprocessed, dtype: object


### Prepare pre-processed articles for topic extraction models

In [11]:
# create corpus: article contents embedded according to BoW-model using dictionary
article_dict = corpora.Dictionary(df['preprocessed'])
bow_corpus = [article_dict.doc2bow(line) for line in df['preprocessed']]
tfidf = TfidfModel(bow_corpus)  # take term-frequency into account in bow-model, fit
tfidf_corpus = tfidf[bow_corpus]  # transform

In [12]:
# model = Word2Vec(df['preprocessed'], workers=4)
# print(model)
# print(model.wv.most_similar(positive=['president'], topn=5))
# model.save('model.bin')
# model = Word2Vec.load('model.bin')

### LDA - Topic Modelling

In [13]:
lda_model = LdaMulticore(corpus=tfidf_corpus,
                         id2word=article_dict,
                         random_state=100,
                         num_topics=500)

# save the model
lda_model.save("LDA_model/LDA_model")
# lda_model = LdaModel.load("LDA_model/LDA_model")

In [14]:
# topics per article
lda_model.print_topics(-1)

# doc_bow = doc2bow(document.split())
# doc_lda = lda[doc_bow]

[(0,
  '0.000*"lessening" + 0.000*"carolinians" + 0.000*"bumps" + 0.000*"bejeezus" + 0.000*"acura" + 0.000*"orlofsky" + 0.000*"gtb" + 0.000*"morningstar" + 0.000*"suisse" + 0.000*"hypnotize"'),
 (1,
  '0.000*"lessening" + 0.000*"carolinians" + 0.000*"bumps" + 0.000*"bejeezus" + 0.000*"acura" + 0.000*"orlofsky" + 0.000*"gtb" + 0.000*"morningstar" + 0.000*"suisse" + 0.000*"hypnotize"'),
 (2,
  '0.000*"lessening" + 0.000*"carolinians" + 0.000*"bumps" + 0.000*"bejeezus" + 0.000*"acura" + 0.000*"orlofsky" + 0.000*"gtb" + 0.000*"morningstar" + 0.000*"suisse" + 0.000*"hypnotize"'),
 (3,
  '0.000*"lessening" + 0.000*"carolinians" + 0.000*"bumps" + 0.000*"bejeezus" + 0.000*"acura" + 0.000*"orlofsky" + 0.000*"gtb" + 0.000*"morningstar" + 0.000*"suisse" + 0.000*"hypnotize"'),
 (4,
  '0.000*"lessening" + 0.000*"carolinians" + 0.000*"bumps" + 0.000*"bejeezus" + 0.000*"acura" + 0.000*"orlofsky" + 0.000*"gtb" + 0.000*"morningstar" + 0.000*"suisse" + 0.000*"hypnotize"'),
 (5,
  '0.000*"lessening" + 0.

### LSA - Topic Modelling

In [15]:
LSI_model = LsiModel(corpus=tfidf_corpus,
                     id2word=article_dict,
                     num_topics=500) 

# save model


In [16]:
# topics per article
LSI_model.print_topics(-1)

[(0,
  '0.349*"trump" + 0.172*"clinton" + 0.132*"\'" + 0.103*"president" + 0.097*"obama" + 0.097*"republican" + 0.093*"say" + 0.091*"campaign" + 0.079*"state" + 0.078*"""'),
 (1,
  '0.454*"trump" + 0.267*"clinton" + -0.230*"tesla" + -0.162*"apple" + -0.148*"company" + -0.122*"car" + 0.111*"republican" + 0.111*"campaign" + 0.107*"comey" + 0.095*"donald"'),
 (2,
  '-0.566*"tesla" + -0.209*"musk" + -0.207*"trump" + 0.189*"police" + -0.167*"model" + -0.149*"clinton" + -0.147*"apple" + -0.146*"car" + 0.141*"isis" + -0.128*"company"'),
 (3,
  '-0.414*"tesla" + 0.285*"clinton" + 0.269*"apple" + -0.204*"comey" + -0.181*"russia" + -0.156*"musk" + -0.154*"trump" + -0.134*"russian" + 0.124*"sander" + -0.118*"fbi"'),
 (4,
  '0.417*"apple" + -0.321*"clinton" + -0.290*"tesla" + 0.272*"trump" + 0.176*"iphone" + 0.168*"comey" + -0.151*"police" + -0.132*"sander" + 0.124*"russia" + -0.107*"musk"'),
 (5,
  '0.445*"clinton" + 0.345*"comey" + -0.291*"trump" + 0.262*"fbi" + 0.243*"apple" + 0.205*"email" + 0

#### Compare with results for the same article using NER:
- Sagittarius, Center for Astrophysics, Chile

#### Future improvement: optimise number of topics