# NLP Project
## Topic extraction
The purpose of this project is to extract topics from news articles.

### Step-by-step Process
1. Find a suitable NLP model to use for topic extraction: BERT
2. Preprocess the data
3. Get results
4. Documentation

In [16]:
# Environment setup, kept commented out: uncomment the lines below to install
# the packages in a fresh environment.
# NOTE(review): the tensorflow-gpu pin is presumably what bert-serving-server
# needs — confirm before changing it.
# !pip install bertopic --ignore-installed llvmlite
# !pip install bertopic[visualization] --ignore-installed llvmlite
# !pip install tensorflow-gpu==1.15
# !pip install -U bert-serving-server bert-serving-client

In [3]:
# import dependencies
import pandas as pd
import spacy
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text

### Data Pre-processing

In [4]:
# Read in the data. Only the article text is needed, so `usecols` tells pandas
# to parse just the 'content' column instead of loading every column of the
# CSV and throwing the rest away. The result is still a one-column DataFrame.
df = pd.read_csv('Data/articles1.csv', usecols=['content'])
print('\nData set, shape:', df.shape)
print(df.head(1))


Data set, shape: (50000, 1)
                                             content
0  WASHINGTON  —   Congressional Republicans have...


In [5]:
# Count the missing values in each column (column-wise sum of the null mask);
# the recorded output shows none in the 'content' column.
print(df.isna().sum(axis=0))

content    0
dtype: int64


In [6]:
# Load the small English spaCy pipeline. Only token lemmas are read from it
# (see preprocess_articles), so the dependency parser and the named-entity
# recognizer are disabled to avoid running them on every article.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [7]:
# Filters handed to gensim's preprocess_string, applied in the order listed.
# Stemming (stem_text) is deliberately left out of the pipeline:
# preprocess_articles lemmatizes the cleaned text with spaCy instead.
CUSTOM_FILTERS = [
    str.lower,                   # lowercase everything
    strip_tags,                  # remove markup tags
    strip_punctuation,           # punctuation -> whitespace
    strip_multiple_whitespaces,  # collapse runs of whitespace
    strip_numeric,               # drop digits
    remove_stopwords,            # drop stopwords
    strip_short,                 # drop words shorter than 3 characters
]

In [8]:
def preprocess_articles(x):
    """Clean one article with the gensim filters, then lemmatize it with spaCy.

    Parameters
    ----------
    x : str
        Raw text of a single article.

    Returns
    -------
    list
        The lemma of every spaCy token found in the cleaned text.
    """
    # gensim returns the surviving words as a list; spaCy wants one string.
    cleaned_words = preprocess_string(x, CUSTOM_FILTERS)
    doc = nlp(' '.join(cleaned_words))

    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
    return lemmas

In [9]:
# Run the cleaning + lemmatization function on every article, element by
# element, and keep the resulting token lists in a new column.
df['preprocessed'] = df['content'].map(preprocess_articles)

In [10]:
# Peek at the first row of the preprocessed column
print(df['preprocessed'].iloc[:1])

0    [washington, congressional, republicans, new, ...
Name: preprocessed, dtype: object


In [11]:
# Write the frame (original text plus the new 'preprocessed' column) to a CSV
# file, leaving the index out.
# NOTE: 'preprocessed' holds Python lists; CSV stores them in their text form.
df.to_csv(path_or_buf='Data/articles_preprocessed.csv', index=False)

### Word Embeddings with BERT

In [None]:
# Start the bert-as-service server. A bare shell command is not valid Python
# in a code cell, so the server is launched as a child process instead; this
# also keeps the kernel free to run the client cell while the server is up.
# Call bert_server.terminate() to shut the server down when finished.
import subprocess

bert_server = subprocess.Popen([
    'bert-serving-start',
    '-model_dir', '/BERT/wwm_uncased_L-24_H-1024_A-16/',
    '-num_worker=1',
])

In [None]:
# Connect to the running BERT server and embed three example words.
# Fixes: the import must name the class without call parentheses, string
# literals need straight quotes, and encode() takes ONE list of strings
# (one entry per text to embed) rather than several separate lists.
from bert_serving.client import BertClient

client = BertClient()
vectors = client.encode(['dog', 'cat', 'man'])