# NLP Project
## Topic extraction
The purpose of this project is to extract topics from news articles.

### Step-by-step Process
1. Find a suitable NLP model to use for topic extraction: BERT
2. Preprocess the data
3. Get results
4. Documentation

In [76]:
# import dependencies
import pandas as pd
import spacy
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text
from transformers import BertTokenizer, BertModel
import numpy as np
import torch

### Data Pre-processing

In [100]:
# read in data
df = pd.read_csv('Data/articles1.csv')['content'].to_frame()
df.drop(df.index[0:49990],0,inplace=True)  # drop a few rows to make dataset smaller and more manageable
print('\nData set, shape:', df.shape)
print(df.head(1))


Data set, shape: (10, 1)
                                                 content
49990  In January 1999, Prosecutor General Yury Skura...


  df.drop(df.index[0:49990],0,inplace=True)  # drop a few rows to make dataset smaller and more manageable


In [101]:
# check for missing data
print(df.isna().sum())  # shows no null values in content-column

content    0
dtype: int64


In [102]:
# load pipeline
nlp = spacy.load('en_core_web_sm')

In [103]:
# create filter for gensim nlp pre-processing pipeline to include all steps except stemmatization
CUSTOM_FILTERS = [lambda x: x.lower(),  # lowercase
                  strip_tags,
                  strip_punctuation,  # replace punctuation with whitespace
                  strip_multiple_whitespaces,  # remove repeating whitespaces
                  strip_numeric,  # remove numbers
                  remove_stopwords,  # remove stopwords
                  strip_short,  # remove words with less than 3 characters
                  #  stem_text  # return porter-stemmed text,
                 ]

In [104]:
def preprocess_articles(x):
    prep = ' '.join(preprocess_string(x, CUSTOM_FILTERS))
    return [token.lemma_ for token in nlp(prep)]

In [105]:
# apply final pipeline to all data
df['preprocessed'] = df['content'].apply(preprocess_articles)

In [106]:
# print head of preprocessed df
print(df['preprocessed'].head(1))

49990    [january, prosecutor, general, yury, skuratov,...
Name: preprocessed, dtype: object


In [109]:
# ' '.join(df['preprocessed'].iloc[0])
df['text'] = df['preprocessed'].apply(lambda x: ' '.join(x))
df['text']

49990    january prosecutor general yury skuratov summo...
49991    article feature send email politic policy dail...
49992    president obama ’s farewell speech exercise ma...
49993    update january donald trump announce press con...
49994    large cohort americans reservation presidency ...
49995    chairman ceo exxonmobil rex tillerson admit cl...
49996    I ’ve spend nearly year look intelligence chal...
49997    donald trump take necessary step resolve confl...
49998    dozen college force close year week obama admi...
49999    force gravity describe number metaphor it ’s g...
Name: text, dtype: object

### Word Embeddings with BERT

In [84]:
bert_model = BertModel.from_pretrained('bert-base-uncased',
           output_hidden_states = True,)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [116]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def bert_tokenize(text, tokenizer):
    marked_text = '[CLS] ' + text + ' [SEP]'
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokenized_text, tokens_tensor, segments_tensors

# testing = bert_tokenize('hi my name')
# testing[0]
# testing[1]
# testing[2]

In [117]:
def bert_embed(tokens_tensor, segments_tensors, model):
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Removing the first hidden state
        # The first state is the input state
        hidden_states = outputs[2][1:]

    # Getting embeddings from the final BERT layer
    token_embeddings = hidden_states[-1]
    # Collapsing the tensor into 1-dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    # Converting torchtensors to lists
    list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]

    return list_token_embeddings

In [113]:
df['tok'] = df['text'].apply(bert_tokenize)
df['tok']

49990    ([[CLS], january, prosecutor, general, yu, ##r...
49991    ([[CLS], article, feature, send, email, pol, #...
49992    ([[CLS], president, obama, ’, s, farewell, spe...
49993    ([[CLS], update, january, donald, trump, annou...
49994    ([[CLS], large, co, ##hort, americans, reserva...
49995    ([[CLS], chairman, ceo, ex, ##xon, ##mo, ##bil...
49996    ([[CLS], i, ’, ve, spend, nearly, year, look, ...
49997    ([[CLS], donald, trump, take, necessary, step,...
49998    ([[CLS], dozen, college, force, close, year, w...
49999    ([[CLS], force, gravity, describe, number, met...
Name: tok, dtype: object

In [120]:
for text in df['tok']:
    tokenized_text, tokens_tensor, segments_tensors = bert_tokenize(text, tokenizer)
    list_token_embeddings = bert_embed(tokens_tensor, segments_tensors, model)
    
    # Find the position 'bank' in list of tokens
    word_index = tokenized_text.index('bank')
    # Get the embedding for bank
    word_embedding = list_token_embeddings[word_index]

    target_word_embeddings.append(word_embedding)

(['[CLS]', 'january', 'prosecutor', 'general', 'yu', '##ry', 'sk', '##ura', '##tov', 'summon', 'k', '##rem', '##lin', 'boris', 'ye', '##lts', '##in', '’', 's', 'chief', 'staff', 'show', 'video', '##ta', '##pe', 'man', 'look', 'like', '"', 'sk', '##ura', '##tov', 'fr', '##olic', 'bed', 'prostitute', 'ask', 'sk', '##ura', '##tov', 'resign', 'prosecutor', 'middle', 'investigate', 'ye', '##lts', '##in', '’', 's', 'administration', 'take', 'bribe', 'swiss', 'firm', 'try', 'secure', 'lucrative', 'contract', 'k', '##rem', '##lin', 'renovation', 'grain', '##y', 'tape', 'sk', '##ura', '##tov', 'later', 'fake', 'submit', 'resignation', 'nonetheless', 'big', 'intelligence', 'question', 'raise', 'trump', 'dos', '##sier', 'happen', 'decisive', 'battle', 'determine', 'replace', 'ye', '##lts', '##in', 'second', 'presidential', 'term', 'ex', '##pire', 'sk', '##ura', '##tov', '’', 's', 'resignation', 'confirm', 'federation', 'council', 'upper', 'chamber', 'russian', 'parliament', 'k', '##rem', '##lin',