# NLP

This notebook shows a simple fastText model trained on books by E. A. Poe, Mary Shelley, and H. P. Lovecraft.


In [None]:
!python3 -m spacy download en_core_web_sm

In [None]:
import copy
import string
from collections import Counter
from pathlib import Path
import re

from bs4 import BeautifulSoup
import en_core_web_sm
import fasttext
import gensim.models.fasttext
import numpy as np
import pandas as pd
import seaborn as sns;
import spacy
from nltk.corpus import stopwords
import requests
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
sns.set()
%matplotlib inline


### Corpus creation

For my corpus I decided to use [this](https://www.kaggle.com/c/spooky-author-identification) dataset from Kaggle as I won't have to bother too much with cleaning and splitting the sentences. It contains texts by Edgar Allan Poe, Mary Shelley, and HP Lovecraft.

In [None]:
# Read the raw corpus and discard the 'Unnamed: 0' column
# (presumably a stray index column from an earlier export - verify against the CSV).
dataset = pd.read_csv('corpus/corpus_raw.csv').drop(columns='Unnamed: 0')

In [None]:
# Load the small English spaCy pipeline once; cleanup_text() reuses it for every document.
nlp = en_core_web_sm.load()

def cleanup_text(docs, logging=False):
    """Lemmatise, lowercase and filter every document in ``docs``.

    Each document is run through the module-level spaCy pipeline ``nlp``.
    Lemmas equal to ``'-PRON-'`` are skipped, the remaining lemmas are
    lowercased and stripped, and tokens that are NLTK English stopwords or
    punctuation are removed. Surviving tokens are joined with single spaces.

    Parameters
    ----------
    docs : iterable of str
        Raw texts. When a ``pd.Series`` is passed, its index is reused for the
        result so that assigning the result back to the originating DataFrame
        lines up row by row even if the index is not a plain ``RangeIndex``.
    logging : bool, optional
        Unused; kept only so existing callers keep working.

    Returns
    -------
    pd.Series
        One cleaned string per input document, or ``np.nan`` where no token
        survived the filtering.
    """
    _stopwords = set(stopwords.words('english'))
    texts = []
    for doc in tqdm(docs):
        doc = nlp(doc, disable=['parser', 'ner', 'tagger', 'textcat'])
        tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
        # `in string.punctuation` is a substring test on a str, so besides single
        # punctuation marks it also discards tokens that became empty after strip().
        tokens = [tok for tok in tokens if tok not in _stopwords and tok not in string.punctuation]
        cleaned = ' '.join(tokens)
        texts.append(cleaned if cleaned else np.nan)
    # Keep the caller's index (if any) so column assignment aligns correctly.
    index = docs.index if isinstance(docs, pd.Series) else None
    return pd.Series(texts, index=index)

# Attach the cleaned text, then discard rows with missing values
# (including documents that cleaned down to nothing) and renumber the rows.
dataset = (
    dataset
    .assign(cleaned_text=cleanup_text(dataset['text']))
    .dropna(axis='rows')
    .reset_index(drop=True)
)

In [None]:
def save_corpus(path, serie):
    """Write the corpus to ``path`` as UTF-8 text, one document per line.

    Parameters
    ----------
    path : str or Path
        Destination file; it is created or overwritten.
    serie : iterable of str
        Documents to write. They are joined with ``'\\n'``; no trailing
        newline is added.
    """
    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding (non-ASCII characters would otherwise fail or be mangled).
    with Path(path).open(mode='w', encoding='utf-8') as fw:
        fw.write('\n'.join(serie))

save_corpus('corpus/corpus_clean.txt', dataset['cleaned_text'])

### Model creation

I decided to go with [fastText](https://fasttext.cc) because it copes better with unseen words, and mainly because I have already used it in the past.

In [None]:
# Train skip-gram embeddings on the cleaned corpus and persist them as a fastText binary.
model = fasttext.train_unsupervised(
    'corpus/corpus_clean.txt',
    model='skipgram',
    wordNgrams=3,
    epoch=25,
    dim=100,
)
model.save_model('model.mod')

In [None]:
# Reload the saved fastText binary through gensim; later cells query the vectors via `vecs.wv`.
vecs = gensim.models.fasttext.load_facebook_model('model.mod')

### Visualizations

In [None]:
# Count whitespace-separated tokens of the raw text and keep the ten most frequent.
word_counter = Counter(token for sentence in dataset['text'] for token in sentence.split())
word, count = zip(*word_counter.most_common(10))
data = dict(word=word, count=count)
df = pd.DataFrame(data)

In [None]:
sns.barplot(x='word', y='count', data=df).set_title('10 most common words in raw dataset')

In [None]:
# Same top-10 frequency table as for the raw text, this time over the cleaned text.
word_counter = Counter(token for sentence in dataset['cleaned_text'] for token in sentence.split())
word, count = zip(*word_counter.most_common(10))
data = dict(word=word, count=count)
df = pd.DataFrame(data)

In [None]:
sns.barplot(x='word', y='count', data=df).set_title('10 most common words in cleaned dataset')

The two previous plots show the importance of cleaning the data (mostly removing the stopwords).

In [None]:
def tsnescatterplot(model, word, list_names):
    """Plot a 2-D t-SNE projection of a query word, its neighbours and extra words.

    The query word is drawn in red, the words returned by
    ``model.wv.most_similar([word])`` in blue, and the words from
    ``list_names`` in green. The vectors are first reduced with PCA and then
    projected to two dimensions with t-SNE.

    Parameters
    ----------
    model : object
        Model exposing word vectors through ``model.wv`` (indexing by a list
        of words and ``most_similar``).
    word : str
        Query word.
    list_names : iterable of str
        Additional words to include in the plot.

    Returns
    -------
    None
        The figure is drawn on a new matplotlib figure as a side effect.
    """
    extra_words = list(list_names)
    close_words = [neighbour for neighbour, _score in model.wv.most_similar([word])]

    # Labels and colours follow the same order in which the vectors are stacked:
    # query word, nearest neighbours, then the caller-supplied words.
    word_labels = [word] + close_words + extra_words
    color_list = ['red'] + ['blue'] * len(close_words) + ['green'] * len(extra_words)

    # A single lookup returns one row per word, so the embedding size no longer
    # has to be hard-coded and the array is not reallocated on every append.
    arrays = model.wv[word_labels]
    n_samples, n_features = arrays.shape

    # Reduce to at most 15 dimensions with PCA before t-SNE. PCA cannot return
    # more components than min(n_samples, n_features), so clamp for small inputs.
    reduc = PCA(n_components=min(15, n_samples, n_features)).fit_transform(arrays)

    # NOTE: changes numpy's global print options (kept from the original behaviour).
    np.set_printoptions(suppress=True)

    # t-SNE needs perplexity below the number of samples; 15 is used whenever possible.
    perplexity = min(15, max(1, n_samples - 1))
    Y = TSNE(n_components=2, random_state=0, perplexity=perplexity).fit_transform(reduc)

    df = pd.DataFrame({'x': Y[:, 0],
                       'y': Y[:, 1],
                       'words': word_labels,
                       'color': color_list})

    fig, ax = plt.subplots()
    fig.set_size_inches(9, 9)

    # Scatter only (no regression line), one marker per word in its group colour.
    sns.regplot(data=df,
                x='x',
                y='y',
                fit_reg=False,
                marker='o',
                scatter_kws={'s': 40, 'facecolors': df['color']},
                ax=ax)

    # Label every point with its title-cased word.
    for row in df.itertuples(index=False):
        ax.text(row.x,
                row.y,
                '  ' + row.words.title(),
                horizontalalignment='left',
                verticalalignment='bottom',
                size=15,
                color=row.color,
                weight='normal')

    # Pad the axes so labels near the border stay inside the figure.
    ax.set_xlim(Y[:, 0].min() - 50, Y[:, 0].max() + 50)
    ax.set_ylim(Y[:, 1].min() - 50, Y[:, 1].max() + 50)

    ax.set_title('t-SNE visualization for {}'.format(word.title()))

The following plot shows the words most similar and most opposite to the word `raven` (from E. A. Poe's work).

In [None]:
# `negative` is passed as a list: older gensim releases only wrap a bare string for
# `positive`, so a bare string here would be iterated character by character.
tsnescatterplot(vecs, 'raven', [w for w, _score in vecs.wv.most_similar(negative=['raven'], topn=15)])

The following plot shows the words most similar and most opposite to the word `frankenstein` (from Mary Shelley's work).

In [None]:
# `negative` is passed as a list: older gensim releases only wrap a bare string for
# `positive`, so a bare string here would be iterated character by character.
tsnescatterplot(vecs, 'frankenstein', [w for w, _score in vecs.wv.most_similar(negative=['frankenstein'], topn=15)])

The following plot shows the words most similar and most opposite to the word `cthulhu` (from H. P. Lovecraft's work).

In [None]:
# `negative` is passed as a list: older gensim releases only wrap a bare string for
# `positive`, so a bare string here would be iterated character by character.
tsnescatterplot(vecs, 'cthulhu', [w for w, _score in vecs.wv.most_similar(negative=['cthulhu'], topn=15)])

### Processing downloaded story

Here I'm going to download a sample from Stephen King's Hearts in Atlantis, replace some of its words with their most similar words, and see the results.

In [None]:
url = 'http://authorpages.hoddersystems.com/StephenKing/sample1.asp'

In [None]:
# A timeout keeps the notebook from hanging forever on an unresponsive server.
r = requests.get(url, timeout=30)
if r.status_code != 200:
    print('Unable to download the sample!')
    # Stop here on HTTP errors instead of letting later cells parse an error page.
    r.raise_for_status()

In [None]:
# Name the parser explicitly: without it BeautifulSoup warns and picks whichever
# parser happens to be installed, so results can differ between environments.
bs = BeautifulSoup(r.text, 'html.parser')

In [None]:
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
regex = re.compile(r'\s+')
# Take the text of the <font> child of every left-aligned paragraph; paragraphs
# without a <font> child are skipped instead of raising AttributeError.
texts = [p.font.text for p in bs.find_all('p', attrs={'align': 'left'}) if p.font is not None]
text = ' '.join(texts)
# Collapse every run of whitespace (including newlines) into a single space.
text = regex.sub(' ', text)
print('Original text')
print('=' * 10)
print(text)

In [None]:
# Map each selected word to its single most similar word in the model.
translation = {
    source_word: vecs.wv.most_similar(source_word, topn=1)[0][0]
    for source_word in ('father', 'mother', 'gravestone', 'bike', 'phone')
}
translation

In [None]:
def translate(_text, translation):
    """Replace whole words of ``_text`` according to ``translation``.

    All replacements happen in a single pass, so a replacement is never
    translated again by another entry, and only whole words are matched
    (``'phone'`` does not rewrite the inside of ``'telephone'``).
    Matching is case-sensitive.

    Parameters
    ----------
    _text : object
        Text to translate; converted with ``str()``.
    translation : dict of str to str
        Mapping from a word to its replacement.

    Returns
    -------
    str
        The translated text; returned unchanged when ``translation`` has no
        usable (non-empty) keys.
    """
    text = str(_text)
    # Longest keys first so that a key which is a prefix of another cannot shadow it.
    keys = sorted((key for key in translation if key), key=len, reverse=True)
    if not keys:
        return text
    # Look-arounds rather than \b so keys that start or end with a non-word
    # character are still matched as standalone tokens.
    pattern = re.compile(r'(?<!\w)(?:' + '|'.join(map(re.escape, keys)) + r')(?!\w)')
    return pattern.sub(lambda match: translation[match.group(0)], text)

# Show the sample with the selected words replaced, under a short header.
print('Modified text', '=' * 10, translate(text, translation), sep='\n')

I decided to get the most similar words for 5 selected words, `father, mother, gravestone, bike, phone`, and replace them in the original text. The resulting text definitely lost its original meaning by replacing `father` with `playmate`, `mother` with `child` and `bike` with `crackle`. On the other hand, `gravestone` and `phone` didn't really lose their meaning when replaced with their most similar words.

In [None]:
# Map each selected word to its second most similar word in the model.
translation = {
    source_word: vecs.wv.most_similar(source_word, topn=2)[1][0]
    for source_word in ('father', 'mother', 'gravestone', 'bike', 'phone')
}
translation

In [None]:
# Map each selected word to its third most similar word in the model.
translation = {
    source_word: vecs.wv.most_similar(source_word, topn=3)[2][0]
    for source_word in ('father', 'mother', 'gravestone', 'bike', 'phone')
}
translation

I'm not even going to try to replace words with their 2nd and 3rd most similar words; it can be seen from the translation dictionaries that the resulting text won't make any sense at all.

But I am going to try to replace each word with its most similar (except for stopwords).

In [None]:
# Lowercase, turn hyphens into spaces and delete apostrophes, commas, periods and parentheses.
preprocessed_text = text.lower().translate(str.maketrans("-", " ", "',.()"))

In [None]:
# Build the stopword set once instead of once per token.
english_stopwords = set(stopwords.words('english'))
# dict.fromkeys de-duplicates while keeping first-occurrence order, so the model
# is queried once per distinct word and the dict order stays the same.
translation = {
    token: vecs.wv.most_similar(token, topn=1)[0][0]
    for token in dict.fromkeys(preprocessed_text.split())
    if token not in english_stopwords
}

translate(preprocessed_text, translation)

Yeah, it can be seen that this doesn't even make sense.

### Cleanup

In [None]:
!rm model.mod
!rm corpus/corpus_clean.txt