In [None]:
import os

# Switch the working directory to two levels above the directory the notebook
# was started in (presumably the repository root, so that the `src` package
# imported below resolves -- confirm against the project layout).
# The starting directory is remembered in NOTEBOOK_DIR: resolving '../..'
# against the *current* directory would climb two more levels every time this
# cell is re-run in the same kernel.
NOTEBOOK_DIR = globals().get('NOTEBOOK_DIR', os.getcwd())
path = os.path.realpath(os.path.join(NOTEBOOK_DIR, '..', '..'))
os.chdir(path)

import pandas as pd
import numpy as np
import spacy
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from spacy.tokens import Token

from src.preprocessing.preprocessing import Preprocessing


In [None]:
# Load the spaCy 'en_core_web_md' pipeline once; it is reused below to parse
# the post and reply texts.
nlp = spacy.load('en_core_web_md')

In [None]:
# Instantiate the project's preprocessing helper and point it at the data folder.
# NOTE(review): the working directory was changed by the first cell, so this
# relative path is no longer relative to the notebook's own folder -- confirm
# how `load_train_test` resolves it.
preprocessing = Preprocessing()
preprocessing.load_train_test('../../data/')

In [None]:
# Fetch the training frame; its 'post' and 'reply' columns are cleaned in place below.
# NOTE(review): `deep_copy=True` presumably returns an independent copy so the
# cleaning does not touch the data held by `preprocessing` -- confirm.
train = preprocessing.get_train_df(deep_copy=True)

##### Data cleaning: replace the HTML entity `&amp;` with 'and'

In [None]:
# Number of rows per text column that still contain the raw '&amp;' entity.
for column in ('post', 'reply'):
    print(train[column].apply(lambda text: '&amp;' in text).sum())

In [None]:
# Substitute every '&amp;' entity with the word 'and' in both text columns.
for column in ('post', 'reply'):
    train[column] = train[column].map(lambda text: text.replace('&amp;', 'and'))

In [None]:
# Sanity check after cleaning: both counts should now be 0.
for column in ('post', 'reply'):
    print(train[column].apply(lambda text: '&amp;' in text).sum())

##### Data preprocessing with spaCy

In [None]:
# Custom stop-word list: one entry per line, surrounding whitespace stripped.
# The path is relative to the current working directory.
with open('../../data/stop_words_cut.txt', 'r') as stop_word_file:
    stop_words = {entry.strip() for entry in stop_word_file}

In [None]:
def is_stop_word(token):
    """Tell whether ``token`` counts as a stop word.

    A token qualifies when spaCy already flags it (``token.is_stop``) or when
    its lower-cased text or its lemma appears in the module-level
    ``stop_words`` set.

    :param token: object exposing ``is_stop``, ``lower_`` and ``lemma_``
        (a spaCy ``Token`` in this notebook).
    :return: ``True`` for stop words, ``False`` otherwise.
    """
    if token.is_stop:
        return True
    return token.lower_ in stop_words or token.lemma_ in stop_words


# Expose the combined stop-word check as the custom attribute `token._.is_stop`.
# Registration is guarded because spaCy >= 2.1 raises a ValueError when an
# extension with the same name is registered twice, which happens whenever this
# cell is re-run in a live kernel. If `is_stop_word` itself is edited, restart
# the kernel (or register with `force=True` on spaCy >= 2.1) to swap the getter.
if not Token.has_extension('is_stop'):
    Token.set_extension('is_stop', getter=is_stop_word)

In [None]:
%%time
# Parse the texts with spaCy in batches of 500 and keep the resulting docs.
# The former `n_threads=-1` argument was dropped: it is deprecated and ignored
# since spaCy 2.1 and is no longer accepted by spaCy 3.
# NOTE(review): only every second row of 'post' is parsed (`[0::2]`) --
# presumably consecutive rows repeat the same post; confirm against the data.
doc_post_list = list(nlp.pipe(train['post'][0::2], batch_size=500))
doc_reply_list = list(nlp.pipe(train['reply'], batch_size=500))

In [None]:
def lemmatized_doc_corpus(doc_list):
    """Lazily turn parsed documents into space-separated lemma strings.

    Punctuation and whitespace tokens are skipped; every remaining token
    contributes its ``lemma_``.

    :param doc_list: iterable of documents, each an iterable of tokens that
        expose ``lemma_``, ``is_punct`` and ``is_space``.
    :return: generator yielding one unicode string per document.
    """
    for parsed in doc_list:
        lemmas = (tok.lemma_ for tok in parsed if not tok.is_punct and not tok.is_space)
        yield u' '.join(lemmas)

In [None]:
%%time
# Combine post and reply docs into one corpus and lemmatize every document.
n_expected_docs = len(doc_post_list) + len(doc_reply_list)
doc_list = doc_post_list + doc_reply_list
assert len(doc_list) == n_expected_docs
unigram_docs = np.asarray(list(lemmatized_doc_corpus(doc_list)))

In [None]:
# Display the lemmatized documents for a visual check.
unigram_docs

In [None]:
%%time
bigram_model = Phrases(unigram_docs, scoring='default', threshold=5)

In [None]:
%%time
bigram_model = Phraser(bigram_model)
bigram_docs = [u''.join(bigram_model[doc]) for doc in unigram_docs]

In [None]:
# Display the documents after applying the bigram model for a visual check.
bigram_docs

In [None]:
# Collect every underscore-joined phrase from every document.
# `str.extract` only returns the FIRST match per document, which undercounts
# phrases; `str.findall` returns all matches, which are then flattened.
# dtype=object keeps the `.str` accessor usable when `bigram_docs` is empty.
phrase_matches = pd.Series(bigram_docs, dtype=object).str.findall(r'\w+_\w+')
bi_phrases = [phrase for matches in phrase_matches for phrase in matches]

In [None]:
# Report how many phrases were extracted in total and how many are distinct.
for label, phrases in (("Phrase-count:   ", bi_phrases),
                       ("Unique phrases: ", set(bi_phrases))):
    print(label, len(phrases))

In [None]:
# Display the distinct extracted phrases for a visual check.
set(bi_phrases)

Note: the point-wise mutual information scoring function was also tried; either it did not produce proper n-grams or there was a bug in how it was used. Its results were not useful either.