In [None]:
import utils
import pandas as pd
from tqdm import tqdm
from nltk import sent_tokenize
import pickle
import regex as re

from gensim.models.word2vec import LineSentence
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS

from time import perf_counter

from params import comments_file, tokenized_sents_file, tokenized_comments_text_file

In [None]:
# Tokenized sentences for processing with word2vec
# (written one sentence per line, then streamed back through LineSentence)
tokens_file = './data/tokens.txt'
# Path the trained Phrases (n-gram) model is saved to and loaded from
ngrams_file = './data/ngrams'

## Tokenize comments
- Convert comments to lowercase, replace accented letters
- Split comments into sentences and make sure all sentences are unique
- Then split sentences into tokens using TreebankWordTokenizer, removing tokens that are only punctuation
- Finally, save the sentences (one sentence per line), which is the input format required by Gensim word2vec

In [None]:
# Load the semicolon-separated comments CSV and keep only the 'body' column as a plain list.
comments = pd.read_csv(comments_file, sep=';').loc[:, 'body'].tolist()

In [None]:
# Display the first five raw comments as a quick sanity check of the load
comments[:5]

In [None]:
# Gather every distinct sentence found in the comments; each sentence is stored
# as its tokens joined by single spaces so duplicates collapse in the set.
unique_sentences = set()

for raw_comment in tqdm(comments):
    # Coerce to str, lowercase, then strip accents before sentence splitting.
    # NOTE(review): str() presumably turns missing CSV cells (NaN) into the text "nan" - confirm that is acceptable.
    normalized = utils.strip_accents(str(raw_comment).lower())

    unique_sentences.update(
        " ".join(utils.tokenize_sentence(sentence.strip()))
        for sentence in sent_tokenize(normalized)
    )

In [None]:
# Report how many distinct sentences were collected, then show up to five of them.
print(len(unique_sentences))

for shown, sentence in enumerate(unique_sentences):
    if shown == 5:
        break
    print(sentence)

In [None]:
# Write the unique sentences to disk, one sentence per line.
# The encoding is pinned to UTF-8 because this file is read back through
# gensim's LineSentence, which decodes lines as UTF-8; relying on the platform
# default encoding could fail on, or corrupt, non-ASCII tokens.
with open(tokens_file, 'w', encoding='utf-8') as f:
    for sent in tqdm(unique_sentences):
        f.write(sent + "\n")

In [None]:
# Learn ngrams: iterate the tokens file lazily, one whitespace-split sentence per line
sentences = LineSentence(source=tokens_file)

In [None]:
# Train the phrase (n-gram) detector on the sentence stream and persist it to disk.
ngrams = Phrases(
    sentences=sentences,
    min_count=15,
    connector_words=ENGLISH_CONNECTOR_WORDS,
)
ngrams.save(ngrams_file)

In [None]:
# Load the Phrases model saved at ngrams_file
# (presumably so the cells below can run without retraining - confirm)
ngrams = Phrases.load(ngrams_file)

In [None]:
# Apply the learned n-grams to every sentence and write the result, one
# sentence per line with tokens separated by single spaces.
# UTF-8 is set explicitly so the output does not depend on the platform's
# default encoding and matches the UTF-8 tokens file it was derived from.
with open(tokenized_sents_file, 'w', encoding='utf-8') as f:
    for sent in tqdm(sentences):
        f.write(" ".join(ngrams[sent]) + "\n")

In [None]:
# Show the n-gram-merged tokens of the first two sentences as a spot check.
for shown, tokens in enumerate(sentences, start=1):
    print(ngrams[tokens])
    if shown == 2:
        break

In [None]:
# Write one line per comment: the comment is normalised, split into sentences,
# tokenized, passed through the n-gram model, and flattened back into a single
# space-separated string. Every comment produces exactly one output line, so
# line i of the output corresponds to comments[i].
# UTF-8 is set explicitly so the output does not depend on the platform's
# default encoding.
with open(tokenized_comments_text_file, 'w', encoding='utf-8') as f:
    for comment in tqdm(comments):
        # Same normalisation as used when building the sentence corpus.
        normalized = utils.strip_accents(str(comment).lower())

        # One space-joined string per sentence; joining these (rather than
        # repeated +=) builds the line in a single pass with identical output.
        sentence_texts = [
            " ".join(ngrams[utils.tokenize_sentence(sent.strip())])
            for sent in sent_tokenize(normalized)
        ]

        f.write(" ".join(sentence_texts).strip() + "\n")