In [1]:
import utils
import pandas as pd
from tqdm import tqdm
from nltk import sent_tokenize
import pickle
import regex as re

from gensim.models.word2vec import LineSentence
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS

from time import perf_counter

In [2]:
# Single root folder for every input and intermediate artefact of this notebook;
# change it here to point the whole pipeline at another data directory.
DATA_DIR = './data'

# Submissions export and the copy that is annotated manually
submissions_file = f'{DATA_DIR}/submissions.csv'
annotated_submissions_file = f'{DATA_DIR}/submissions_annotated.csv'

# Comments from selected submissions
comments_file = f'{DATA_DIR}/comments.csv'

# Tokenized sentences for processing with word2vec
tokens_file = f'{DATA_DIR}/tokens.txt'                    # unique sentences, one per line
tokenized_sents_file = f'{DATA_DIR}/tokenized_sents.txt'  # same sentences after phrase merging
ngrams_file = f'{DATA_DIR}/ngrams'                        # saved gensim Phrases model

# Tokenized comments for matching discourse atoms and concept mover's distance to comments
# TODO: marked for deletion by the author
tokenized_comments_file = f'{DATA_DIR}/tokenized_comments.p'
# TODO: delete, unless still needed for concept mover's distance
tokenized_comments_v2_file = f'{DATA_DIR}/tokenized_comments'
tokenized_comments_text_file = f'{DATA_DIR}/tokenized_comments.txt'

## Tokenize comments
- Convert comments to lowercase and replace accented letters
- Split comments into sentences and keep only the unique sentences
- Split each sentence into tokens using TreebankWordTokenizer, removing tokens that consist only of punctuation
- Finally, save the sentences (one sentence per line), which is the input format required by Gensim word2vec

In [3]:
# Only the 'body' column (the comment text) is used by this notebook, so ask
# pandas to parse just that column instead of materialising the whole CSV.
comments = pd.read_csv(comments_file, sep=';', usecols=['body'])['body'].tolist()

In [4]:
# Peek at the first five raw comment bodies to sanity-check the load.
comments[:5]

["ADDITIONALLY-- thank you so much for caring enough to ask, in a genuinely curious and respectful way. I'm curious to hear your thoughts!",
 'I am watching Shape of Water.  It says some stuff in Russian.  How can I find out what they are saying?',
 'Extremely underwhelmed by the film.\n\nIt just felt dull, the \'romance\' between fish-dick and mute-chick was too quick and I never felt any connection or empathy for the \'asset\'. \n\nThe pie-shop scenes were so quick and short, and ultimately meant nothing. "I want to get to know you" , "Ew, no." and that was it. Just the film saying "Hah, look! The 40\'s hated gays!"\n\nThe strange musical imagination scene was a complete tonal change from the rest of the film. \n\nAnd it seems the film just ignored all logic to tell the story. The \'asset\' had no security at all, no guards, no cameras. The cleaner was allowed in and out whenever she felt like it most of the time. When the guy\'s hand was injured, it took ages for any help to arrive 

In [None]:
# Gather every distinct sentence of the corpus as a space-joined token string.
unique_sentences = set()

for comment in tqdm(comments):
    # A missing body (NaN/None) would be turned by str() into the literal
    # sentence "nan" and pollute the training corpus, so skip it.
    if pd.isna(comment):
        continue

    # Normalise the text: lowercase, then replace accented letters.
    comment = utils.strip_accents(str(comment).lower())

    for sent in sent_tokenize(comment):
        tokens = utils.tokenize_sentence(sent.strip())
        text_sent = " ".join(tokens)
        # A sentence whose tokens were all dropped (e.g. punctuation only)
        # would otherwise be stored as an empty string / blank line.
        if text_sent:
            unique_sentences.add(text_sent)

In [None]:
# Report how many distinct sentences were collected, then show up to five of them.
print(len(unique_sentences))

for position, sentence in enumerate(unique_sentences):
    if position == 5:
        break
    print(sentence)

In [None]:
# Persist the unique sentences, one per line.
# The encoding is pinned to UTF-8: gensim's LineSentence decodes its input as
# UTF-8, whereas a bare open() uses the platform default (e.g. cp1252 on
# Windows), which can fail on or corrupt non-ASCII characters.
with open(tokens_file, 'w', encoding='utf-8') as f:
    for sent in tqdm(unique_sentences):
        f.write(sent + "\n")

In [5]:
# Learn ngrams
# Wrap the tokens file (one sentence per line) in gensim's LineSentence so the
# cells below can iterate over it sentence by sentence.
sentences = LineSentence(tokens_file)

In [20]:
# Fit the phrase detector on the sentence stream, then store the fitted model
# on disk so it can be reloaded without retraining.
ngrams = Phrases(
    sentences,
    min_count=15,
    connector_words=ENGLISH_CONNECTOR_WORDS,
)
ngrams.save(ngrams_file)

In [4]:
# Reload the phrase model from ngrams_file (the path the training cell saves to),
# presumably so later cells can run without retraining.
ngrams = Phrases.load(ngrams_file)

In [17]:
# Apply the phrase model to every sentence and write the result, one sentence per line.
# UTF-8 is set explicitly so the file does not depend on the platform default
# encoding; this file is described above as word2vec input, and gensim's
# line-based readers decode UTF-8.
with open(tokenized_sents_file, 'w', encoding='utf-8') as f:
    for sent in tqdm(sentences):
        # ngrams[sent] returns the sentence's tokens with detected phrases merged.
        f.write(" ".join(ngrams[sent]) + "\n")

5400518it [02:12, 40863.92it/s]


In [22]:
# Sanity check: print the phrase-merged tokens of the first two sentences.
for shown, sentence in enumerate(sentences, start=1):
    print(ngrams[sentence])
    if shown == 2:
        break

['that', 'whole', 'catacombs', 'scene', 'was', 'amazing']
['maybe', 'youre', 'on', 'to', 'something', 'about', 'it', 'being', 'experimental', 'and', 'i', 'think', 'he', 'experimented', 'either', 'too_much', 'or', 'not', 'enough', 'with', 'this', 'one']


In [23]:
# del?
# Serialise each comment as its own pickle record: a flat list holding the
# phrase-merged tokens of all of the comment's sentences, in order.
with open(tokenized_comments_v2_file, 'wb') as f:

    for raw_comment in tqdm(comments):
        # Same normalisation as for the sentence corpus: lowercase, strip accents.
        normalised = utils.strip_accents(str(raw_comment).lower())

        tokenized_comment = [
            token
            for sentence in sent_tokenize(normalised)
            for token in ngrams[utils.tokenize_sentence(sentence.strip())]
        ]

        pickle.dump(tokenized_comment, f)

100%|██████████████████████████████████████████████████████████████████████| 2116273/2116273 [12:29<00:00, 2822.57it/s]


In [7]:
# Write every comment as a single line of space-separated, phrase-merged tokens.
# Exactly one line is written per entry of `comments`, so line i of the file
# corresponds to comments[i].
# UTF-8 is set explicitly so the output does not depend on the platform's
# default encoding (a bare open() uses e.g. cp1252 on Windows).
with open(tokenized_comments_text_file, 'w', encoding='utf-8') as f:

    for comment in tqdm(comments):
        # Normalise: lowercase, then replace accented letters.
        comment = utils.strip_accents(str(comment).lower())

        # One space-joined string per sentence; joined once at the end instead
        # of growing a string with repeated concatenation.
        sentence_texts = (
            " ".join(ngrams[utils.tokenize_sentence(sent.strip())])
            for sent in sent_tokenize(comment)
        )

        tokenized_comment = " ".join(sentence_texts).strip()
        f.write(tokenized_comment + "\n")

100%|██████████████████████████████████████████████████████████████████████| 2116273/2116273 [12:11<00:00, 2894.82it/s]
