In [1]:
import sqlite3
import utils
import pandas as pd
from tqdm import tqdm
from nltk import sent_tokenize
import pickle

In [2]:
# Path of the SQLite database file that holds the film discussions
sql_db = './data/film_discussions'
# Single connection object, reused by the SQL queries in this notebook
conn = sqlite3.connect(database=sql_db)

## Manually filter submissions
- I only select submissions with at least 25 comments (according to the submission metadata, not the actual number of comments I have in the database).
- Based on the submission titles, I select submissions which are about a single movie and not 'meta-comments' (e.g., "why isn't there an official discussion for X?")

In [26]:
# Pull the complete submissions table into a DataFrame
submissions = pd.read_sql(sql='SELECT * FROM submissions', con=conn)

# Row count of the unfiltered table
print(submissions.shape[0])

2048


In [8]:
# Export the raw submissions (pandas defaults: comma-separated, index included)
submissions.to_csv(path_or_buf='./data/submissions.csv')

In [30]:
# Load the hand-annotated, semicolon-separated copy of the submissions.
# It carries one extra column that was left empty for official discussions;
# only rows with num_comments >= 25 were annotated.
submissions = pd.read_csv(
    filepath_or_buffer='./data/submissions_annotated.csv',
    sep=';',
)

In [33]:
# Keep rows that have at least 25 comments AND an empty `discussion_thread` annotation
has_enough_comments = submissions['num_comments'] >= 25
annotation_is_empty = submissions['discussion_thread'].isna()
submissions = submissions.loc[has_enough_comments & annotation_is_empty]

In [49]:
# Number of submissions left after filtering
print(len(submissions.index))

1160


## Select matching comments

In [41]:
# Build the comma-separated, single-quoted id list used inside the SQL IN (...) clause
joined_ids = "','".join(submissions['submission_id'].tolist())
x = f"'{joined_ids}'"

In [42]:
# Fetch every comment that belongs to one of the selected submissions.
# The ids are bound as query parameters instead of being spliced into the SQL text,
# so an id containing a quote character cannot break or alter the statement.
submission_ids = submissions['submission_id'].tolist()

# SQLite caps the number of bound parameters per statement (999 by default in
# builds older than 3.32), so the ids are queried in batches below that cap.
ID_BATCH_SIZE = 500
comment_batches = []
# max(..., 1) forces one (empty) query when there are no ids, so `comments`
# is still a DataFrame with the table's columns.
for batch_start in range(0, max(len(submission_ids), 1), ID_BATCH_SIZE):
    id_batch = submission_ids[batch_start:batch_start + ID_BATCH_SIZE]
    placeholders = ",".join("?" * len(id_batch))
    comment_batches.append(
        pd.read_sql(
            f"SELECT * FROM comments WHERE submission_id IN ({placeholders})",
            conn,
            params=id_batch,
        )
    )
comments = pd.concat(comment_batches, ignore_index=True)

print(len(comments))

2118317


In [46]:
# Save the selected comments as a semicolon-separated file, without the DataFrame index
comments.to_csv(
    path_or_buf='./data/comments.csv',
    sep=';',
    index=False,
)

In [2]:
# Reload the semicolon-separated comments file from disk
comments = pd.read_csv(filepath_or_buffer='./data/comments.csv', sep=';')

In [3]:
# Show the first five rows as a quick sanity check
comments.head(n=5)

Unnamed: 0,comment_id,submission_id,body,author,score,created
0,e0r6q9y,7llz2i,ADDITIONALLY-- thank you so much for caring en...,e-lutris,1,1529119335
1,e0enlht,7llz2i,I am watching Shape of Water. It says some st...,figshooting,1,1528587739
2,dz9lw9e,7llz2i,Extremely underwhelmed by the film.\n\nIt just...,Harry101UK,1,1526784495
3,dz9llo2,7llz2i,"""We need a quirky 'mute' girl. We need a gay g...",Harry101UK,1,1526784107
4,dwov5pc,7llz2i,But she wasn't either. She was part fish and ...,porkpie1028,1,1522697652


## Tokenize comments
- Convert comments to lowercase, replace accented letters
- Split comments into sentences and make sure all sentences are unique
- Then split sentences into tokens using TreebankWordTokenizer, removing tokens that are only punctuation
- Finally, save the sentences (one sentence per line), which is the input format required for Gensim word2vec

In [5]:
# Distinct sentences, each stored as its tokens joined by single spaces so that
# duplicates collapse in the set. A later cell writes these out one per line.
unique_sentences = set()
# One flat token list per comment, in the same order as comments['body']
tokenized_comments = []

for comment in tqdm(comments['body'].tolist()):
    # str() keeps the loop from failing on non-string bodies
    # (presumably missing values that pandas loaded as NaN -- TODO confirm)
    comment = str(comment)
    comment = comment.lower()
    comment = utils.strip_accents(comment)

    tokenized_comment = []

    for sent in sent_tokenize(comment):
        sent_tokens = utils.tokenize_sentence(sent.strip())

        # Collect the sentence; this was commented out before, which left
        # unique_sentences empty on a fresh run even though later cells
        # print its size and write it to disk.
        unique_sentences.add(" ".join(sent_tokens))

        tokenized_comment.extend(sent_tokens)

    tokenized_comments.append(tokenized_comment)

100%|██████████████████████████████████████████████████████████████████████| 2118317/2118317 [10:11<00:00, 3462.95it/s]


In [59]:
print(len(unique_sentences))

# Peek at (up to) five sentences; set iteration order is arbitrary.
# enumerate keeps the counter in a loop-local name instead of reassigning `x`,
# which an earlier cell uses for the quoted submission-id list of the SQL query.
for shown_count, sentence in enumerate(unique_sentences):
    if shown_count == 5:
        break
    print(sentence)

5404427



In [60]:
# Write one sentence per line (the input layout used for Gensim word2vec).
# The encoding is pinned to UTF-8 so the file does not depend on the platform's
# locale encoding, which can raise UnicodeEncodeError on characters it cannot represent.
with open('./data/tokens.txt', 'w', encoding='utf-8') as f:
    for sent in tqdm(unique_sentences):
        f.write(sent+"\n")

100%|████████████████████████████████████████████████████████████████████| 5404427/5404427 [00:09<00:00, 569320.71it/s]


In [6]:
# Serialize the per-comment token lists to disk with pickle
with open('./data/tokenized_comments.p', mode='wb') as f:
    pickle.dump(tokenized_comments, f)