In [14]:
import sqlite3
import utils
import pandas as pd
from tqdm import tqdm
from nltk import sent_tokenize
import pickle
import regex as re

## Setup

In [2]:
# Define location and name of SQL database, create a connection object
sql_db = './data/film_discussions'
conn = sqlite3.connect(sql_db)

# Converting the submissions to manually annotate them
submissions_file = './data/submissions.csv'
annotated_submissions_file = './data/submissions_annotated.csv'

# Comments from selected submissions
comments_file = './data/comments.csv'

# Tokenized sentences for processing with word2vec
tokens_file = './data/tokens.txt'

# Tokenized comments for matching discourse atoms and concept mover's distance to comments
tokenized_comments_file = './data/tokenized_comments.p'

## Manually filter submissions
- I only select submissions with at least 25 comments (according to the submission metadata, not the actual amount of comments I have in the database.
- Based on the submission titles, I select submissions which are about a single movie and not 'meta-comments' (e.g., "why isn't there an official discussion for X?")

In [26]:
submissions = pd.read_sql('SELECT * FROM submissions', conn)

print(len(submissions))

2048


In [8]:
submissions.to_csv(submissions_file, index=False)

In [87]:
# I created a new column which is empty for official discussions (num_comments >= 25 only)
all_submissions = pd.read_csv(annotated_submissions_file, sep=';')

In [88]:
submissions = all_submissions.loc[(all_submissions['num_comments'] >= 25) & (pd.isna(all_submissions['discussion_thread']))]

In [5]:
print(len(submissions))

1156


In [7]:
submissions.head()

Unnamed: 0,Column1,submission_id,title,score,num_comments,url,created,discussion_thread
0,1653.0,7jwxnd,Official Discussion - Star Wars: Episode VIII ...,15908.0,100558.0,https://www.reddit.com/r/movies/comments/7jwxn...,1513307000.0,
1,1654.0,bh8iei,Official Discussion - Avengers: Endgame [SPOIL...,20046.0,89336.0,https://www.reddit.com/r/movies/comments/bh8ie...,1556248000.0,
2,1655.0,8f84h0,Official Discussion - Avengers: Infinity War [...,24045.0,72761.0,https://www.reddit.com/r/movies/comments/8f84h...,1524794000.0,
3,1656.0,ed3a6g,Official Discussion - Star Wars: Episode IX - ...,17351.0,52017.0,https://www.reddit.com/r/movies/comments/ed3a6...,1576811000.0,
4,1657.0,3xf9gd,Official Discussion - Star Wars: Episode VII -...,8394.0,40877.0,https://www.reddit.com/r/movies/comments/3xf9g...,1450494000.0,


In [92]:
titles = submissions.title.tolist()

to_remove = [
    "[Spoilers]",
    "[SPOILERS]",
    "[SPOILERS}",
    "(SPOILERS)",
    "Official International Release Discussion:",
    "Official International Discussion -",
    "Official International Release Discussion Thread -",
    "- Official Discussion Thread",
    "Official Discussion Thread:",
    "Discussion Thread.",
    "Official Discussion:",
    "Official Discussion -",
    "Official Discussion-",
    "Discussion Thread",
    "IMAX re-release",
    "(International Thread)",
    "(Un)Official Discussion:",
    "(US Release)",
    "(International Release)",
    "(UK Release)",
    "(US Thread)",
    "(Thread Vol. 2)",
    "Official Discussion #2 -",
    "Movie Discussion",
    "Un-Official Discussion :",
    "Psuedo-Official Discussion:",
    "(un-Official Discussion)",
    "Official Late-Comer Megathread -",
    "(Roadshow 70mm Cut)",
    "(Theatrical Cut)",
    "The Pre-Official",
    "discussion thread",
    "OFFICIAL DISCUSSION",
    "Official Discussion",
    "[Serious Replies Only]",
    "(Early Release)",
    "(Spoilers inside)",
    "(Wide Release)",
    "(Netflix Release)",
    "(theater release)",
    "(2nd Thread)",
    "SPOILERS"
]

remove_other = [
    "-Official 48 fps Discussion thread.",
    r" \.",
    r"\d+/\d+  -",
    "31 Days of Halloween -",
    r"(July|August) \d{1,2}",
    '"',
    "'",
    "Official",
    r"- \d+/\d+",
]

film_titles = []
for title in titles:
    for j in to_remove:
        title = title.replace(j, "")
        
    for j in remove_other:
        title = re.sub(j, "", title)
    title = title.strip()
        
    title = title.strip()    
    film_titles.append(title)

In [93]:
for i in film_titles:
    print(i)

Star Wars: Episode VIII – The Last Jedi
Avengers: Endgame
Avengers: Infinity War
Star Wars: Episode IX - The Rise of Skywalker
Star Wars: Episode VII - The Force Awakens
Joker
Zack Snyders Justice League
Wonder Woman 1984
Once Upon a Time in Hollywood
Spider-Man: No Way Home
Batman v. Superman: Dawn of Justice
Rogue One: A Star Wars Story
Blade Runner 2049
Us
Black Panther
Logan
Captain Marvel
A Quiet Place
Dune
Hereditary
The Suicide Squad
Spider-Man: Far From Home
Spider-man: Homecoming
IT
Captain America: Civil War
Deadpool 2
Annihilation
Get Out
The Matrix Resurrections
Justice League
Avengers: Infinity War
Wonder Woman
Midsommar
Godzilla vs. Kong
Solo: A Star Wars Story
Suicide Squad
Dunkirk
The Cloverfield Paradox
Guardians of the Galaxy Vol. 2
No Time To Die
IT Chapter Two
Arrival
Ready Player One
Interstellar
Army of the Dead
Dont Look Up
Thor: Ragnarok
Tenet
Alien: Covenant
Borat Subsequent Moviefilm
The Tomorrow War
Parasite
Ant Man and The Wasp
Black Widow
Knives Out
Spider-

In [94]:
submissions['film_title'] = film_titles

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submissions['film_title'] = film_titles


In [96]:
submissions.head(10)

Unnamed: 0,Column1,submission_id,title,score,num_comments,url,created,discussion_thread,film_title
0,1653.0,7jwxnd,Official Discussion - Star Wars: Episode VIII ...,15908.0,100558.0,https://www.reddit.com/r/movies/comments/7jwxn...,1513307000.0,,Star Wars: Episode VIII – The Last Jedi
1,1654.0,bh8iei,Official Discussion - Avengers: Endgame [SPOIL...,20046.0,89336.0,https://www.reddit.com/r/movies/comments/bh8ie...,1556248000.0,,Avengers: Endgame
2,1655.0,8f84h0,Official Discussion - Avengers: Infinity War [...,24045.0,72761.0,https://www.reddit.com/r/movies/comments/8f84h...,1524794000.0,,Avengers: Infinity War
3,1656.0,ed3a6g,Official Discussion - Star Wars: Episode IX - ...,17351.0,52017.0,https://www.reddit.com/r/movies/comments/ed3a6...,1576811000.0,,Star Wars: Episode IX - The Rise of Skywalker
4,1657.0,3xf9gd,Official Discussion - Star Wars: Episode VII -...,8394.0,40877.0,https://www.reddit.com/r/movies/comments/3xf9g...,1450494000.0,,Star Wars: Episode VII - The Force Awakens
5,1658.0,dd0ynj,Official Discussion- Joker (SPOILERS),16740.0,37715.0,https://www.reddit.com/r/movies/comments/dd0yn...,1570155000.0,,Joker
7,1660.0,m7y4se,Official Discussion - Zack Snyder's Justice Le...,11519.0,30105.0,https://www.reddit.com/r/movies/comments/m7y4s...,1616094000.0,,Zack Snyders Justice League
9,1662.0,kkbdpg,Official Discussion - Wonder Woman 1984 [SPOIL...,8085.0,25178.0,https://www.reddit.com/r/movies/comments/kkbdp...,1608952000.0,,Wonder Woman 1984
11,1664.0,chmknk,Official Discussion: Once Upon a Time in Holly...,4654.0,21634.0,https://www.reddit.com/r/movies/comments/chmkn...,1564106000.0,,Once Upon a Time in Hollywood
13,1666.0,ri7eum,Official Discussion - Spider-Man: No Way Home ...,13114.0,21125.0,https://www.reddit.com/r/movies/comments/ri7eu...,1639710000.0,,Spider-Man: No Way Home


In [103]:
df = pd.merge(all_submissions, submissions, how="outer")

In [108]:
df.head(10)

Unnamed: 0,submission_id,title,score,num_comments,url,created,discussion_thread,film_title
0,7jwxnd,Official Discussion - Star Wars: Episode VIII ...,15908.0,100558.0,https://www.reddit.com/r/movies/comments/7jwxn...,1513307000.0,,Star Wars: Episode VIII – The Last Jedi
1,bh8iei,Official Discussion - Avengers: Endgame [SPOIL...,20046.0,89336.0,https://www.reddit.com/r/movies/comments/bh8ie...,1556248000.0,,Avengers: Endgame
2,8f84h0,Official Discussion - Avengers: Infinity War [...,24045.0,72761.0,https://www.reddit.com/r/movies/comments/8f84h...,1524794000.0,,Avengers: Infinity War
3,ed3a6g,Official Discussion - Star Wars: Episode IX - ...,17351.0,52017.0,https://www.reddit.com/r/movies/comments/ed3a6...,1576811000.0,,Star Wars: Episode IX - The Rise of Skywalker
4,3xf9gd,Official Discussion - Star Wars: Episode VII -...,8394.0,40877.0,https://www.reddit.com/r/movies/comments/3xf9g...,1450494000.0,,Star Wars: Episode VII - The Force Awakens
5,dd0ynj,Official Discussion- Joker (SPOILERS),16740.0,37715.0,https://www.reddit.com/r/movies/comments/dd0yn...,1570155000.0,,Joker
6,487kb1,Official Oscar Thread 2016,3675.0,33999.0,https://www.reddit.com/r/movies/comments/487kb...,1456709000.0,Oscars,
7,m7y4se,Official Discussion - Zack Snyder's Justice Le...,11519.0,30105.0,https://www.reddit.com/r/movies/comments/m7y4s...,1616094000.0,,Zack Snyders Justice League
8,f1i94m,Official Oscars Thread 2020,7971.0,29801.0,https://www.reddit.com/r/movies/comments/f1i94...,1581296000.0,Oscars,
9,kkbdpg,Official Discussion - Wonder Woman 1984 [SPOIL...,8085.0,25178.0,https://www.reddit.com/r/movies/comments/kkbdp...,1608952000.0,,Wonder Woman 1984


In [109]:
df.to_csv(annotated_submissions_file, sep=';', index=False)

## Select matching comments

In [41]:
x = "'"+"','".join(submissions['submission_id'].tolist())+"'"

In [42]:
comments = pd.read_sql(f"SELECT * FROM comments WHERE submission_id IN ({x})", conn)

print(len(comments))

2118317


In [46]:
comments.to_csv(comments_file, sep=';', index=False)

In [2]:
comments = pd.read_csv(comments_file, sep=';')

In [3]:
comments.head()

Unnamed: 0,comment_id,submission_id,body,author,score,created
0,e0r6q9y,7llz2i,ADDITIONALLY-- thank you so much for caring en...,e-lutris,1,1529119335
1,e0enlht,7llz2i,I am watching Shape of Water. It says some st...,figshooting,1,1528587739
2,dz9lw9e,7llz2i,Extremely underwhelmed by the film.\n\nIt just...,Harry101UK,1,1526784495
3,dz9llo2,7llz2i,"""We need a quirky 'mute' girl. We need a gay g...",Harry101UK,1,1526784107
4,dwov5pc,7llz2i,But she wasn't either. She was part fish and ...,porkpie1028,1,1522697652


## Tokenize comments
- Convert comments to lowercase, replace accented letters
- Split comments into sentences and make sure all sentences are unique
- Then split sentences into tokens using TreebankWordTokenizer, removing tokens that are only punctuation
- Finally save the sentences (one sentence per line) which is the input required for Gensim word2vec

In [5]:
unique_sentences = set()
tokenized_comments = []

for comment in tqdm(comments['body'].tolist()):
    comment = str(comment)
    comment = comment.lower()
    comment = utils.strip_accents(comment)
    
    tokenized_comment = []
    
    for sent in sent_tokenize(comment):
        sent = sent.strip()
        sent = utils.tokenize_sentence(sent)
        #x = " ".join(sent)
        #unique_sentences.add(x)
        
        tokenized_comment.extend(sent)
        
    tokenized_comments.append(tokenized_comment)

100%|██████████████████████████████████████████████████████████████████████| 2118317/2118317 [10:11<00:00, 3462.95it/s]


In [59]:
x=0
print(len(unique_sentences))

for i in unique_sentences:
    if x == 5:
        break
    print(i)
    x += 1

5404427



In [60]:
with open(tokens_file, 'w') as f:
    for sent in tqdm(unique_sentences):
        f.write(sent+"\n")

100%|████████████████████████████████████████████████████████████████████| 5404427/5404427 [00:09<00:00, 569320.71it/s]


In [6]:
with open(tokenized_comments_file, 'wb') as f:
    pickle.dump(tokenized_comments, f)

In [4]:
print(utils.interact_with_db(conn, "SELECT COUNT(DISTINCT author) FROM comments", "cur.fetchone()"))

(352990,)
