In [1]:
import sqlite3
import utils
import pandas as pd
from tqdm import tqdm
from nltk import sent_tokenize
import pickle
import regex as re

## Setup

In [2]:
# Define location and name of SQL database, create a connection object
sql_db = './data/film_discussions'
conn = sqlite3.connect(sql_db)

# Converting the submissions to manually annotate them
submissions_file = './data/submissions.csv'
annotated_submissions_file = './data/submissions_annotated.csv'

# Comments from selected submissions
comments_file = './data/comments.csv'

# Tokenized sentences for processing with word2vec
tokens_file = './data/tokens.txt'

# Tokenized comments for matching discourse atoms and concept mover's distance to comments
tokenized_comments_file = './data/tokenized_comments.p'

## Manually filter submissions
- I only select submissions with at least 25 comments (according to the submission metadata, not the actual amount of comments I have in the database.
- Based on the submission titles, I select submissions which are about a single movie and not 'meta-comments' (e.g., "why isn't there an official discussion for X?")

In [26]:
submissions = pd.read_sql('SELECT * FROM submissions', conn)

print(len(submissions))

2048


In [8]:
#submissions.to_csv(submissions_file, index=False)

In [14]:
# I created a new column which is empty for official discussions (num_comments >= 25 only)
all_submissions = pd.read_csv(annotated_submissions_file, sep=';')
all_submissions.dropna(subset=['created'], inplace=True)

# if editing film titles, also do
#all_submissions.drop(columns="film_title", inplace=True)

In [15]:
all_submissions = all_submissions.astype({'score': 'int', 'num_comments': 'int', 'created':'int'})
all_submissions.loc[:,'date'] = pd.to_datetime(all_submissions.loc[:,'created'], unit='s')

In [16]:
all_submissions.head()

Unnamed: 0,submission_id,title,score,num_comments,url,created,discussion_thread,date
0,7jwxnd,Official Discussion - Star Wars: Episode VIII ...,15908,100558,https://www.reddit.com/r/movies/comments/7jwxn...,1513306809,,2017-12-15 03:00:09
1,bh8iei,Official Discussion - Avengers: Endgame [SPOIL...,20046,89336,https://www.reddit.com/r/movies/comments/bh8ie...,1556247619,,2019-04-26 03:00:19
2,8f84h0,Official Discussion - Avengers: Infinity War [...,24045,72761,https://www.reddit.com/r/movies/comments/8f84h...,1524794408,,2018-04-27 02:00:08
3,ed3a6g,Official Discussion - Star Wars: Episode IX - ...,17351,52017,https://www.reddit.com/r/movies/comments/ed3a6...,1576810828,,2019-12-20 03:00:28
4,3xf9gd,Official Discussion - Star Wars: Episode VII -...,8394,40877,https://www.reddit.com/r/movies/comments/3xf9g...,1450493995,,2015-12-19 02:59:55


In [17]:
submissions = all_submissions.loc[(all_submissions['num_comments'] >= 25) & (pd.isna(all_submissions['discussion_thread']))]

In [18]:
print(len(submissions))

1149


In [19]:
titles = submissions.title.tolist()

to_remove = [
    "[Spoilers]",
    "[SPOILERS]",
    "[SPOILERS}",
    "(SPOILERS)",
    "[SPOLERS]",
    "[spoilers]",
    
    "(un-Official Discussion)",
    "Un-Official Discussion :",
    "Un-Official Discussion:",
    "Psuedo-Official Discussion:",
    
    "discussion (Spoilers inside)",
    "Official International Release Discussion:",
    "Official International Discussion -",
    "Official International Release Discussion Thread -",
    
    "- Official Discussion Thread",
    "Official Discussion Thread:",
    
    "Discussion Thread.",
    "Official Discussion:",
    "Official Discussion -",
    "Official Discussion-",
    "Discussion Thread",
    
    "IMAX re-release",
    "(International Thread)",
    ", Netflix",
    "(UK release)",
    "(US Release)",
    "(International Release)",
    "(UK Release)",
    "(US Thread)",
    "(Thread Vol. 2)",
    "Official Discussion #2 -",
    "Movie Discussion",
    
    "Small Axe:",
    "4K Restoration",
    "The Saturday Official",
    "Official Late-Comer Megathread -",
    "(Roadshow 70mm Cut)",
    "(Theatrical Cut)",
    "The Pre-Official",
    "discussion thread",
    "OFFICIAL DISCUSSION",
    "Official Discussion",
    "[Serious Replies Only]",
    "(Early Release)",
    "(Spoilers inside)",
    "(Wide Release)",
    "(Netflix Release)",
    "(theater release)",
    "(2nd Thread)",
    "SPOILERS"
]

remove_other = [
    "-Official 48 fps Discussion thread.",
    r" \.",
    r"\d+/\d+  -",
    "31 Days of Halloween -",
    r"(July|August) \d{1,2}",
    '"',
    "'",
    "Official",
    r"- \d+/\d+",
    "\(Un\)"
]

replace_other = [
    # typos
    ["Godzilla: King of the Monstars", "Godzilla: King of the Monsters"],
    ["Thor: Rangarok", "Thor: Ragnarok"],
    # for tmdb matching
    ["The Hunger Games: Mockingjay - Part II", "The Hunger Games: Mockingjay - Part 2"],
    ["The Hunger Games: Mockingjay Part 2", "The Hunger Games: Mockingjay - Part 2"],
    ["The Hunger Games: Mockingjay Part 1", "The Hunger Games: Mockingjay - Part 1"],
    ["The Edge of Tomorrow", "Edge of Tomorrow"],
    ["T2 Trainspotting: Judgement Day", "T2 Trainspotting"],
    ["Jack Reacher: Never Stop Never Stopping", "Jack Reacher: Never Go Back"],
    ["The Girl in the Spiders Web: A New Dragon Tattoo Story", "The Girl in the Spiders Web"],
    ["T2 Trainspotting: Judgement Day", "T2 Trainspotting"],
    ["T2 Trainspotting: Battle Across Time", "T2 Trainspotting"],
    ["John Wick 2", "John Wick: Chapter 2"],
    ["Oceans 8", "Ocean's Eight"],
    ["Your Name", "君の名は。"],
    ["Sailor Moon Eternal", "劇場版 美少女戦士セーラームーンEternal 前編"],
    ["The Christmas Chronicles 2", "The Christmas Chronicles: Part Two"],
    ["The Croods 2", "The Croods: A New Age"],
    ["El Camino", "El Camino: A Breaking Bad Movie"]
    
]

film_titles = []
for title in titles:
    for j in to_remove:
        title = title.replace(j, "")
        
    for j in remove_other:
        title = re.sub(j, "", title)
        
    for i, j in replace_other:
        if i == title.strip():
            title = title.replace(i, j)
        
    title = title.strip()
        
    title = title.strip()    
    film_titles.append(title)

In [20]:
for i in film_titles:
    if re.search('君の名は。', i):
        print(i)

君の名は。


In [21]:
submissions['film_title'] = film_titles

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submissions['film_title'] = film_titles


In [22]:
submissions.loc[submissions.film_title.str.contains("Camino")]

Unnamed: 0,submission_id,title,score,num_comments,url,created,discussion_thread,date,film_title
78,dfz6t2,Official Discussion - El Camino: A Breaking Ba...,3465,6428,https://www.reddit.com/r/movies/comments/dfz6t...,1570784418,,2019-10-11 09:00:18,El Camino: A Breaking Bad Movie
1240,dgca3v,Official Discussion - El Camino (SPOILERS),15,28,https://www.reddit.com/r/movies/comments/dgca3...,1570783603,,2019-10-11 08:46:43,El Camino: A Breaking Bad Movie


In [23]:
df = pd.merge(all_submissions, submissions, how="outer")

In [24]:
df.head(10)

Unnamed: 0,submission_id,title,score,num_comments,url,created,discussion_thread,date,film_title
0,7jwxnd,Official Discussion - Star Wars: Episode VIII ...,15908,100558,https://www.reddit.com/r/movies/comments/7jwxn...,1513306809,,2017-12-15 03:00:09,Star Wars: Episode VIII – The Last Jedi
1,bh8iei,Official Discussion - Avengers: Endgame [SPOIL...,20046,89336,https://www.reddit.com/r/movies/comments/bh8ie...,1556247619,,2019-04-26 03:00:19,Avengers: Endgame
2,8f84h0,Official Discussion - Avengers: Infinity War [...,24045,72761,https://www.reddit.com/r/movies/comments/8f84h...,1524794408,,2018-04-27 02:00:08,Avengers: Infinity War
3,ed3a6g,Official Discussion - Star Wars: Episode IX - ...,17351,52017,https://www.reddit.com/r/movies/comments/ed3a6...,1576810828,,2019-12-20 03:00:28,Star Wars: Episode IX - The Rise of Skywalker
4,3xf9gd,Official Discussion - Star Wars: Episode VII -...,8394,40877,https://www.reddit.com/r/movies/comments/3xf9g...,1450493995,,2015-12-19 02:59:55,Star Wars: Episode VII - The Force Awakens
5,dd0ynj,Official Discussion- Joker (SPOILERS),16740,37715,https://www.reddit.com/r/movies/comments/dd0yn...,1570154814,,2019-10-04 02:06:54,Joker
6,487kb1,Official Oscar Thread 2016,3675,33999,https://www.reddit.com/r/movies/comments/487kb...,1456708502,Oscars,2016-02-29 01:15:02,
7,m7y4se,Official Discussion - Zack Snyder's Justice Le...,11519,30105,https://www.reddit.com/r/movies/comments/m7y4s...,1616094066,,2021-03-18 19:01:06,Zack Snyders Justice League
8,f1i94m,Official Oscars Thread 2020,7971,29801,https://www.reddit.com/r/movies/comments/f1i94...,1581295598,Oscars,2020-02-10 00:46:38,
9,kkbdpg,Official Discussion - Wonder Woman 1984 [SPOIL...,8085,25178,https://www.reddit.com/r/movies/comments/kkbdp...,1608952105,,2020-12-26 03:08:25,Wonder Woman 1984


In [26]:
df.to_csv(annotated_submissions_file, sep=';', index=False)

## Select matching comments

In [41]:
x = "'"+"','".join(submissions['submission_id'].tolist())+"'"

In [42]:
comments = pd.read_sql(f"SELECT * FROM comments WHERE submission_id IN ({x})", conn)

print(len(comments))

2118317


In [46]:
comments.to_csv(comments_file, sep=';', index=False)

In [2]:
comments = pd.read_csv(comments_file, sep=';')

In [3]:
comments.head()

Unnamed: 0,comment_id,submission_id,body,author,score,created
0,e0r6q9y,7llz2i,ADDITIONALLY-- thank you so much for caring en...,e-lutris,1,1529119335
1,e0enlht,7llz2i,I am watching Shape of Water. It says some st...,figshooting,1,1528587739
2,dz9lw9e,7llz2i,Extremely underwhelmed by the film.\n\nIt just...,Harry101UK,1,1526784495
3,dz9llo2,7llz2i,"""We need a quirky 'mute' girl. We need a gay g...",Harry101UK,1,1526784107
4,dwov5pc,7llz2i,But she wasn't either. She was part fish and ...,porkpie1028,1,1522697652


## Tokenize comments
- Convert comments to lowercase, replace accented letters
- Split comments into sentences and make sure all sentences are unique
- Then split sentences into tokens using TreebankWordTokenizer, removing tokens that are only punctuation
- Finally save the sentences (one sentence per line) which is the input required for Gensim word2vec

In [5]:
unique_sentences = set()
tokenized_comments = []

for comment in tqdm(comments['body'].tolist()):
    comment = str(comment)
    comment = comment.lower()
    comment = utils.strip_accents(comment)
    
    tokenized_comment = []
    
    for sent in sent_tokenize(comment):
        sent = sent.strip()
        sent = utils.tokenize_sentence(sent)
        #x = " ".join(sent)
        #unique_sentences.add(x)
        
        tokenized_comment.extend(sent)
        
    tokenized_comments.append(tokenized_comment)

100%|██████████████████████████████████████████████████████████████████████| 2118317/2118317 [10:11<00:00, 3462.95it/s]


In [59]:
x=0
print(len(unique_sentences))

for i in unique_sentences:
    if x == 5:
        break
    print(i)
    x += 1

5404427



In [60]:
with open(tokens_file, 'w') as f:
    for sent in tqdm(unique_sentences):
        f.write(sent+"\n")

100%|████████████████████████████████████████████████████████████████████| 5404427/5404427 [00:09<00:00, 569320.71it/s]


In [6]:
with open(tokenized_comments_file, 'wb') as f:
    pickle.dump(tokenized_comments, f)

In [4]:
print(utils.interact_with_db(conn, "SELECT COUNT(DISTINCT author) FROM comments", "cur.fetchone()"))

(352990,)
