In [1]:
import os
import re
import nltk
import spacy
import random
import neuralcoref
import numpy as np
import pandas as pd
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [2]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/johnchen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Preparation

In [3]:
wnl = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [4]:
nlp = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7fedc1c43250>

# The SMMRY Algorithm

In [5]:
transition_phrases = ['thus', 'for example', 'for instance', 'namely', 'to illustrate',
                      'in other words', 'in particular', 'specifically', 'such as',
                      'on the contrary', 'contrarily', 'notwithstanding', 'but', 'however',
                      'nevertheless', 'in spite of', 'in contrast', 'yet', 'on one hand',
                      'on the other hand', 'rather', 'or', 'nor', 'conversely', 'at the same time',
                      'while this may be true', 'and', 'in addition to', 'furthermore',
                      'moreover', 'besides', 'than', 'too', 'also', 'both-and', 'another',
                      'equally important', 'second', 'etc.', 'again', 'further', 'last',
                      'finally', 'not only-but also', 'as well as', 'in the second place',
                      'next', 'likewise', 'similarly', 'in fact', 'as a result', 'consequently',
                      'in the same way', 'for example', 'for instance', 'however', 'thus',
                      'therefore', 'otherwise', 'after that', 'afterward', 'then', 'next',
                      'last', 'at last', 'at length', 'at first', 'formerly', 'another', 'finally',
                      'meanwhile', 'at the same time', 'afterwards', 'subsequently',
                      'in the meantime', 'eventually', 'concurrently', 'simultaneously', 'although',
                      'at least', 'still', 'even though', 'granted that', 'while it may be true',
                      'in spite of', 'of course', 'similarly', 'likewise', 'in like fashion',
                      'in like manner', 'analogous to', 'above all', 'indeed', 'of course',
                      'certainly', 'surely', 'in fact', 'really', 'in truth', 'again', 'besides',
                      'also', 'furthermore', 'in addition', 'specifically', 'especially',
                      'in particular', 'to explain', 'to list', 'to enumerate', 'in detail',
                      'namely', 'including', 'for example', 'for instance', 'to illustrate',
                      'thus', 'in other words', 'as an illustration', 'in particular', 'so that',
                      'with the result that', 'consequently', 'hence', 'accordingly', 'for this reason',
                      'therefore', 'because', 'due to', 'as a result', 'in other words', 'then',
                      'therefore', 'finally', 'consequently', 'thus', 'in conclusion', 'as a result',
                      'accordingly', 'for this purpose', 'to this end', 'with this in mind',
                      'with this purpose in mind', 'therefore']

In [6]:
def transition_start(first_sent, dialog_turn):
    if dialog_turn == 1:
        for phrase in transition_phrases:
            if first_sent.lower().startswith(phrase):
                return True
        return False
    else:
        return False

In [7]:
def smmry(text, doc, sent_count, dialog_turn):

    # some preprocessing to omit text within brackets and replace u with you. 
    
    # text = re.sub("[\(\[].*?[\)\]]", "", text)
    # text = text.replace(' u ', ' you ')

    formatted_text = re.sub('[^a-zA-Z]', ' ', text)
    formatted_text = re.sub(r'\s+', ' ', formatted_text)

    # doc = nlp(text)

    fdist = {}
    word_arr = nltk.word_tokenize(formatted_text.lower())

    # preparing a frequency dictionary without considering stop words
    
    for word in word_arr:
        if not word in stop_words:
            word = wnl.lemmatize(word)
            if word not in fdist.keys():
                    fdist[word] = 1
            else:
                    fdist[word] += 1

    sent_arr = nltk.sent_tokenize(text)
    sent_score_arr = []
    summary_arr = []

    sent_arr_coref_resolved = nltk.sent_tokenize(doc._.coref_resolved)

    # compute scores for each sentence

    for sent in sent_arr:
        score = 0
        token_arr = nltk.word_tokenize(sent.lower())
        for word in token_arr:
            word = wnl.lemmatize(word)
            if word in fdist.keys():
                score += fdist[word]

        sent_score_arr.append(score/len(token_arr))

    sent_score_arr = np.array(sent_score_arr)

    all_ind_arr = sent_score_arr.argsort()[-len(sent_score_arr):][::-1]

    ind_arr_unsorted = sent_score_arr.argsort()[-sent_count:][::-1]

    ind_arr = np.sort(ind_arr_unsorted) 

    summary = ''
    changed_first = False

    if len(ind_arr) > 0:

        try:

            ind = ind_arr[0]
            first_sent = sent_arr[ind]

            while (first_sent != sent_arr_coref_resolved[ind] or transition_start(first_sent, dialog_turn)):
                changed_first = True
                for index in all_ind_arr:
                    if index < ind:
                        ind = index
                        break
                first_sent = sent_arr[ind]
                if ind == 0:
                    break
            summary = summary + first_sent + ' '     
            
            if (changed_first):
                first_ind = ind
                sent_score_modified = sent_score_arr[first_ind+1:]
                ind_arr_unsorted = sent_score_modified.argsort()[-(sent_count-1):][::-1]
                ind_arr_next = np.sort(ind_arr_unsorted) 
                
                for i in range(0, len(ind_arr_next)):
                    ind = (first_ind+1) + ind_arr_next[i]
                    if i == len(ind_arr_next) - 1:
                        summary = summary + sent_arr[ind]
                    else:
                        summary = summary + sent_arr[ind] + ' '
            
            else:
                for i in range(1, len(ind_arr)):
                    ind = ind_arr[i]
                    if i == len(ind_arr) - 1:
                        summary = summary + sent_arr[ind]
                    else:
                        summary = summary + sent_arr[ind] + ' '

            return summary

        except Exception as e:

            print("EXCEPTION occured")
            return text

    else:
        print(text)
        print(sent_arr)
        print("EXCEPTION occured: length of sentence array is not > 0")
        return text

# Data Cleaning Functions

In [8]:
# aux_verbs = ['be', 'can', 'could', 'dare', 'do', 'have', 'may', 'might', 'must',
#              'need', 'ought', 'shall', 'should', 'will', 'would']
# wh_words = ['what', 'when', 'where', 'which', 'who', 'whom', 'whose', 'why', 'how']
# q_words = aux_verbs + wh_words

In [9]:
def preprocess_raw(text):
    # Check if text is a str
    if not isinstance(text, str):
        return None

    # Replace HTML escape chars
    text = text.replace('&gt;', '>')
    text = text.replace('&lt;', '<')
    text = text.replace('&amp;', '&')
    text = text.replace('#x200B;', ' ')
    text = text.replace('nbsp;', ' ')

    # Remove brackets
    b_pattern = re.compile(r'(\([^\(\)]*\))|(\[[^\[\]]*\])')
    while b_pattern.search(text):
        text = re.sub(r'(\([^\(\)]*\))|(\[[^\[\]]*\])', '', text)

    # Remove redundant spaces (including breaklines)
    text = ' '.join(text.split())

    # Check if text is empty
    if not text:
        return None

    text_lower = text.lower()

    # Check if text is [deleted] or [removed]
    if text_lower == '[deleted]' or text_lower == '[removed]':
        return None

    # Check if text contains URL
    url_pattern = re.compile(r'[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
    if url_pattern.search(text_lower):
        return None

    # Check if text contains 'r/<subreddit>' or 'u/<username>'
    r_pattern = re.compile(r'(^| )\/?r\/[^ ]*')
    if r_pattern.search(text_lower):
        return None
    u_pattern = re.compile(r'(^| )\/?u\/[^ ]*')
    if u_pattern.search(text_lower):
        return None

    # Check if text contains 'reddit'
    if 'reddit' in text_lower:
        return None

    # Check the percentage of alphabetical letters
    num_alphas = 0
    for ch in text:
        if ch.isalpha():
            num_alphas += 1
    if num_alphas / len(text) < 0.7:
        return None

    doc = nlp(text)

    # Check the number of tokens
    if len(doc) < 2:
        return None

    return {'text': text, 'doc': doc}

In [10]:
def preprocess_summary(text):
    # Check if text is a str
    if not isinstance(text, str):
        return None

    # Remove redundant spaces (including breaklines)
    text = ' '.join(text.split())

    # Check if text is empty
    if not text:
        return None

    # Check the percentage of alphabetical letters
    num_alphas = 0
    for ch in text:
        if ch.isalpha():
            num_alphas += 1
    if num_alphas / len(text) < 0.7:
        return None

    doc = nlp(text)

    # Check the number of tokens
    if len(doc) < 2:
        return None

    return {'text': text, 'doc': doc}

In [11]:
def extract_root(text, sent):
#     # Check if the last character is a question mark
#     if sent[-1].text == '?':
#         return None

    if sent.root.pos_ == 'VERB':
#         # Check the first token
#         if sent[0].lemma_.lower() in q_words:
#             return None
        return sent.root.lemma_

    return None

In [12]:
def summarize(preprocessed_text, dialog_turn):
    if preprocessed_text is None:
        return None

    text = preprocessed_text['text']
    doc = preprocessed_text['doc']

    summarized = 0
    sents = [sent for sent in doc.sents]
    if len(sents) > 1:
        summarized = 1
        summary = smmry(text, doc, 1, dialog_turn)
        preprocessed_summary = preprocess_summary(summary)
        if preprocessed_summary is None:
            return None
        summarized_text = preprocessed_summary['text']
        summarized_doc = preprocessed_summary['doc']
        summarized_sents = [sent for sent in summarized_doc.sents]
        if len(summarized_sents) != 1:
            return None
    elif len(sents) == 1:
        summarized_text = text
        summarized_doc = doc
        summarized_sents = sents
    else:
        return None

    if dialog_turn > 1:
        return {'text': summarized_text, 'summarized': summarized, 'length': len(summarized_sents[0])}

    root = extract_root(summarized_text, summarized_sents[0])
    if root is not None:
        return {'text': summarized_text, 'summarized': summarized, 'root': root, 'length': len(summarized_sents[0])}
    else:
        return None

# Filter Submissions

## Batch Process

In [14]:
date_suffices = ['20170101_20171231', '20180101_20181231', '20190101_20191231',
                 '20200101_20201231', '20210101_20211231']

for date_suffix in date_suffices:
    submission_df = pd.read_csv('../pushshift/submission/askReddit_submission_{}.csv'.format(date_suffix))
    print(submission_df.shape)
    submission_filtered_cols = ['id', 'summarized', 'from', 'text', 'root','score', 'length']
    submission_filtered_dict = {col: [] for col in submission_filtered_cols}
    for i in tqdm(range(submission_df.shape[0])):
        submission_id = submission_df.iloc[i]['id']
        submission_score = submission_df.iloc[i]['score']
        
        title = submission_df.iloc[i]['title']
        preprocessed_title = preprocess_raw(title)
        summarized_title = summarize(preprocessed_title, dialog_turn = 1)
        if summarized_title is not None:
            submission_filtered_dict['id'].append(submission_id)
            submission_filtered_dict['summarized'].append(summarized_title['summarized'])
            submission_filtered_dict['from'].append('title')
            submission_filtered_dict['text'].append(summarized_title['text'])
            submission_filtered_dict['root'].append(summarized_title['root'])
            submission_filtered_dict['score'].append(submission_score)
            submission_filtered_dict['length'].append(summarized_title['length'])
        else:
            selftext = submission_df.iloc[i]['selftext']
            preprocessed_selftext = preprocess_raw(selftext)
            summarized_selftext = summarize(preprocessed_selftext, dialog_turn = 1)
            if summarized_selftext is not None:
                submission_filtered_dict['id'].append(submission_id)
                submission_filtered_dict['summarized'].append(summarized_selftext['summarized'])
                submission_filtered_dict['from'].append('selftext')
                submission_filtered_dict['text'].append(summarized_selftext['text'])
                submission_filtered_dict['root'].append(summarized_selftext['root'])
                submission_filtered_dict['score'].append(submission_score)
                submission_filtered_dict['length'].append(summarized_selftext['length'])
    submission_filtered_df = pd.DataFrame(submission_filtered_dict)
    print(submission_filtered_df.shape)
    submission_filtered_df.to_csv('../pushshift/filtered_q/submission/casual_conv_submissions_{}.csv'.format(date_suffix), index = False)

(26106, 7)


100%|█████████████████████████████████████| 26106/26106 [04:48<00:00, 90.62it/s]


(19990, 7)
(24595, 7)


100%|█████████████████████████████████████| 24595/24595 [04:36<00:00, 88.86it/s]


(18486, 7)
(23326, 7)


100%|█████████████████████████████████████| 23326/23326 [04:24<00:00, 88.03it/s]


(17392, 7)
(28113, 7)


100%|█████████████████████████████████████| 28113/28113 [05:19<00:00, 87.95it/s]


(21241, 7)
(32344, 7)


100%|█████████████████████████████████████| 32344/32344 [06:12<00:00, 86.86it/s]


(25242, 7)


# Filter Comments

## Batch Process

In [14]:
date_suffices = ['20170101_20171231']
for date_suffix in date_suffices:
    comment_df = pd.read_csv('../pushshift/comment/askReddit_comment_{}.csv'.format(date_suffix))
#     print(comment_df.shape)
    submission_filtered_df = pd.read_csv('../pushshift/submission/askReddit_submission_{}.csv'.format(date_suffix))
    submission_filtered_ids = submission_filtered_df['id'].tolist()
    submission_filtered_ids = ['t3_' + x for x in submission_filtered_ids]
    
    coment_coment_df=comment_df[comment_df['parent_id'].str.startswith('t1',na=False)]
    coment_coment_ids=coment_coment_df['parent_id'].apply(lambda x:x.split('_')[1]).tolist()
    coment_coment_filtered_ids=[x for x in coment_coment_ids]
    coment_coment_filtered_ids = list(dict.fromkeys(coment_coment_filtered_ids))

    total_ids=submission_filtered_ids+coment_coment_filtered_ids
    total_ids = list(dict.fromkeys(total_ids))   
    comment_df=comment_df[comment_df['id'].isin(total_ids)]
    
    
    comment_filtered_cols = ['id','link_id','parent_id', 'summarized', 'text', 'score','length']
    comment_filtered_dict = {col: [] for col in comment_filtered_cols}
    for i in tqdm(range(comment_df.shape[0])):
        comment_id = comment_df.iloc[i]['id']
        link_id = comment_df.iloc[i]['link_id']
        parent_id = comment_df.iloc[i]['parent_id']
        score=comment_df.iloc[i]['score']
        body = comment_df.iloc[i]['body']
        preprocessed_body = preprocess_raw(body)
        summarized_body = summarize(preprocessed_body, dialog_turn = 2)
        if summarized_body is not None:
            comment_filtered_dict['id'].append(comment_id)
            comment_filtered_dict['link_id'].append(link_id)
            comment_filtered_dict['parent_id'].append(parent_id)
            comment_filtered_dict['summarized'].append(summarized_body['summarized'])
            comment_filtered_dict['text'].append(summarized_body['text'])
            comment_filtered_dict['score'].append(score)
            comment_filtered_dict['length'].append(summarized_body['length'])
    comment_filtered_df = pd.DataFrame(comment_filtered_dict)
#     print(comment_filtered_df)
    comment_filtered_df.to_csv('../pushshift/filtered_q/comment/askReddit_comment_{}.csv'.format(date_suffix), index = False)    

  7%|██▊                                  | 7806/104951 [04:28<41:24, 39.10it/s]

EXCEPTION occured


 24%|████████                          | 24704/104951 [14:25<1:03:00, 21.23it/s]

EXCEPTION occured


 29%|█████████▉                        | 30592/104951 [19:41<1:10:50, 17.50it/s]

EXCEPTION occured


 35%|████████████▋                       | 36831/104951 [24:21<40:13, 28.23it/s]

EXCEPTION occured


 43%|███████████████▎                    | 44734/104951 [28:55<32:29, 30.90it/s]

EXCEPTION occured


 46%|████████████████▌                   | 48446/104951 [30:53<24:19, 38.71it/s]

EXCEPTION occured


 46%|████████████████▌                   | 48455/104951 [30:53<27:25, 34.32it/s]

EXCEPTION occured


 53%|███████████████████▏                | 55971/104951 [34:24<17:21, 47.02it/s]

EXCEPTION occured


 55%|███████████████████▊                | 57627/104951 [35:05<20:20, 38.77it/s]

EXCEPTION occured


 62%|██████████████████████▏             | 64851/104951 [38:06<22:41, 29.45it/s]

EXCEPTION occured


 71%|█████████████████████████▋          | 74716/104951 [42:36<17:22, 28.99it/s]

EXCEPTION occured


 73%|██████████████████████████▏         | 76239/104951 [43:26<17:27, 27.41it/s]

EXCEPTION occured


 74%|██████████████████████████▌         | 77545/104951 [44:13<18:00, 25.36it/s]

EXCEPTION occured


 74%|██████████████████████████▊         | 78056/104951 [44:23<07:50, 57.20it/s]

EXCEPTION occured


 86%|██████████████████████████████▉     | 90023/104951 [49:59<04:31, 54.92it/s]

EXCEPTION occured


 91%|████████████████████████████████▊   | 95744/104951 [52:17<04:14, 36.23it/s]

EXCEPTION occured


 91%|████████████████████████████████▉   | 95957/104951 [52:22<04:32, 33.03it/s]

EXCEPTION occured


 94%|█████████████████████████████████▋  | 98279/104951 [53:53<03:03, 36.38it/s]

EXCEPTION occured


100%|███████████████████████████████████| 104951/104951 [57:21<00:00, 30.50it/s]


# Finalize the Submissions and Comments

In [38]:
submission_filtered_df = pd.read_csv('../data/reddit/filtered/casual_conv_submissions_{}.csv'.format(date_suffix))

In [39]:
comment_filtered_df = pd.read_csv('../data/reddit/filtered/casual_conv_comments_{}.csv'.format(date_suffix))
comment_filtered_parent_ids = comment_filtered_df['parent_id'].tolist()
comment_filtered_parent_ids = [x[3:] for x in comment_filtered_parent_ids]

In [40]:
submission_filtered_df = submission_filtered_df[submission_filtered_df['id'].isin(comment_filtered_parent_ids)]
submission_filtered_df.shape

(7941, 6)

In [41]:
final_cols = ['sub_id', 'sub_summarized', 'sub_from', 'sub_text', 'sub_root', 'sub_length',
              'com_id', 'com_summarized', 'com_text', 'com_length']
final_dict = {col: [] for col in final_cols}

In [42]:
for i in tqdm(range(submission_filtered_df.shape[0])):
    sub_id = submission_filtered_df.iloc[i]['id']
    sub_summarized = submission_filtered_df.iloc[i]['summarized']
    sub_from = submission_filtered_df.iloc[i]['from']
    sub_text = submission_filtered_df.iloc[i]['text']
    sub_root = submission_filtered_df.iloc[i]['root']
    sub_length = submission_filtered_df.iloc[i]['length']

    comment_filtered_df_sub = comment_filtered_df[comment_filtered_df['parent_id'] == 't3_' + sub_id]
    for j in range(comment_filtered_df_sub.shape[0]):
        final_dict['sub_id'].append(sub_id)
        final_dict['sub_summarized'].append(sub_summarized)
        final_dict['sub_from'].append(sub_from)
        final_dict['sub_text'].append(sub_text)
        final_dict['sub_root'].append(sub_root)
        final_dict['sub_length'].append(sub_length)
        final_dict['com_id'].append(comment_filtered_df_sub.iloc[j]['id'])
        final_dict['com_summarized'].append(comment_filtered_df_sub.iloc[j]['summarized'])
        final_dict['com_text'].append(comment_filtered_df_sub.iloc[j]['text'])
        final_dict['com_length'].append(comment_filtered_df_sub.iloc[j]['length'])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7941/7941 [00:46<00:00, 170.12it/s]


In [43]:
final_df = pd.DataFrame(final_dict)

In [44]:
final_df

Unnamed: 0,sub_id,sub_summarized,sub_from,sub_text,sub_root,sub_length,com_id,com_summarized,com_text,com_length
0,r2v7x0,0,title,My boss told me that during the time my hair w...,make,21,hm6trwr,1,Take it as a compliment.,6
1,r2v7x0,0,title,My boss told me that during the time my hair w...,make,21,hm6tw3v,1,However working at a bar where sex-charged men...,14
2,r2v7x0,0,title,My boss told me that during the time my hair w...,make,21,hm6uegk,1,Why do people get bent out of shape about hair...,15
3,r2v7x0,0,title,My boss told me that during the time my hair w...,make,21,hm6xlf7,1,Will appearing more attractive to customers in...,16
4,r2v7x0,0,title,My boss told me that during the time my hair w...,make,21,hm6zt2c,1,are you suggesting that its a problem for peop...,28
...,...,...,...,...,...,...,...,...,...,...
53689,n8sdhl,0,selftext,I never really got to do things like this when...,get,36,gxkseja,1,Nice man,2
53690,n8sdhl,0,selftext,I never really got to do things like this when...,get,36,gxocvzh,1,"I've never lived anywhere with fireflies, and ...",18
53691,n8sbcb,0,title,After the worst 4 years of my life during coll...,move,22,gxk4z84,0,Not a bad place to live if you can solve for COL,12
53692,n8sbcb,0,title,After the worst 4 years of my life during coll...,move,22,gxk5bpg,1,Good luck on your new adventure!,7


In [45]:
final_df.to_csv('../data/reddit/matched/casual_conv_{}.csv'.format(date_suffix), index = False)

## Batch Process

In [54]:
date_suffices = ['20170101_20171231', '20180101_20181231', '20190101_20191231',
                 '20200101_20201231', '20210101_20211231']


for date_suffix in date_suffices:
    submission_filtered_df = pd.read_csv('../pushshift/filtered_q/submission/casual_conv_submissions_{}.csv'.format(date_suffix))
    comment_filtered_df = pd.read_csv('../pushshift/filtered_q/comment/casual_conv_comments_{}.csv'.format(date_suffix))
    
    final_cols = ['src_id', 'src_type', 'src_summarized', 'src_from', 'src_text', 'src_root', 'src_length',
                  'com_id', 'com_summarized', 'com_text', 'com_length','com_score']
    final_dict = {col: [] for col in final_cols}
    
    comment_filtered_parent_df=comment_filtered_df[comment_filtered_df['parent_id'].str.startswith('t3')]
    comment_filtered_parent_ids = comment_filtered_parent_df['parent_id'].apply(lambda x:x.split('_')[1]).tolist()
    
    submission_filtered_df = submission_filtered_df[submission_filtered_df['id'].isin(comment_filtered_parent_ids)]
    print(submission_filtered_df.shape)

    
    print(date_suffix,'Start assemble submission with comment')
    for i in tqdm(range(submission_filtered_df.shape[0])):       
        sub_id = submission_filtered_df.iloc[i]['id']
        sub_summarized = submission_filtered_df.iloc[i]['summarized']
        sub_from = submission_filtered_df.iloc[i]['from']
        sub_text = submission_filtered_df.iloc[i]['text']
        sub_root = submission_filtered_df.iloc[i]['root']
        sub_length = submission_filtered_df.iloc[i]['length']
        comment_filtered_df_sub = comment_filtered_df[comment_filtered_df['parent_id'] == 't3_' + sub_id]
        for j in range(comment_filtered_df_sub.shape[0]):
            final_dict['src_id'].append(sub_id)
            final_dict['src_type'].append('sub')
            final_dict['src_summarized'].append(sub_summarized)
            final_dict['src_from'].append(sub_from)
            final_dict['src_text'].append(sub_text)
            final_dict['src_root'].append(sub_root)
            final_dict['src_length'].append(sub_length)
            final_dict['com_id'].append(comment_filtered_df_sub.iloc[j]['id'])
            final_dict['com_summarized'].append(comment_filtered_df_sub.iloc[j]['summarized'])
            final_dict['com_text'].append(comment_filtered_df_sub.iloc[j]['text'])
            final_dict['com_length'].append(comment_filtered_df_sub.iloc[j]['length'])
            final_dict['com_score'].append(comment_filtered_df_sub.iloc[j]['score'])
    
    
    
    coment_coment_df=comment_filtered_df[comment_filtered_df['parent_id'].str.startswith('t1')]
    coment_coment_ids=coment_coment_df['parent_id'].apply(lambda x:x.split('_')[1]).tolist()    
    comment_have_kid_df = comment_filtered_df[comment_filtered_df['id'].isin(coment_coment_ids)]
    
    print(date_suffix,'Start assemble comment with comment')
    for k in tqdm(range(comment_have_kid_df.shape[0])):       
        src_id = comment_have_kid_df.iloc[k]['id']
        src_summarized = comment_have_kid_df.iloc[k]['summarized']
        src_text = comment_have_kid_df.iloc[k]['text']
        src_length = comment_have_kid_df.iloc[k]['length']
        
        comment_comment_df = comment_filtered_df[comment_filtered_df['parent_id'] == 't1_' + src_id]
        for m in range(comment_comment_df.shape[0]):
            final_dict['src_id'].append(src_id)
            final_dict['src_type'].append('com')
            final_dict['src_summarized'].append(src_summarized)
            final_dict['src_from'].append('comment')
            final_dict['src_text'].append(src_text)
            final_dict['src_root'].append(src_root)
            final_dict['src_length'].append(src_length)
            
            final_dict['com_id'].append(comment_comment_df.iloc[m]['id'])
            final_dict['com_summarized'].append(comment_comment_df.iloc[m]['summarized'])
            final_dict['com_text'].append(comment_comment_df.iloc[m]['text'])
            final_dict['com_length'].append(comment_comment_df.iloc[m]['length'])
            final_dict['com_score'].append(comment_comment_df.iloc[m]['score'])


    
    final_df = pd.DataFrame(final_dict)
    print("final df shape:",final_df.shape)
    final_df.to_csv('../pushshift/matched_q/casual_conv_{}.csv'.format(date_suffix), index = False)
    
   

(11, 7)
20160101_20161231 Start assemble submission with comment


100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 389.26it/s]


20160101_20161231 Start assemble comment with comment


100%|██████████████████████████████████████| 2164/2164 [00:03<00:00, 692.37it/s]

final df shape: (2186, 12)





# Sample Some Pairs

In [31]:
N = final_df.shape[0]
indices = np.sort(np.random.choice(N, 1000, replace = False))
final_df_sample = final_df.iloc[indices]
final_df_sample.to_csv('pushshift/final/casual_conv_20200101_20201231_sample.csv', index = False)

# Some Tests

In [13]:
nlp = spacy.load('en_core_web_sm')

In [23]:
doc = nlp("A silly question")
for sent in doc.sents:
    print(sent.root)
    print(sent.root.pos_)

question
NOUN


In [25]:
doc = nlp("I am happy about it.")
for sent in doc.sents:
    print(sent.root)
    print(sent.root.pos_)

am
AUX


In [26]:
doc = nlp("I went to the market today")
for sent in doc.sents:
    print(sent.root)
    print(sent.root.pos_)

went
VERB


In [27]:
doc = nlp("Just worked more than I ever have in my life")
for sent in doc.sents:
    print(sent.root)
    print(sent.root.pos_)

worked
VERB


In [29]:
doc = nlp("Happy New Year!")
for sent in doc.sents:
    print(sent.root)
    print(sent.root.pos_)

Year
PROPN


In [46]:
doc = nlp("Has she done your homework")
for sent in doc.sents:
    print(sent.root)
    print(sent.root.pos_)
    print(sent[0].lemma_)
    for token in sent:
        print(token, token.lemma_, token.pos_)

done
VERB
have
Has have AUX
she -PRON- PRON
done do VERB
your -PRON- DET
homework homework NOUN


In [35]:
doc = nlp("Did you finish your homework")
sents = [sent for sent in doc.sents]
sent = sents[0]
print(sent)
print(sent.root.pos_)
print(sent[0].lemma_)

Did you finish your homework
VERB
do


In [25]:
doc = nlp("I just gave you the paper")
sents = [sent for sent in doc.sents]
sent = sents[0]
print(len(sent), sent.root, sent.root.pos_, sent.root.lemma_)
print(sent[0].lemma_, sent[0].pos_)

6 gave VERB give
-PRON- PRON


In [13]:
r_pattern = re.compile(r'(^| )\/?r\/[^ ]*')
print(r_pattern.search('ashjs/r/haha__ ss'))

None


In [18]:
re.sub(r'(\([^\(\)]*\))|(\[[^\[\]]*\])', '', 'how about [ashs] and and [[ss]')

'how about  and and ['

In [58]:
data_suffices = [
'20170101_20171231', '20180101_20181231', 
                 '20200101_20201231', '20210101_20211231']
df_all = pd.DataFrame(columns = ["src_id", "com_score", "combined", "gpt2_score",'colbert_score'])

for date_suffix in data_suffices:
    df=pd.read_csv('../pushshift/final/askreddit_final_{}.csv'.format(date_suffix))
#     df=df[ (df['com_score']>=2) & (df['gpt2_score']>5) &(df['colbert_score']>0.99) ]
#         df=df[(df['com_score']>=2) & (df['gpt2_score']>5) &(sdf['colbert_core']>0.99)]
    df_all=df_all.append(df)
   

# df_all.to_csv('../pushshift/all_result.csv')
print(df_all.shape)
# check=df_all.sample(n=50)
# print(check.combined)

(109, 5)


In [43]:
data_suffices = [
'20170101_20171231', '20180101_20181231', 
                 '20200101_20201231', '20210101_20211231']

df_all = pd.DataFrame(columns = ["src_id", "com_score", "combined", "gpt2_score",'colbert_score'])

for date_suffix in date_suffices:
        df = pd.read_csv('../pushshift/final/askreddit_final_{}.csv'.format(date_suffix))
        df=df[(df['com_score']>=2)]
#         df=df[(df['com_score']>=2) & (df['gpt2_score']>5) &(df['colbert_score']>0.)]
        df_all=df_all.append(df)

print(df_all.shape)
# df_all.to_csv('../pushshift/score_result.csv')

# check=df_all.sample(n=50)
# check

FileNotFoundError: [Errno 2] No such file or directory: '../pushshift/final/askreddit_final_20190101_20191231.csv'