In [None]:
import os
import praw
import pandas as pd
import re
import datetime
from tqdm import tqdm

### general functions

In [None]:
def extract_between_brackets(text):
    """Return the contents of every ``[...]`` group found in ``text``.

    Matching is non-greedy, so each opening bracket is paired with the
    nearest closing bracket. The brackets themselves are not included.

    Args:
        text: String to scan.

    Returns:
        List of the bracketed substrings, in order of appearance
        (empty list when there are none).
    """
    return [found.group(1) for found in re.finditer(r'\[(.*?)\]', text)]


def remove_text_between_brackets(text):
    """Strip every ``[...]`` group (brackets included) from ``text``.

    Uses the same non-greedy bracket matching as ``extract_between_brackets``;
    surrounding whitespace is left untouched.

    Args:
        text: String to clean.

    Returns:
        ``text`` with all bracketed groups deleted.
    """
    bracketed_group = re.compile(r'\[.*?\]')
    return bracketed_group.sub('', text)


def num_words(text):
    """Count the whitespace-separated words in ``text``.

    Args:
        text: Usually a string. Values without a ``split`` method
            (``None``, ``float('nan')`` from pandas, ...) are accepted.

    Returns:
        Number of words, or 0 when ``text`` cannot be split.
    """
    try:
        return len(text.split())
    except AttributeError:
        # Only "this is not a string" is treated as zero words. A bare
        # ``except`` would also swallow KeyboardInterrupt / SystemExit.
        return 0


def convert_utc_to_datetime(utc_time):
    """Format a Unix timestamp as a UTC ``dd/mm/yy HH:MM:SS`` string.

    Args:
        utc_time: Seconds since the Unix epoch (int or float).

    Returns:
        The timestamp rendered in UTC, e.g. ``'01/01/70 00:00:00'`` for 0.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; an aware
    # datetime pinned to UTC produces the same formatted string.
    moment = datetime.datetime.fromtimestamp(utc_time, tz=datetime.timezone.utc)
    return moment.strftime('%d/%m/%y %H:%M:%S')


def remove_text_after_x(text, x):
    """Cut ``text`` at the first case-insensitive occurrence of ``x``.

    The marker and everything after it, including any following lines, are
    removed. The earlier ``x + '.*'`` regex stopped at the end of the marker's
    line because ``.`` does not match newlines, so the rest of a multi-line
    edit note was left in place.

    Args:
        text: String to truncate.
        x: Literal marker (not a regex) to search for, e.g. ``'edit:'``.

    Returns:
        The part of ``text`` before the marker, or ``text`` unchanged when
        the marker does not occur.
    """
    marker = re.search(re.escape(x), text, flags=re.IGNORECASE)
    if marker is None:
        return text
    return text[:marker.start()]


def extract_data_from_comment(comment, prefix=None):
    """Flatten a comment object into a plain dict of the fields kept downstream.

    Args:
        comment: Object exposing ``id``, ``body``, ``author``, ``created`` and
            ``score`` (presumably a PRAW comment; verify against caller).
        prefix: Optional string; when given, every key becomes
            ``f"{prefix}_{key}"``.

    Returns:
        Dict with keys ``comment_id``, ``comment_body`` (text from ``edit:``
        onward removed via ``remove_text_after_x``), ``comment_author``,
        ``comment_date_time`` (formatted by ``convert_utc_to_datetime``) and
        ``comment_score``, optionally prefixed.
    """
    fields = {
        'comment_id': comment.id,
        'comment_body': remove_text_after_x(comment.body, 'edit:'),
        'comment_author': comment.author,
        'comment_date_time': convert_utc_to_datetime(comment.created),
        'comment_score': comment.score,
    }
    if prefix is None:
        return fields
    return {f"{prefix}_{name}": value for name, value in fields.items()}


def extract_data_from_post(post, min_comment_words=30):
    """Flatten a submission plus its best-scored and longest comments into one dict.

    Only top-level comments with more than ``min_comment_words`` words are
    considered when picking the two comments.

    Args:
        post: Submission object exposing ``id``, ``title``, ``author``,
            ``selftext``, ``created``, ``url``, ``score``, ``num_comments``
            and a ``comments`` forest (presumably a PRAW submission).
        min_comment_words: A comment must have strictly more words than this
            to be eligible. Defaults to 30, the previously hard-coded value.

    Returns:
        Dict with the post fields, a ``flairs`` string (the ``str()`` of the
        list of bracketed tags found in the title) and the fields of the
        selected comments under ``highest_*`` and ``longest_*`` keys.

    Raises:
        IndexError: If no comment passes the word-count filter (same exception
            type as before, now with an explanatory message).
    """
    post_data = dict(post_id=post.id, title=remove_text_between_brackets(post.title),
                     post_author=post.author,
                     body=remove_text_after_x(post.selftext, 'edit:'), date_time=convert_utc_to_datetime(post.created),
                     url=post.url, score=post.score, num_comments=post.num_comments)
    post_data['flairs'] = str(extract_between_brackets(post.title))

    # Large threads contain "load more comments" placeholders that have neither
    # `body` nor `score`; touching them raised AttributeError, so callers
    # silently discarded the whole post. limit=0 drops the placeholders
    # without issuing extra API requests.
    post.comments.replace_more(limit=0)

    candidates = [(comment, comment.score, num_words(comment.body)) for comment in post.comments]
    candidates = [entry for entry in candidates if entry[2] > min_comment_words]
    if not candidates:
        raise IndexError(
            f"post {post.id} has no comment longer than {min_comment_words} words")

    # max() returns the first maximal element, exactly like taking [0] of a
    # stable descending sort, without sorting the whole list twice.
    highest_comment = max(candidates, key=lambda entry: entry[1])[0]
    longest_comment = max(candidates, key=lambda entry: entry[2])[0]
    post_data.update(extract_data_from_comment(highest_comment, 'highest'))
    post_data.update(extract_data_from_comment(longest_comment, 'longest'))
    return post_data

### Reddit API

In [None]:
# Credentials are taken from the environment so real secrets never have to be
# saved in the notebook; the original placeholder literals remain as fallbacks.
CLIENT_ID = os.environ.get('REDDIT_CLIENT_ID', 'yy')
CLIENT_SECRET = os.environ.get('REDDIT_CLIENT_SECRET', 'xx')
USER_AGENT = os.environ.get('REDDIT_USER_AGENT', 'my_user_agent')
reddit = praw.Reddit(client_id=CLIENT_ID, client_secret=CLIENT_SECRET, user_agent=USER_AGENT)

# Subreddits to scrape.
# NOTE(review): 'Infedelity' looks like a misspelling of 'Infidelity' -- confirm
# the intended subreddit name before changing it.
subrredit_str_lists = ['abusiverelationships',  'ToxicRelationships', 'domesticviolence', 'abusesurvivors', 'emotionalabuse', 'Infedelity', 'survivinginfidelity', 'relationship_advice', 'relationships',
'LifeAfterNarcissism', 'NarcAbuseAndDivorce', 'NarcissisticSpouses']

### get all the data from the previous runs

In [None]:
# Load every CSV produced by earlier scraping runs so that posts which were
# already collected can be skipped by the cells below.
exisiting_data = []
for i in range(1, 3):
    root = f"/data/home/ilanit.sobol/dv/data/outputs/reddit/subreddits/new/iter{i}"
    for file in os.listdir(root):
        if file.endswith(".csv"):
            exisiting_data.append(pd.read_csv(f"{root}/{file}"))
queries_root = "/data/home/ilanit.sobol/dv/data/outputs/reddit/subreddits/queries"

# Layout walked here: <queries_root>/<subreddit>/<sort>/<file>.csv
for subreddit in os.listdir(queries_root):
    subreddit_dir = f"{queries_root}/{subreddit}"
    if not os.path.isdir(subreddit_dir):
        # Stray files (e.g. editor/OS metadata) would make listdir() raise.
        continue
    for sort in os.listdir(subreddit_dir):
        sort_dir = f"{subreddit_dir}/{sort}"
        if not os.path.isdir(sort_dir):
            continue
        for file in os.listdir(sort_dir):
            if not file.endswith(".csv"):
                continue
            csv_path = f"{sort_dir}/{file}"
            try:
                exisiting_data.append(pd.read_csv(csv_path))
            except Exception as e:
                # Still best-effort (e.g. empty or corrupt files), but report
                # what was skipped instead of hiding it.
                print(f"Skipping unreadable CSV {csv_path}: {e!r}")

if exisiting_data:
    exisiting_data = pd.concat(exisiting_data)
    exisiting_data = exisiting_data.drop_duplicates(subset=["post_id"])
else:
    # pd.concat([]) raises; an empty frame with the expected key column keeps
    # the later `post_id` lookups working on a first run.
    exisiting_data = pd.DataFrame(columns=["post_id"])

#### Scrape the top posts from the subreddits

In [None]:
# Scrape the all-time top posts of every subreddit, skipping posts that were
# already collected, and write one CSV per subreddit.
all_data = []
# A set makes the per-post "already seen?" check O(1) instead of scanning the
# whole post_id array for every post.
seen_post_ids = set(exisiting_data["post_id"])
top_output_dir = '/data/home/ilanit.sobol/dv/data/outputs/reddit/top'
os.makedirs(top_output_dir, exist_ok=True)  # fail early, not after a long scrape

for subrredit_str in subrredit_str_lists:
    current_data = []
    skip_reasons = {}  # exception type name -> number of posts skipped
    subreddit_obj = reddit.subreddit(subrredit_str)
    posts = list(subreddit_obj.top(time_filter="all", limit=1000))
    for post in tqdm(posts):
        if post.id in seen_post_ids:
            continue
        try:
            post_data = extract_data_from_post(post)
        except Exception as e:
            # Best-effort: one bad post must not abort the run, but keep a
            # tally so systematic failures are visible.
            reason = type(e).__name__
            skip_reasons[reason] = skip_reasons.get(reason, 0) + 1
            continue
        post_data.update(dict(subreddit=subrredit_str))
        current_data.append(post_data)
    if skip_reasons:
        print(f"{subrredit_str}: skipped posts by error type: {skip_reasons}")
    df = pd.DataFrame(current_data)
    df.to_csv(f'{top_output_dir}/{subrredit_str}.csv', index=False)
    all_data.extend(current_data)

## Search each subreddit by query and sort order

In [None]:
# Search terms submitted to every subreddit.
queries = [
    "marriage", "relationship", "love", "hate", "like", "partner", "ex",
    "divorce", "work", "done", "everything", "right", "wrong", "simple",
    "know", "boyfriend", "flowers", "date", "clothes", "story", "together",
    "breakup", "social", "mental", "health", "hard", "we", "me", "living",
    "M39", "M20", "M21", "M42", "M43", "M54",
]

# Sort orders requested for each search.
sorts = ["hot", "new", "relevance"]

# Subreddits to search (same list as the one defined with the API client).
subrredit_str_lists = [
    'abusiverelationships',
    'ToxicRelationships',
    'domesticviolence',
    'abusesurvivors',
    'emotionalabuse',
    'Infedelity',
    'survivinginfidelity',
    'relationship_advice',
    'relationships',
    'LifeAfterNarcissism',
    'NarcAbuseAndDivorce',
    'NarcissisticSpouses',
]

In [None]:
# Run every (sort, query) search against every subreddit and write one CSV per
# combination. Combinations whose CSV already exists are skipped, so the cell
# can be resumed after an interruption.
# NOTE(review): this resets `all_data`, so rows gathered by the top-posts cell
# are not part of the combined data below -- confirm that is intended.
all_data = []
# A set makes the per-post "already seen?" check O(1) instead of scanning the
# whole post_id array for every post.
seen_post_ids = set(exisiting_data["post_id"])

for sort in sorts:
    for query in queries:
        # The folder depends only on (query, sort); create it once per pair.
        folder = f'/data/home/ilanit.sobol/dv/data/outputs/reddit/queries/{query}/{sort}'
        os.makedirs(folder, exist_ok=True)
        for subrredit_str in subrredit_str_lists:
            output_path = f'{folder}/{subrredit_str}.csv'
            if os.path.exists(output_path):
                continue
            current_data = []
            skip_reasons = {}  # exception type name -> number of posts skipped
            subreddit_obj = reddit.subreddit(subrredit_str)
            posts = list(subreddit_obj.search(query, time_filter="all", limit=500, sort=sort))
            for post in tqdm(posts, desc=f"{query}_{sort}_{subrredit_str}"):
                if post.id in seen_post_ids:
                    continue
                try:
                    post_data = extract_data_from_post(post)
                except Exception as e:
                    # Best-effort: keep going, but tally failures so that
                    # systematic problems are visible.
                    reason = type(e).__name__
                    skip_reasons[reason] = skip_reasons.get(reason, 0) + 1
                    continue
                post_data.update(dict(subreddit=subrredit_str))
                current_data.append(post_data)
            if skip_reasons:
                print(f"{query}_{sort}_{subrredit_str}: skipped posts by error type: {skip_reasons}")
            df = pd.DataFrame(current_data)
            if df.shape[0] == 0:
                # Nothing new: no CSV is written, so this combination is
                # retried on the next run.
                continue
            df.to_csv(output_path, index=False)
            all_data.extend(current_data)

### combine all the data

In [None]:
## filter out bad posts
# Combine everything scraped in this session, keep one row per post, and drop
# posts that earlier runs already collected.
data = pd.DataFrame(all_data)
data = data.drop_duplicates(subset=["post_id"])
data = data[~data.post_id.isin(exisiting_data.post_id)]

# Build `df` from the combined `data`. The previous code filtered `df`, which
# at this point was only the last per-subreddit frame left over from the
# scraping loop, so the combined data was never used.
has_live_comments = (data['longest_comment_body'] != '[deleted]') & (data['highest_comment_body'] != '[deleted]')
df = data[has_live_comments]
# .copy() gives later cells an independent frame to add columns to.
df = df[df["body"].notna()].copy()

In [None]:
## get metadata
# Work on an owned copy so the column assignments below do not write into a
# filtered view (SettingWithCopyWarning).
df = df.copy()

# Strip edit notes from the post body and from both selected comments.
for text_column in ('body', 'longest_comment_body', 'highest_comment_body'):
    df[text_column] = [remove_text_after_x(text, 'edit:') for text in df[text_column].values]

df['post_body'] = df['title'] + '\n' + df['body']

# A body is unusable when it is missing (not a string) or carries the
# '[deleted]' placeholder. The check is inlined because the helper
# `is_deleted_or_null` is only defined in a later cell, which made this cell
# fail on Restart & Run All.
df["deleted"] = df["body"].apply(lambda x: not isinstance(x, str) or '[deleted]' in x)
df = df[df["deleted"] == False].copy()
df["post_num_words"] = df["post_body"].apply(num_words)

In [None]:
def is_deleted_or_null(text):
    """Tell whether a text cell is missing or marked as deleted.

    Args:
        text: Cell value; typically a string, ``None`` or a float NaN as
            produced by pandas for missing values.

    Returns:
        ``True`` for ``None``, for any float (including float subclasses such
        as ``numpy.float64``), and for strings containing ``'[deleted]'``;
        ``False`` otherwise.
    """
    # isinstance() also covers subclasses (e.g. numpy.float64 NaN), which the
    # exact `type(x) == float` comparison missed.
    if text is None or isinstance(text, float):
        return True
    if isinstance(text, str):
        return '[deleted]' in text
    return False

In [None]:
# Keep only non-deleted posts whose title+body length is strictly between
# 50 and 1200 words.
not_deleted = df["deleted"] == False
df_filtered = df[not_deleted]
word_counts = df_filtered['post_num_words']
df_filtered = df_filtered[(word_counts > 50) & (word_counts < 1200)]

In [None]:
# Persist the filtered posts as a single CSV (without the index column).
output_csv = r'/data/home/ilanit.sobol/dv/data/outputs/reddit/reddit_posts.csv'
df_filtered.to_csv(output_csv, index=False)