In [None]:
##AUTHOR: Katherine Norris
##PROJECT: MASTER'S THESIS 2024 "MODERATION IN ONLINE INCEL FORUMS: A COMPARISON OF WARNING BEHAVIORS"
##COPYRIGHT: Copyright Katherine E. Norris 2024 ALL RIGHTS RESERVED

In [1]:
# import packages and define a useful function for stripping HTML code from text.

from bs4 import BeautifulSoup
import requests
import itertools
import pandas as pd
import re
from io import StringIO
from html.parser import HTMLParser
from more_itertools import strip

class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text nodes.

    Text passed to ``handle_data`` is accumulated in an in-memory buffer
    (``self.text``) and can be read back with ``get_data``.
    """

    def __init__(self):
        # The base constructor stores ``convert_charrefs`` and performs the
        # initial ``reset()``, so no explicit second reset is required.
        super().__init__(convert_charrefs=True)
        # Legacy attribute kept from the original recipe.
        self.strict = False
        self.text = StringIO()

    def handle_data(self, d):
        """Append one chunk of text content to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return everything collected so far as a single string."""
        collected = self.text.getvalue()
        return collected

def strip_tags(html):
    """Return the text content of ``html`` with all markup removed.

    Parameters
    ----------
    html : str
        Markup (or plain text) to strip.

    Returns
    -------
    str
        The concatenated text handed to ``MLStripper.handle_data``.
    """
    s = MLStripper()
    s.feed(html)
    # feed() can hold back trailing text while it waits for more input
    # (for example text ending in an unterminated "&" such as "AT&T").
    # close() forces that buffered remainder through handle_data so it is
    # not silently dropped from the result.
    s.close()
    return s.get_data()

In [2]:
# SCRAPING ALL THREAD LINKS ON A CONTENTS PAGE
# List of content pages containing links to individual threads.
urls = ["https://incels.is/forums/inceldom-discussion.2/?order=reply_count&direction=desc&thread_type=discussion"]

# Fetch every content page. A timeout keeps a stalled connection from hanging
# the notebook, and raise_for_status() makes an HTTP error fail loudly here
# instead of being parsed as if it were a thread listing.
reqs = [requests.get(url, timeout=30) for url in urls]
for req in reqs:
    req.raise_for_status()
soups = [BeautifulSoup(req.text, "html.parser") for req in reqs]

# create empty lists to populate with scraped data
label_list = []
title_list = []
url_list  = []
dop_list = []
dopsread_list = []

for soup in soups:

    # first, limit searches to the non-sticky threads by finding the primary container group
    non_sticky = soup.find("div", {"class": "structItemContainer-group js-threadList"})
    if non_sticky is None:
        raise RuntimeError(
            "Thread list container not found on a contents page; "
            "the page layout may have changed or the request was blocked."
        )

    # title cells of every thread on this page (limited to the non-sticky group)
    title_cells = non_sticky.find_all("div", {"class": "structItem-title"})

    # extract the post date from each thread "item": the numeric `data-time`
    # attribute (compared against timestamp bounds downstream) and the
    # human-readable `data-date-string`
    items = non_sticky.find_all("div", {"class": "structItem--thread"})
    page_dops = []
    page_dopsread = []
    for item in items:
        stamp = item.find_all("time", {"class": "u-dt"})[0]
        page_dops.append(int(stamp["data-time"]))
        page_dopsread.append(stamp["data-date-string"])

    # extract the label for the thread (e.g., "Blackpill"); '' when a thread has none
    label_links = [cell.find_all("a", {"class": "labelLink"}) for cell in title_cells]
    page_labels = [links[0].get_text() if len(links) > 0 else '' for links in label_links]

    # extract the element containing the title and url of each thread
    thread_links = [cell.find_all("a", {"data-xf-init": "preview-tooltip"}) for cell in title_cells]

    # extract titles
    page_titles = [links[0].get_text() if len(links) > 0 else '' for links in thread_links]

    # extract urls (hrefs are site-relative, so prefix the host).
    # A separate name is used so the `urls` list of content pages above is not overwritten.
    page_hrefs = [links[0].get('href') if len(links) > 0 else '' for links in thread_links]
    page_urls = ["https://incels.is" + href for href in page_hrefs]

    # append this page's labels, titles, urls and dates to the respective lists
    label_list.extend(page_labels)
    title_list.extend(page_titles)
    url_list.extend(page_urls)
    dop_list.extend(page_dops)
    dopsread_list.extend(page_dopsread)
    
# construct data frame containing all of the above information
# (zip truncates to the shortest list; explicit columns keep the frame
# well-formed even if nothing was scraped)
df_threads = pd.DataFrame(
    [
        {"label": label, "title": title, "url": url, "dop": dop, "Date": dopsread}
        for label, title, url, dop, dopsread in zip(label_list, title_list, url_list, dop_list, dopsread_list)
    ],
    columns=["label", "title", "url", "dop", "Date"],
)

# Keep only threads whose `dop` timestamp falls inside the study window.
# As epoch seconds the bounds are 2020-01-01 06:00:00 UTC and
# 2023-09-01 04:59:59 UTC (presumably 1 Jan 2020 - 31 Aug 2023 in US Central
# time -- confirm against the thesis sampling frame).
DOP_WINDOW_START = 1577858400
DOP_WINDOW_END = 1693544399

# `inclusive=True` is deprecated in Series.between and rejected by newer
# pandas; "both" is the equivalent explicit spelling.
in_window = df_threads["dop"].between(DOP_WINDOW_START, DOP_WINDOW_END, inclusive="both")

# reset_index() (without drop) keeps the original row position in an "index" column
df_threads = df_threads[in_window].reset_index()

#saving to a csv
df_threads.to_csv("Top Posts UnMod.csv", index=True)

df_threads[0:10]

  df_threads.query("dop.between(1577858400,1693544399,inclusive=True)", inplace=True)


Unnamed: 0,index,label,title,url,dop,Date
0,1,Discussion,The collapse of the United States and what it ...,https://incels.is/threads/the-collapse-of-the-...,1643848632,"Feb 2, 2022"
1,2,Serious,The collapse of the United States and what it ...,https://incels.is/threads/the-collapse-of-the-...,1667851351,"Nov 7, 2022"
2,3,,I try to ejaculate every single day of 2023,https://incels.is/threads/i-try-to-ejaculate-e...,1672637386,"Jan 2, 2023"
3,4,,TikTok Blackpill Megathread,https://incels.is/threads/tiktok-blackpill-meg...,1675999567,"Feb 9, 2023"
4,11,,Roping soon ask me anything,https://incels.is/threads/roping-soon-ask-me-a...,1669845475,"Nov 30, 2022"
5,12,Serious,Pedophiles and Pedophile Sympathizers are a CA...,https://incels.is/threads/pedophiles-and-pedop...,1587481966,"Apr 21, 2020"
6,13,Discussion,What do you struggle with? How do you cope?,https://incels.is/threads/what-do-you-struggle...,1671008308,"Dec 14, 2022"
7,14,LifeFuel,The person who hacked the site has been identi...,https://incels.is/threads/the-person-who-hacke...,1642911861,"Jan 22, 2022"
8,15,Blackpill,Be VERY suspisuous of members that defend brag...,https://incels.is/threads/be-very-suspisuous-o...,1585933096,"Apr 3, 2020"
9,17,It's Over,Most white girls have fucked a black dude at s...,https://incels.is/threads/most-white-girls-hav...,1665922860,"Oct 16, 2022"


In [3]:
# Column-oriented store for every scraped post; each key becomes a DataFrame column.
posts_dict ={"parent id":[], "post id":[], "avatar": [], "user": [], "jobtitle": [],"extras": [],"datetime": [],"post": [], "label":[], "thread title":[], "thread url":[], "thread date":[]} 

N_THREADS = 10         # how many rows of df_threads to scrape
REQUEST_TIMEOUT = 30   # seconds before a stalled request is abandoned


def _fetch_soup(page_url):
    """Download ``page_url`` and return it parsed; raises on HTTP errors."""
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def _last_page_number(thread_soup):
    """Return the number of pages in a thread (1 when there is no page navigation).

    The count is taken as the last integer in the text of the current-page
    navigation element, so it works for any number of digits.
    """
    nav = thread_soup.find("a", {"class": "pageNavSimple-el pageNavSimple-el--current"})
    if nav is None:
        return 1
    # drop thousands separators before looking for digits
    numbers = re.findall(r"\d+", nav.get_text().replace(",", ""))
    return int(numbers[-1]) if numbers else 1


def _first_text(post, tag_name, css_class):
    """Return the text of the first matching element inside ``post``, or ''."""
    node = post.find(tag_name, {"class": css_class})
    return node.get_text() if node is not None else ''


def _parse_post(post):
    """Extract the per-post fields from one ``message-inner`` element.

    Returns a dict keyed by the matching ``posts_dict`` column names. Missing
    elements yield '' (or the string "None" for the parent id), so every post
    contributes exactly one value to every column.
    """
    # avatar image source
    wrapper = post.find("div", {"class": "message-avatar-wrapper"})
    img = wrapper.find("img") if wrapper is not None else None
    avatar = img.get("src", '') if img is not None else ''

    # message date and time
    stamp = post.find("time", {"class": "u-dt"})
    posted_at = stamp.get("datetime", '') if stamp is not None else ''

    # post id
    holder = post.find("div", {"class": "message-userContent lbContainer js-lbContainer"})
    post_id = holder.get("data-lb-id", '') if holder is not None else ''

    # message content; the first quoted block identifies the parent post and
    # is removed so only the author's own text remains
    parent_id = "None"
    text = ''
    body = post.find("div", {"class": "bbWrapper"})
    if body is not None:
        quote = body.find("blockquote")
        if quote is not None:
            parent_id = quote.get("data-source", "None")
            quote.decompose()
        text = body.get_text().strip()

    return {
        "parent id": parent_id,
        "post id": post_id,
        "avatar": avatar,
        "user": _first_text(post, "h4", "message-name"),
        "jobtitle": _first_text(post, "h5", "userTitle message-userTitle"),
        # date joined and number of posts (the page markup calls them "extras")
        "extras": _first_text(post, "div", "message-userExtras").strip(),
        "datetime": posted_at,
        "post": text,
    }


# never index past the end of df_threads if fewer threads survived the filter
for record in range(min(N_THREADS, len(df_threads))):
    url = df_threads.at[record, "url"]
    thread_label = df_threads.at[record, "label"]
    thread_title = df_threads.at[record, "title"]
    thread_date = df_threads.at[record, "Date"]

    # work out the url of every page in the thread
    last = _last_page_number(_fetch_soup(url))
    pages = [url + "page-" + str(p) for p in range(1, last+1)]

    for page in pages:
        soup = _fetch_soup(page)

        # separate out all of the posts into their own elements (the first post will be the OP)
        posts = soup.find_all("div", {"class": "message-inner"})

        for post in posts:
            for column, value in _parse_post(post).items():
                posts_dict[column].append(value)

            # thread-level metadata repeated on every post row
            posts_dict["label"].append(thread_label)
            posts_dict["thread title"].append(thread_title)
            posts_dict["thread url"].append(url)
            posts_dict["thread date"].append(thread_date)

In [4]:
# Assemble all scraped posts into one table, write it to disk, then display it.
df_posts = pd.DataFrame(data=posts_dict)
df_posts.to_csv(path_or_buf="Replies UnMod.csv", index=True)
df_posts


Unnamed: 0,parent id,post id,avatar,user,jobtitle,extras,datetime,post,label,thread title,thread url,thread date
0,,post-8046194,/data/avatars/l/32/32869.jpg?1692898758,MarquisDeSade,Mephistopheles,"Joined\nFeb 11, 2021\n\n\nPosts\n15,865",2022-02-02T19:37:12-0500,Can you feel it my American brocels? The Unite...,Discussion,The collapse of the United States and what it ...,https://incels.is/threads/the-collapse-of-the-...,"Feb 2, 2022"
1,,post-8046219,/data/avatars/l/37/37248.jpg?1668128378,unuser,Incel Defense Forces (IDF),"Joined\nOct 6, 2021\n\n\nPosts\n1,460",2022-02-02T19:45:10-0500,I read only the first two lines and already KE...,Discussion,The collapse of the United States and what it ...,https://incels.is/threads/the-collapse-of-the-...,"Feb 2, 2022"
2,,post-8046228,/data/avatars/l/40/40524.jpg?1659942445,Glerforpus,Overlord,"Joined\nJan 25, 2022\n\n\nPosts\n5,613",2022-02-02T19:47:19-0500,"Based. I read all of it. I honestly wish tho, ...",Discussion,The collapse of the United States and what it ...,https://incels.is/threads/the-collapse-of-the-...,"Feb 2, 2022"
3,post: 8046219,post-8046235,/data/avatars/l/32/32869.jpg?1692898758,MarquisDeSade,Mephistopheles,"Joined\nFeb 11, 2021\n\n\nPosts\n15,865",2022-02-02T19:48:04-0500,Soon.....,Discussion,The collapse of the United States and what it ...,https://incels.is/threads/the-collapse-of-the-...,"Feb 2, 2022"
4,post: 8046228,post-8046241,/data/avatars/l/32/32869.jpg?1692898758,MarquisDeSade,Mephistopheles,"Joined\nFeb 11, 2021\n\n\nPosts\n15,865",2022-02-02T19:49:07-0500,"This country or nation is beyond saving, this ...",Discussion,The collapse of the United States and what it ...,https://incels.is/threads/the-collapse-of-the-...,"Feb 2, 2022"
...,...,...,...,...,...,...,...,...,...,...,...,...
7767,post: 9918784,post-9918790,,Inceldom Victim,Biggest subhuman alive,"Joined\nAug 22, 2022\n\n\nPosts\n4,697",2022-10-24T09:41:28-0400,Every country will get blacked eventually,It's Over,Most white girls have fucked a black dude at s...,https://incels.is/threads/most-white-girls-hav...,"Oct 16, 2022"
7768,post: 9918790,post-9918800,/data/avatars/l/45/45309.jpg?1660939650,Emperor Palpatine,Self-banned,"Joined\nAug 19, 2022\n\n\nPosts\n607",2022-10-24T09:44:25-0400,"Yeah, it's really a shit",It's Over,Most white girls have fucked a black dude at s...,https://incels.is/threads/most-white-girls-hav...,"Oct 16, 2022"
7769,post: 9918784,post-9919406,/data/avatars/l/6/6579.jpg?1702764283,Izayacel,"LUZER RA ""","Joined\nMay 5, 2018\n\n\nPosts\n18,370",2022-10-24T12:58:47-0400,,It's Over,Most white girls have fucked a black dude at s...,https://incels.is/threads/most-white-girls-hav...,"Oct 16, 2022"
7770,post: 9918406,post-9920191,/data/avatars/l/25/25780.jpg?1591771218,Saint Lives MattER,Self-banned,"Joined\nApr 17, 2020\n\n\nPosts\n1,435",2022-10-24T15:58:35-0400,Wrong. There are no whites in:\n\nUSA - all Mu...,It's Over,Most white girls have fucked a black dude at s...,https://incels.is/threads/most-white-girls-hav...,"Oct 16, 2022"
