Commit

pipeline to scrape submissions
Jithin James committed Mar 4, 2023
1 parent 683f6ca commit 616f62e
Showing 3 changed files with 107 additions and 12 deletions.
101 changes: 101 additions & 0 deletions data/datasets/nsfw_csam_reddit/scrape_nsfw_reddit.py
@@ -0,0 +1,101 @@
import logging
import os

import pandas as pd
import praw
import prawcore
import utils
from tqdm import tqdm

logger = logging.getLogger(__name__)

# setup praw
CLIENT_ID = "ON-Jk8euNrhTBNKaQYyP9Q"
CLIENT_SECRET = "Lrgf4vzTW3K4VMBpNJF9O49u-EJ8Sg"
USER_AGENT = "web:in.jjmachan.scrapper:v0.1.0 (by u/jjmachan)"

# the client that communicates with reddit
# (PRAW can also pick up these credentials from a praw.ini file or
# praw_-prefixed environment variables instead of hardcoded constants)
reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT,
)

subs = [
    "Sexpolls",
    "sexpositions",
    "Sexconfessional",
    "penissize",
    "masturbation",
    "AskRedditNSFW",
    "sexstories",
    "SexFantasies",
    "sexconfession",
    "askwoman_aboutsex",
    "sextips",
    "sexualhealth",
    "SexPositive",
    "DirtyConfession",
    "Puberty",
    "NSFWIAMA",
    "sexover30",
    "SexToys",
    "sexquestions",
    "deepvaginaproblems",
    "kegels",
    "sexeducation",
    "ColoredLang",
    "masterbationstories",
    "RedditAfterDark",
    "Threesome_advice",
]


def scrape_subreddit(subreddit: str) -> pd.DataFrame | None:
    # NUM_COMMENTS = 5
    items = []

    sub = reddit.subreddit(subreddit)
    try:
        sub.id  # accessing a lazy attribute forces a fetch, validating the subreddit
    except prawcore.exceptions.ResponseException as e:
        logger.error(f"Error getting {subreddit}: {e}")
        return None
    # a post can appear in more than one listing; duplicates are dropped below
    ordering = (sub.hot(limit=1000), sub.top(limit=1000), sub.rising(limit=1000))
    for order in ordering:
        for post in tqdm(order, leave=False):
            item = {
                "title": post.title,
                "subreddit": sub.display_name,
                "post_id": post.id,
                "score": post.score,
                "link_flair_text": post.link_flair_text,
                "is_self": post.is_self,
                "over_18": post.over_18,
                "upvote_ratio": post.upvote_ratio,
                "is_question": utils.is_question(post.title),
            }
            # for i, c in enumerate(post.comments[:NUM_COMMENTS]):
            #     item[f"C{i+1}"] = c.body
            items.append(item)

    if not items:
        return None
    df = pd.DataFrame(items)
    return df.drop_duplicates(subset=["post_id"])


def add_comments(df_filepath: os.PathLike) -> pd.DataFrame:
    ...


if __name__ == "__main__":
    for sub in tqdm(subs):
        try:
            df = scrape_subreddit(sub)
            if df is not None:
                file_name = f"dataframes/{sub}.csv"
                df.to_csv(file_name, index=False)
                print("subreddit saved to: ", file_name)
        except Exception as e:
            logger.error(f"Error scraping {sub}: {e}")
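
The `add_comments` stub above is left unimplemented in this commit. A minimal sketch of what it could look like, assuming it reads one of the CSVs written by the `__main__` block and revives the commented-out `NUM_COMMENTS` idea; the `num_comments` parameter and `C{n}` column names are illustrative, not part of the commit:

# hypothetical sketch -- the commit itself leaves add_comments as a bare stub
def add_comments(df_filepath: os.PathLike, num_comments: int = 5) -> pd.DataFrame:
    df = pd.read_csv(df_filepath)
    for idx, post_id in zip(df.index, df["post_id"]):
        submission = reddit.submission(id=post_id)
        submission.comments.replace_more(limit=0)  # strip MoreComments placeholders
        for i, comment in enumerate(submission.comments[:num_comments]):
            df.loc[idx, f"C{i + 1}"] = comment.body  # mirrors the commented-out column scheme
    return df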
2 changes: 1 addition & 1 deletion data/datasets/nsfw_csam_reddit/utils/__init__.py
@@ -1,3 +1,3 @@
-from is_question import is_question
+from .is_question import is_question

__all__ = ["is_question"]
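
(The one-character change matters: the bare `from is_question import is_question` only resolves when `utils/` itself is on `sys.path`, whereas the relative import lets `scrape_nsfw_reddit.py` simply `import utils` and call `utils.is_question` from the dataset directory.)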
16 changes: 5 additions & 11 deletions data/datasets/nsfw_csam_reddit/utils/is_question.py
@@ -3,22 +3,16 @@

class IsQuestion:
    def __init__(self):
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            "shahrukhx01/question-vs-statement-classifier"
-        )
-        self.model = AutoModelForSequenceClassification.from_pretrained(
-            "shahrukhx01/question-vs-statement-classifier"
-        )
-        self.classifier = pipeline(
-            "sentiment-analysis", model=self.model, tokenizer=self.tokenizer
-        )
+        self.tokenizer = AutoTokenizer.from_pretrained("shahrukhx01/question-vs-statement-classifier")
+        self.model = AutoModelForSequenceClassification.from_pretrained("shahrukhx01/question-vs-statement-classifier")
+        self.classifier = pipeline("sentiment-analysis", model=self.model, tokenizer=self.tokenizer)
        self.labels = {
            "LABEL_0": False,
            "LABEL_1": True,
        }

-    def __call__(self, text):
-        return self.classifier(text)
+    def __call__(self, text: str) -> bool:
+        return self.labels[self.classifier(text)[0]["label"]]


is_question = IsQuestion()
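
With the refactored `__call__`, the classifier now returns a plain boolean rather than the raw pipeline output. A quick usage sketch (the model downloads on first import, since `is_question` is instantiated at module level; the expected outputs assume `LABEL_1` marks questions, as the `labels` mapping implies):

from utils import is_question

print(is_question("what toys do you recommend?"))    # True  (LABEL_1)
print(is_question("just wanted to share my story"))  # False (LABEL_0)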
