In [1]:
import praw
import pandas as pd
from datetime import datetime
import time
import logging
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
logging.basicConfig(filename='data_extraction.log', level=logging.INFO)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/kyle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kyle/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kyle/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
reddit = praw.Reddit(
    client_id='l13EfEFWXIo8ZHx6kndK8A',
    client_secret='wnQ35vd_ggELT219tsG6AKe0dUSAdg',
    user_agent='MacOS:redditScraper:v1.0 (by /u/kyle_stein)'
)

In [4]:
subreddit_name = 'Etoro'

In [5]:
def preprocess_text(text):
    # Remove HTML tags and special characters
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Perform lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join the tokens back into a string
    processed_text = ' '.join(tokens)
    
    return processed_text

In [6]:
subreddit = reddit.subreddit(subreddit_name)

posts = []
comments_data = []

try:
    # Retrieve posts with pagination
    last_post_id = None
    while True:
        # Adjust the limit as needed
        posts_batch = list(subreddit.new(limit=100, params={'after': last_post_id}))
        if not posts_batch:
            break

        for post in posts_batch:
            post_data = {
                'title': post.title,
                'score': post.score,
                'id': post.id,
                'url': post.url,
                'num_comments': post.num_comments,
                'created_utc': datetime.fromtimestamp(post.created_utc),
                'body': post.selftext,
                'username': post.author.name if post.author else '[deleted]',
                'upvote_ratio': post.upvote_ratio,
                'is_original_content': post.is_original_content,
                'flair': post.link_flair_text,
                'is_video': post.is_video,
                'domain': post.domain,
                'is_self': post.is_self,
                'is_stickied': post.stickied,
                'over_18': post.over_18,
                'total_awards_received': post.total_awards_received,
                'gilded': post.gilded,
                'edited': post.edited,
                'comment_sort': post.comment_sort,
                'permalink': f'https://www.reddit.com{post.permalink}',
                'is_media': bool(post.media),
                'media_url': post.url if post.media else None,
                'is_crosspost': post.num_crossposts > 0,
                # 'original_subreddit': post.crosspost_parent.subreddit.display_name if post.num_crossposts > 0 else None,
                'stickied': post.stickied
            }
            posts.append(post_data)

            post.comments.replace_more(limit=None)  # Retrieve all comments, including nested ones
            for comment in post.comments.list():
                comment_data = {
                    'comment_id': comment.id,
                    'post_id': post.id,
                    'parent_id': comment.parent_id,
                    'comment_body': comment.body,
                    'comment_author': comment.author.name if comment.author else '[deleted]',
                    'comment_score': comment.score,
                    'comment_created_utc': datetime.fromtimestamp(comment.created_utc),
                    'comment_gilded': comment.gilded,
                    'comment_edited': comment.edited,
                    'comment_is_submitter': comment.is_submitter,
                    'comment_stickied': comment.stickied
                }
                comments_data.append(comment_data)

        last_post_id = posts_batch[-1].fullname
        time.sleep(1)  # Add a delay to avoid hitting rate limits

    df_posts = pd.DataFrame(posts)
    df_comments = pd.DataFrame(comments_data)

    # Data validation and cleaning
    df_posts['created_utc'] = pd.to_datetime(df_posts['created_utc'])
    df_comments['comment_created_utc'] = pd.to_datetime(df_comments['comment_created_utc'])

    # Handle missing values
    df_posts.fillna('', inplace=True)
    df_comments.fillna('', inplace=True)

    # Perform text preprocessing and feature engineering
    df_posts['body'] = df_posts['body'].apply(preprocess_text)
    df_comments['comment_body'] = df_comments['comment_body'].apply(preprocess_text)

    # Save data to files
    # df_posts.to_csv('posts_data.csv', index=False)
    # df_comments.to_csv('comments_data.csv', index=False)

    logging.info("Data extraction completed successfully.")


except Exception as e:
    logging.error(f"An error occurred: {str(e)}")


In [7]:
df_posts

Unnamed: 0,title,score,id,url,num_comments,created_utc,body,username,upvote_ratio,is_original_content,...,over_18,total_awards_received,gilded,edited,comment_sort,permalink,is_media,media_url,is_crosspost,stickied
0,Crypto trading,1,1br0lym,https://www.reddit.com/r/Etoro/comments/1br0ly...,0,2024-03-29 16:56:04,hi late buy crypto benefit halving,FigNo2310,1.00,False,...,False,0,0,False,confidence,https://www.reddit.com/r/Etoro/comments/1br0ly...,False,,False,False
1,Which of these are correct?,0,1bqrses,https://i.redd.it/0wpqutkjiarc1.jpeg,9,2024-03-29 10:17:15,correct ive trading year suddenly ive restrict...,hannahbanananananana,0.18,False,...,False,0,0,False,confidence,https://www.reddit.com/r/Etoro/comments/1bqrse...,False,,False,False
2,Cost of holding a position on crypto,3,1bqo14y,https://i.redd.it/kilzura6o9rc1.jpeg,5,2024-03-29 07:27:02,according there 1 charge buying another 1 sell...,Amazin8Trade,0.71,False,...,False,0,0,False,confidence,https://www.reddit.com/r/Etoro/comments/1bqo14...,False,,False,False
3,Etoro wallet question,1,1bqnqgk,https://www.reddit.com/r/Etoro/comments/1bqnqg...,0,2024-03-29 07:11:32,option transfer bitcoin etoro wallet bitcoin w...,abhi3186,1.00,False,...,False,0,0,False,confidence,https://www.reddit.com/r/Etoro/comments/1bqnqg...,False,,False,False
4,My eToro Copy Trade Stats: 2024 03 29,0,1bqmcb3,https://www.reddit.com/r/Etoro/comments/1bqmcb...,0,2024-03-29 05:54:02,httpsyoutubeuawez0qorrihttpsyoutubeuawez0qorri,qeras89,0.50,False,...,False,0,0,False,confidence,https://www.reddit.com/r/Etoro/comments/1bqmcb...,False,,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
933,Expectations of Etoro indices and commodilities,2,123vvl7,https://www.reddit.com/r/Etoro/comments/123vvl...,1,2023-03-27 13:32:35,hello interested trading index commodilities w...,Vast-Tourist-9700,1.00,False,...,False,0,0,False,confidence,https://www.reddit.com/r/Etoro/comments/123vvl...,False,,False,False
934,why etoro doesnt close my silver sell position...,1,123hmmm,https://www.reddit.com/r/Etoro/comments/123hmm...,3,2023-03-27 04:33:13,x200b httpspreviewreddituyddz1c149qa1pngwidth9...,Born-Persimmon7796,0.67,False,...,False,0,0,False,confidence,https://www.reddit.com/r/Etoro/comments/123hmm...,False,,False,False
935,Crypto wallet in money app - how to cash it in?,5,120yklw,https://www.reddit.com/r/Etoro/comments/120ykl...,1,2023-03-24 16:02:30,hello way money app convert crypto back cash g...,Individual_Wallaby25,1.00,False,...,False,0,0,False,confidence,https://www.reddit.com/r/Etoro/comments/120ykl...,False,,False,False
936,eToro Profit Loss Query,1,120ei85,https://www.reddit.com/r/Etoro/comments/120ei8...,3,2023-03-24 04:00:04,hi recently began using etoro trading u stock ...,LuciaVia,1.00,False,...,False,0,0,False,confidence,https://www.reddit.com/r/Etoro/comments/120ei8...,False,,False,False


In [8]:
# df_posts
df_comments

Unnamed: 0,comment_id,post_id,parent_id,comment_body,comment_author,comment_score,comment_created_utc,comment_gilded,comment_edited,comment_is_submitter,comment_stickied
0,kx5pfou,1bqrses,t3_1bqrses,maybe learn trading,redditerhuman897,9,2024-03-29 15:20:22,0,False,False,False
1,kx4m2np,1bqrses,t3_1bqrses,c,RayTrader03,4,2024-03-29 11:16:46,0,False,False,False
2,kx530wc,1bqrses,t3_1bqrses,c maybe phase bit weird cuz open position stil...,WinterSapphirez,4,2024-03-29 12:53:46,0,False,False,False
3,kx55d2q,1bqrses,t3_1bqrses,c,Fck-tm-without-crm,2,2024-03-29 13:07:18,0,False,False,False
4,kx5u8cr,1bqrses,t3_1bqrses,c something weary though co unless youve paid ...,ROBNOB9X,2,2024-03-29 15:53:43,0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
5575,jgqu7n7,120yklw,t3_120yklw,im wondering thing,Kind_Measurement_753,1,2023-04-18 08:55:38,0,False,False,False
5576,jdlt1vr,120ei85,t3_120ei85,maybe youre looking daily pl reset day,Avocado357,1,2023-03-25 05:19:15,0,False,False,False
5577,jdujn7x,120ei85,t1_jdlt1vr,thanks reply looking daily pl still would 2 ce...,LuciaVia,1,2023-03-27 03:48:41,0,False,True,False
5578,jdukpmd,120ei85,t1_jdujn7x,guess something rounding position 0 cent round...,Avocado357,1,2023-03-27 04:04:35,0,False,False,False


In [10]:
df_posts.to_csv('posts_data.csv', index=False)
df_comments.to_csv('comments_data.csv', index=False)