In [29]:
import math
import json
import requests
import itertools
import numpy as np
import time
from datetime import datetime, timedelta

In [30]:
def get_pushshift_data(data_type, **kwargs):
    """
    Gets data from the pushshift api.
 
    data_type can be 'comment' or 'submission'
    The rest of the args are interpreted as payload.
 
    Read more: https://github.com/pushshift/api
    """
 
    base_url = f"https://api.pushshift.io/reddit/search/{data_type}/"
    payload = kwargs
    request = requests.get(base_url, params=payload)
    return request.json()

In [41]:
data_type="comment"     # give me comments, use "submission" to publish something
duration="1d"           # Select the timeframe. Epoch value or Integer + "s,m,h,d" (i.e. "second", "minute", "hour", "day")
size=25                 # maximum 1000 comments
sort_type="created_utc" # Sort by score (Accepted: "score", "num_comments", "created_utc")
sort="desc"             # sort descending
subreddit="webdev22"
after=None
before=None

In [42]:
data = get_pushshift_data(data_type=data_type,          
                          size=size,
                          subreddit=subreddit,
                          sort_type=sort_type,
                          sort=sort,
                          after=after,
                          before=before
                          )

In [43]:
len(data['data'])

0

In [26]:
import pandas as pd

df = pd.DataFrame.from_records(data['data'])

In [15]:
df_comments = df[['id', 'author', 'created_utc', 'body']]

In [28]:
df_comments.sample(5)

Unnamed: 0,id,author,created_utc,body
21,iwvd33m,bkuri,1668791339,In some ways it's the best time to get started...
9,iwvekjg,AutoModerator,1668791933,"Hi, Ultra-Coder,\n\nYour post has been automat..."
8,iwvem13,Iankill,1668791949,Are you really so dumb that you can't see the ...
3,iwvff0m,59t_5in3es1tvanj,1668792268,I remember using images for styling. Because o...
23,iwvct5i,Floepert,1668791227,Dreamweaver websites…. I forgot about those… m...


In [16]:
df_time_cleaned = df_comments.copy()
df_time_cleaned['created_utc'] = pd.to_datetime(df_time_cleaned['created_utc'], unit = 's', utc=True).dt.tz_convert('Europe/Berlin')
df_time_cleaned = df_time_cleaned.sort_values(by='created_utc', ascending=False)
df_final = df_time_cleaned

In [17]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sid_obj = SentimentIntensityAnalyzer()

def sentiment_scores(sid_obj, *sentence):
    return [sid_obj.polarity_scores(sente)['compound'] for sente in sentence]

In [18]:
%%time
df_final['compound'] = sentiment_scores(sid_obj, *df_final['body'].values)
df_final['class'] = np.where(df_final['compound'] > 0, 'Postive', 'Negative')

CPU times: user 12.5 ms, sys: 1.07 ms, total: 13.5 ms
Wall time: 14.5 ms


In [19]:
out = df_final[['id', 'body', 'compound', 'class']]

In [21]:
df_final

Unnamed: 0,id,author,created_utc,body,compound,class
0,iwvfkui,iWantBots,2022-11-18 18:25:34+01:00,Pointing out how you post in sets of 3 exactly...,0.7845,Postive
1,iwvfk3t,SamFoucart,2022-11-18 18:25:26+01:00,If you want to know the general ideas behind a...,0.9729,Postive
2,iwvfihm,braalewi,2022-11-18 18:25:07+01:00,Building buttons as images in photoshop with a...,0.0,Negative
3,iwvff0m,59t_5in3es1tvanj,2022-11-18 18:24:28+01:00,I remember using images for styling. Because o...,0.3182,Postive
4,iwvfe1z,magenta_placenta,2022-11-18 18:24:17+01:00,"I'm pointing out what a clown you are, pal.",0.0,Negative
5,iwvfar6,braalewi,2022-11-18 18:23:42+01:00,Oh God! I was a Macromedia fanboy. Then I lear...,0.6476,Postive
6,iwvf253,iWantBots,2022-11-18 18:22:08+01:00,Lol the guy who got called out for spamming Re...,0.296,Postive
7,iwvev1k,magenta_placenta,2022-11-18 18:20:50+01:00,Links aren't helpful posts? Show me a post I'...,0.7172,Postive
8,iwvem13,Iankill,2022-11-18 18:19:09+01:00,Are you really so dumb that you can't see the ...,-0.882,Negative
9,iwvekjg,AutoModerator,2022-11-18 18:18:53+01:00,"Hi, Ultra-Coder,\n\nYour post has been automat...",0.6705,Postive
