## Collecting data from subreddit and openai

#### From Praw

In [1]:
import praw

#### Using praw credentilas

In [28]:
reddit = praw.Reddit()

#### Select the specific subreddit

In [3]:
my_subreddit = 'nevertellmetheodds'

In [4]:
try:
    posts = reddit.subreddit(my_subreddit).hot(limit=5000)
except StopIteration:
    print(f"Less than 5000 posts in subreddit {my_subreddit}")
    posts = []

posts_list = []

for post in posts:
    if not post.over_18:
        post.comments.replace_more(limit=0)
        highest_ups = -1
        best_comment = None
        for comment in post.comments.list():
            if isinstance(comment, praw.models.MoreComments):
                # Handle MoreComments objects if needed
                continue  # Skip to the next iteration
            if comment.ups > highest_ups:
                best_comment = comment.body
                highest_ups = comment.ups
        posts_list.append({
            'title': post.title,
            'selftext': post.selftext,
            'subreddit': post.subreddit.display_name,  
            'post_created_utc': post.created_utc,
            'top_comment': best_comment,
            'comment_upvotes': highest_ups
        })

**The above code is slightly modified from the original version of Eric Bayless**

In [5]:
print(f"Number of posts from reddit: {len(posts_list)}")

Number of posts from reddit: 218


## Collecting the ai response from OPENAI

In [6]:
import os
import openai
from dotenv import load_dotenv

# Load the .env file
load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

## Batch inputs

In [7]:
davinci_prompts = [p['title'] for p in posts_list]
len(davinci_prompts)

218

In [8]:
import backoff 
import openai 
@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def completions_with_backoff(**kwargs):
    return openai.Completion.create(**kwargs)

In [9]:
results= [] 
for i in range(0,len(davinci_prompts),20):
    results.append(completions_with_backoff(model = 'text-davinci-003',
    prompt=davinci_prompts[i:i+20],
    temperature=0.6,
    max_tokens= 500))

In [12]:
ai_responses = []

for i in range(len(results)):
    for j in range(20):
        try:
            text = results[i]['choices'][j]['text'].strip()
            ai_responses.append(text)
        except IndexError:
            # Handle the case where the index is out of range
            print(f"IndexError: Skipping index ({i}, {j}) as it is out of range.")

IndexError: Skipping index (10, 18) as it is out of range.
IndexError: Skipping index (10, 19) as it is out of range.


**The above code is slightly modified from the original version of Eric Bayless**

In [15]:
print(f"Number of responses from chatgpt : {len(ai_responses)}")

Number of responses from chatgpt : 218


In [16]:
ai_responses;

# Putting it all together

In [17]:
posts_list;

In [18]:
full_data = [{
    'question': i['title'],
    'human_answer' : i['top_comment'],
    'ai_answer':j} for i,j in zip(posts_list,ai_responses)]
full_data;

In [19]:
import pandas as pd

In [20]:
df = pd.DataFrame(full_data)

In [21]:
df.shape

(218, 3)

In [22]:
df['question'][0]

'[META] List of Banned Posts on this sub'

In [23]:
df['human_answer'][0]

'Can someone be kind enough to type this because the link broke on mobile '

In [24]:
df['ai_answer'][0]

"This subreddit does not allow posts that are:\n\n• Spam\n• Off-topic\n• Advertising/promoting of products or services\n• Posts containing hate speech, racism, sexism, or other forms of discrimination\n• Posts inciting violence or illegal activities\n• Posts that are overly aggressive or abusive\n• Posts that are overly sexual or explicit\n• Posts that are not related to the topic of this subreddit\n• Posts that are not civil or respectful\n• Posts that contain personal information\n• Posts that are considered trolling or baiting\n• Posts that are in violation of Reddit's site-wide rules\n• Posts that are not in English\n• Posts that are intended to generate revenue or profit without prior approval"

In [25]:
df

Unnamed: 0,question,human_answer,ai_answer
0,[META] List of Banned Posts on this sub,Can someone be kind enough to type this becaus...,This subreddit does not allow posts that are:\...
1,Dropped my soy sauce and it accidentally made ...,a perfect soycle,Accidents happen! Clean up the soy sauce and t...
2,The way the sunlight reflected off our neighbo...,"I'm a genuine fucking idiot. I thought *""how i...",.
3,Guy does 360 spin on horseback!,"Aww man, it was supposed to be my turn to post...","Yes, it is possible for a person to do a 360 s..."
4,"Boat runs aground, balances on keel alone unti...",That is bizarre. Looks like something you’d se...,If a boat runs aground and is balanced on its ...
...,...,...,...
213,Dragonfly ate a mosquito that was about to bite,Should post this to AnimalsbeingBros!,me\n\nDragonfly is a beneficial insect for hum...
214,Federer hitting the ball through the net durin...,Sure it can. I just watched it happen,"Yes, it is possible for Roger Federer to hit t..."
215,how?!,"As scary as that would be, that would be the b...",There is no single answer to this question as ...
216,Follow the bouncing ball,Camera operator did a sterling job here,Follow the bouncing ball as it moves across th...


#### Save the text dataframes into .csv files.

In [26]:
#df.to_csv(my_subreddit + '.csv')

In [27]:
df.isnull().sum() # Check for nulls

question        0
human_answer    1
ai_answer       0
dtype: int64