In [None]:
!pip install praw
!pip install openai

Collecting praw
  Downloading praw-7.7.1-py3-none-any.whl (191 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m191.0/191.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting prawcore<3,>=2.1 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Collecting update-checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update-checker, prawcore, praw
Successfully installed praw-7.7.1 prawcore-2.4.0 update-checker-0.18.0


In [17]:
import praw
import openai
import random
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import BertTokenizer, BertModel
import torch
import json
import torch
from transformers import AutoModelForSequenceClassification, AutoConfig
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import re



secret_key = 'YOUR_SECRET_KEY'
client_id = 'YOUR_CLIENT_ID'
user_agent = 'YOUR_USER_AGENT'



# Initialize Reddit API with appropriate credentials
# reddit = praw.Reddit(client_id='YOUR_CLIENT_ID',
#                      client_secret='YOUR_CLIENT_SECRET',
#                      user_agent='YOUR_USER_AGENT')

reddit = praw.Reddit(client_id=client_id,
                        client_secret=secret_key,
                        user_agent=user_agent)

# Define function to scrape Reddit posts and comments
def scrape_reddit_posts(reddit, subreddits, limit):
    posts = []
    for subreddit_each in subreddits:
        try:
            for post in reddit.subreddit(subreddit_each).top(limit=limit):
                post_data = {'title': post.title, 'text': post.selftext, 'comments': []}
                if any(keyword in post.title.lower() or keyword in post.selftext.lower() for keyword in ['clinical trial','health','trials', 'clinical research', 'health condition', 'medical research']):
                  posts.append(post_data)
                for comment in post.comments:
                    if not any(keyword in comment.body.lower() for keyword in ['clinical trial', 'health condition', 'medical research']):
                        # Mask confidential data in comment body
                        comment_body = mask_confidential_data(comment.body)
                        post_data['comments'].append({'text': comment_body})
                posts.append(post_data)
        except Exception as e:
            print('Error processing subreddit {}: {}'.format(subreddit_each, e))
    return posts

# Define function to mask confidential data
def mask_confidential_data(text):
    # Define patterns for sensitive information (e.g., email addresses, phone numbers)
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    phone_pattern = r'\b(?:\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b'

    # Replace sensitive information with placeholder text
    masked_text = re.sub(email_pattern, '[EMAIL]', text)
    masked_text = re.sub(phone_pattern, '[PHONE]', masked_text)

    return masked_text

def analyze_sentiment(model, text, tokenizer):
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=400, add_special_tokens=True)
    output = model(**encoded_input)
    predicted_label = torch.argmax(output.logits)
    sentiment_classes = ["Negative", "Neutral", "Positive"]
    predicted_sentiment = sentiment_classes[predicted_label]
    return predicted_sentiment

def generate_messages(sentiments, message_templates):
    messages = []
    openai.api_key = "YOUR_OPENAI_KEY"
    try:
        for sentiment in sentiments:
          if sentiment['post_sentiment'] == 'Positive':
              prompt = message_templates[2]['prompt']
          elif sentiment['post_sentiment'] == 'Negative':
              prompt = message_templates[0]['prompt']
          else:
              prompt = message_templates[1]['prompt']
          response = openai.completions.create(
              model="gpt-3.5-turbo-instruct",
              prompt=prompt,
              temperature=0.5,
              top_p=1
          )
          post_message = response['choices'][0]['text']
          comment_messages = []

          for comment_sentiment in sentiment['comments_sentiments']:
            comment_prompt = 'Write a personalized message that should be crisp and clear by analyzing this content keep this short in 3 to 4 sentences <'+comment_sentiment['text']+'> '

            if comment_sentiment['sentiment'] == 'Positive':
                comment_prompt +=message_templates[2]['prompt']
            elif comment_sentiment['sentiment'] == 'Negative':
                comment_prompt+= message_templates[0]['prompt']
            else:
                comment_prompt = message_templates[1]['prompt']

            response = openai.completions.create(
            model="gpt-3.5-turbo-instruct",
            messages=[
            {
              "role": "user",
              "content": comment_prompt
            }
            ],
            prompt=comment_prompt,
            temperature=0.5,
            top_p=1
          )
          comment_message = response['choices'][0]['text']
          comment_messages.append({'Actual_comment':comment_sentiment['text'],'Generated_message': comment_message, 'sentiment': comment_sentiment['sentiment']})
          messages.append({'post_message': post_message, 'post_sentiment': sentiment['post_sentiment'], 'comments_messages': comment_messages})
    except Exception as e:
        print("OpenAI API error:", e)
    return messages

def main():
    # Model_name = 'bert-base-uncased'

    MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    config = AutoConfig.from_pretrained(MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    # tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
    # Model_name = 'dmis-lab/biobert-base-cased-v1.2'
    # tokenizer = BertTokenizer.from_pretrained(Model_name, do_lower_case=True)
    # tokenizer =
    # model = BertForSequenceClassification.from_pretrained(Model_name, num_labels=3)
    subreddits = ["MedicalResearch", "ClinicalTrials", "clinicaltrials", "Diseasemanagement", "Healthcare", "healthcare", "science", "askscience","AskScience"]
    reddit = praw.Reddit(client_id=client_id,
                        client_secret=secret_key,
                        user_agent=user_agent)

    posts = scrape_reddit_posts(reddit, subreddits, 20)
    sentiments = []
    for post in posts:
        text = post['title'] + ' ' + post['text']
        post_sentiment = analyze_sentiment(model, text, tokenizer)
        post['sentiment'] = post_sentiment
        post_comments_sentiments = []
        for comment in post['comments']:
            comment_sentiment = analyze_sentiment(model, comment['text'], tokenizer)
            comment['sentiment'] = comment_sentiment
            post_comments_sentiments.append({'text': comment['text'], 'sentiment': comment_sentiment})
        sentiments.append({'post_title': post['title'], 'post_sentiment': post_sentiment, 'comments_sentiments': post_comments_sentiments})
    with open('reddit_data.json', 'w') as f:
        json.dump(posts, f)
    with open('reddit_sentiments.json', 'w') as f:
        json.dump(sentiments, f)
    message_templates = [
        {
            "sentiment": "negative",
            "prompt": "addressing concerns about clinical trials, providing reassurance and support, acknowledging concerns respectfully, and offering avenues for further information or support. Avoid dismissing concerns and prioritize participant well-being in all communications"
        },
        {
            "sentiment": "neutral",
            "prompt": "providing information about clinical trials, highlighting the benefits and risks. convey how this could benefit the outcomes of medical research try to attract him/her for clinical trials.provide clear and balanced information about clinical trials. Avoid overly promotional language and ensure participants understand both the benefits and risks involved"
        },
        {
            "sentiment": "positive",
            "prompt": "encouraging someone to participate in a clinical trial, highlighting the potential impact on their health and the benefits of contributing to medical research., express gratitude while ensuring transparency about the clinical trial process. Avoid making promises or minimizing potential risks, and emphasize the importance of informed consent."
        }
    ]
    messages = generate_messages(sentiments, message_templates)
    with open('messages.json', 'w') as f:
        json.dump(messages, f)

if __name__ == '__main__':
    main()


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.

Error processing subreddit Diseasemanagement: Redirect to /subreddits/search


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Error processing subreddit science: 'MoreComments' object has no attribute 'body'


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Error processing subreddit askscience: 'MoreComments' object has no attribute 'body'


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Error processing subreddit AskScience: 'MoreComments' object has no attribute 'body'
OpenAI API error: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}


In [13]:
!pip freeze > requirements.txt

In [18]:
!pip install session-info

Collecting session-info
  Downloading session_info-1.0.0.tar.gz (24 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting stdlib_list (from session-info)
  Downloading stdlib_list-0.10.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.8/79.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: session-info
  Building wheel for session-info (setup.py) ... [?25l[?25hdone
  Created wheel for session-info: filename=session_info-1.0.0-py3-none-any.whl size=8026 sha256=210e7ab7920819de50cf0a5a53f2a11bef16d04bcaafa20be80f927a18742417
  Stored in directory: /root/.cache/pip/wheels/6a/aa/b9/eb5d4031476ec10802795b97ccf937b9bd998d68a9b268765a
Successfully built session-info
Installing collected packages: stdlib_list, session-info
Successfully installed session-info-1.0.0 stdlib_list-0.10.0


In [22]:
!pip freeze > requirements.txt
!pip list --format=freeze > requirements.txt