# 1. Import Dependencies

In [1]:
#Depending on which environment you're in, you may not have to run this. All dependencies are native to Jupyter Notebooks, 
#for example

#!pip install transformers requests beautifulsoup4 pandas numpy
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import bs4 as bs
import urllib.request
import pandas as pd
import matplotlib.pyplot as plt

# 2. Define Scraping Function
Input URL (can later be looped through with a list of post URLs to scrape,) returning a database of content WITH dates.

Could also just append content to existing dataframe


In [None]:
#content_url = 'https://forums.studentdoctor.net/threads/using-ai-to-generate-research-publications.1486320/'
#html = requests.get(content_url)
#soup = BeautifulSoup(html.text, 'html.parser')
#regex = re.compile('.*message.*')
#result_date = soup.find_all('time')
#result_text = soup.find_all('div', attrs={'class':'bbWrapper'})

#From this, result_date[].text and result_text[].text give the date of the post and the content of the post

# 3. Obtain URLs of every post on a SDN Subpage
This segment produces a list, titled full_post_urls, which contains the URL of the posts on pages in the defined range. This list will later be iterated through with another function to scrape post contents and timestamps into a pd dataframe.

In [2]:
base_url = 'https://forums.studentdoctor.net/'
forum_url_template = 'https://forums.studentdoctor.net/forums/radiology.42/page-{}' #change this to whatever board you would like to scrape, but include the /page-{} to allow for iteration through all the pages of the forum
n=2 #this is the number of pages of posts you would like to scrape. Check the webpage directly to see how many pages there are
full_post_urls = []

for page_number in range(1, n):
    board_url = forum_url_template.format(page_number)
    board_html = requests.get(board_url)
    board_soup = BeautifulSoup(board_html.text, 'html.parser')
    posts = board_soup.find_all('div', attrs={'class':'structItem-title'})

    for url in posts:
        link = url.find('a')['href']
        full_link = base_url + link
        full_post_urls.append(full_link)

In [4]:
len(full_post_urls) #this will return the number of total URLs to posts that will later be scraped

55

In [7]:
for url in full_post_urls: #if you would like to see the list of the URLs, you can run this
    print(url)

https://forums.studentdoctor.net//threads/faq-what-are-my-chances.283794/
https://forums.studentdoctor.net//threads/radiology-faculty-answering-questions-ama.1245595/
https://forums.studentdoctor.net//threads/reccomended-booklist-for-radiology-residents.368548/
https://forums.studentdoctor.net//threads/faq2-what-to-look-for-in-a-radiology-program.286095/
https://forums.studentdoctor.net//threads/general-questions-about-applying-to-radiology-read-the-first-faq-first.558320/
https://forums.studentdoctor.net//threads/malignant-programs.1174577/
https://forums.studentdoctor.net//threads/psa-gpt-4.1476965/
https://forums.studentdoctor.net//threads/chances-for-dr-re-applicant-current-pgy-1.1486631/
https://forums.studentdoctor.net//threads/concerning-trend-radiology-has-the-slowest-compensation-growth-of-any-specialty.1477508/
https://forums.studentdoctor.net//threads/late-low-step-2.1487329/
https://forums.studentdoctor.net//threads/anyone-taken-the-abr-nuclear-radiology-medicine-subspecial

# 4. Run scraping function and create database
Here we define and run the function that actually scrapes post content and timestamps, and places the data into a dataframe.

In [9]:
#This uses beautiful soup to parse the HTML from a single SDN post URL and returns the timestamp and text for each comment on that page. 
def extract_forum_data(url):
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    result_date = [time['datetime'] for time in soup.find_all('time')]
    result_text = [div.get_text(separator=' ', strip=True) for div in soup.find_all('div', attrs={'class':'bbWrapper'})]
    return list(zip(result_date, result_text))

In [10]:
#loop through all of the urls in full_post_urls, adding them to a single list of data called all_data

all_data = [ ]
for url in full_post_urls:
  all_data.extend(extract_forum_data(url))

In [11]:
df = pd.DataFrame(all_data, columns=['Date','Content']) #Create the pd df and name the columns

In [26]:
df.head(10) #show head/tail of the dataframe

Unnamed: 0_level_0,Content,Sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2006-05-19 15:10:33+00:00,I think the majority of applicants have a gene...,1.0
2006-05-19 15:10:33+00:00,dear Hans You have solved problems of many peo...,1.0
2006-05-19 17:39:25+00:00,doc_radio said: dear Hans You have solved prob...,0.5
2006-05-19 22:32:46+00:00,What are my chances of matching? Advice? -----...,0.5
2006-05-25 00:06:46+00:00,"May I suggest that the moderator ""sticky"" this...",0.5
2006-05-25 11:01:53+00:00,hans19 said: I think the majority of applicant...,1.0
2006-05-27 06:59:07+00:00,If you guys wanna nominate anything else for t...,0.5
2006-05-30 14:27:11+00:00,Neuronix said: If you guys wanna nominate anyt...,0.5
2006-05-30 15:06:32+00:00,That sort of thing is over my head. I would PM...,0.0
2006-05-30 16:35:39+00:00,bigfrank said: Can voxel as moderator be remov...,0.0


In [46]:
#Saving df to csv is highly recommended. If the kernel fails, you will lose the dataframe unless it is saved to a csv

raw_df = 'SDNRads.csv' #change this to the name you want for the csv CSV for the raw data
df.to_csv(raw_df, index=False) #Save the dataframe as csv

# 5. Run Sentiment Analysis on DB
This section goes through the df and runs a huggingface sentiment analysis mode and produces a numerical value

In [15]:
#This assigns tokenizer and model to be used, this can be changed to test different models.

tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')

In [16]:
#This defines the function that will iterate through the dataframe and return the sentiment analysis

def sentiment_score(review):
    tokens = tokenizer.encode(review, return_tensors='pt')
    result = model(tokens)
    return int(torch.argmax(result.logits))/2 #dividing by two gives a sentiment score from 0-1 instead of 0-2 for this model

In [18]:
df['Sentiment'] = df['Content'].apply(lambda x: sentiment_score(x[:512]))

In [44]:
df.head(5) #if you want to look at the top of the current dataset and confirm sentiment is present

Unnamed: 0_level_0,Content,Sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2006-05-19 15:10:33+00:00,I think the majority of applicants have a gene...,1.0
2006-05-19 15:10:33+00:00,dear Hans You have solved problems of many peo...,1.0
2006-05-19 17:39:25+00:00,doc_radio said: dear Hans You have solved prob...,0.5
2006-05-19 22:32:46+00:00,What are my chances of matching? Advice? -----...,0.5
2006-05-25 00:06:46+00:00,"May I suggest that the moderator ""sticky"" this...",0.5


In [None]:
sentiment_df = 'SDNRads_sentiment.csv' #again, change the title here if you want to save the new df as something else
df.to_csv(sentiment_df, index=False)

In [5]:
df = pd.read_csv('SDNRads_sentiment.csv') #run this to reload the above dataframe if the kernel crashes

# 6. Averaging the data across a timeframe
For the full datasets of 20+ years, monthly averages tend to graph much better than daily averages, but this will depend on what information you are hoping to glean from the data and how you would like to present it.

In [None]:
df['Date'] = pd.to_datetime(df['Date'], utc=True) #converts the Date column to Datetime format

In [None]:
df.set_index('Date', inplace=True) #resets the index of the column to the datetime

In [8]:
def resample_data(df, freq):
    return df['Sentiment'].resample(freq).mean()

#this defines a function to create a new dataframe of the average sentiment across a given timeframe

In [58]:
weekly_avg = resample_data(df, 'W') #this will give you a new pd df on the weekly timeframe. 'W', 'M', or 'Y' can be subbed in for other timelines

In [59]:
weekly_avg.index = weekly_avg.index.strftime('%Y-%m-%d') #this cleans up the Date row to drop hour:minute:second

In [60]:
weekly_avg.head #showing the sentiment by week, there are a lot of missing days early on which can be dropped if desired.

<bound method NDFrame.head of Date
2022-05-08    0.500000
2022-05-15         NaN
2022-05-22         NaN
2022-05-29         NaN
2022-06-05         NaN
                ...   
2023-09-24    0.500000
2023-10-01    0.500000
2023-10-08    0.348485
2023-10-15    0.500000
2023-10-22    0.500000
Name: Sentiment, Length: 77, dtype: float64>

In [68]:
#This helps clean up a lot of the empty weeks. This wouldn't be a problem if we do a full scrape, but since this is a small 
#sample, the stickied posts cause some problems with the timeline of the data unless we clean it up a bit
#weekly_avg = weekly_avg.iloc[45:]
weekly_avg.head

<bound method NDFrame.head of Date
2023-03-19    0.350000
2023-03-26    0.383333
2023-04-02    0.250000
2023-04-09    0.500000
2023-04-16    0.785714
2023-04-23         NaN
2023-04-30         NaN
2023-05-07    0.500000
2023-05-14         NaN
2023-05-21    0.500000
2023-05-28    0.362500
2023-06-04    0.222222
2023-06-11    0.500000
2023-06-18    0.625000
2023-06-25    0.318182
2023-07-02    0.590909
2023-07-09    0.425000
2023-07-16    0.578947
2023-07-23    0.468750
2023-07-30    0.200000
2023-08-06    0.625000
2023-08-13    0.600000
2023-08-20    0.200000
2023-08-27    0.550000
2023-09-03    0.500000
2023-09-10    0.692308
2023-09-17    0.464286
2023-09-24    0.500000
2023-10-01    0.500000
2023-10-08    0.348485
2023-10-15    0.500000
2023-10-22    0.500000
Name: Sentiment, dtype: float64>

In [70]:
weekly_avg.to_csv('Rads_weekly.csv', index=True)

# 7. Graphing the data
There is a problem with the libraries interacting here... so this needs to be run in a different notebook or the kernel will crash. Lame, but it is what it is

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

weekly_avg = pd.read_csv('Rads_weekly.csv', index_col=0)

weekly_avg.plot(figsize=(10,6))
plt.title('Monthly Average Sentiment of Student Doctor Network EM Board Posts')
plt.ylabel('Monthly Average Sentiment from 0 (negative) to 2 (positive)')
plt.xlabel('Month')
plt.grid(True)
plt.show()