# Generalizing Sentiment Analysis based on the given course URL:

## imports

In [15]:
#EDA stuff:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')


#natural language tokenizer:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
nltk.download('vader_lexicon')

# sia = SentimentIntensityAnalyzer()


#Beautiful soup use
import requests
from bs4 import BeautifulSoup

url = 'https://thecourseforum.com/course/14942/7737/'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')


# Roberta Model
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

[nltk_data] Downloading package punkt to /Users/kaylakim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kaylakim/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/kaylakim/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/kaylakim/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/kaylakim/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## web scraping

In [None]:
def get_last_page_num(url):
    response = requests.get(url)
    if response.status_code != 200:
        print("Failed to retrieve webpage")
        return 1
    
    soup = BeautifulSoup(response.content, 'html.parser')
    pagination_div = soup.find('div', class_='pagination')
    if not pagination_div:
        print('error on first')
        return 1

    page_links = pagination_div.find_all('a', href=lambda href: href and 'page=' in href)
    if not page_links:
        print('error on second')
        return 1

    last_link = page_links[-1]
    last_page_href = last_link.get('href')
    
    try:
        page_number = last_page_href.split('?page=')[1].split('#reviews')[0]
        return int(page_number)
    except (IndexError, ValueError):
        print('error on last')
        return 1


In [19]:
def scrape_all_reviews(url):
    all_reviews_final = []
    all_ratings_final = []

    last_page_num = get_last_page_num(url)
    # print(last_page_num)

    for i in range(1, (int(last_page_num) + 1)):
        new_url = url + f"?page={i}#reviews"
        # print("scraping page", new_url)

        response = requests.get(new_url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find the main container for all reviews (the <ul> list) based on the class 'review-list list-unstyled'
            review_list = soup.find('ul', class_='review-list list-unstyled')

            # Check if the review list was found
            if review_list:
                all_reviews = review_list.find_all('li')

                # print(f"Found {len(all_reviews)} reviews for page {i}.")

                for j, review_item in enumerate(all_reviews):
                    review_text_div = review_item.find('div', class_='review-text-full')
                    
                    if review_text_div:
                        review_text = review_text_div.get_text(strip=True)
                        # print(f"\n--- Review {j+1} ---\n{review_text}")
                        all_reviews_final.append(review_text)
            else:
                print("Could not find the review list container.")
                break


            # Check if the review list was found
            if review_list:
                all_ratings = review_list.find_all('li')

                for review_item in all_ratings:
                    rating_div = review_item.find('div', id='review-average')
                    
                    if rating_div:
                        rating = rating_div.get_text(strip=True)
                        # print(rating)
                        all_ratings_final.append(float(rating))
            else:
                print("Could not find the rating container.")
        else:
            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

    return all_reviews_final, all_ratings_final

## roberta model and utilization (generalization)

In [20]:
def polarity_scores_roberta(example, tokenizer, model):
    encoded_text = tokenizer(example, return_tensors='pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    return scores_dict

In [21]:
def train_roberta (df, sia):
    MODEL = f"cardiffnlp/twitter-roberta-base-sentiment" 
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)

    res = {}
    for i, row in tqdm(df.iterrows(), total=len(df)):
        try:
            text = row['review_text']
            # rating = row['rating']
            myid = row['Id']
            vader_result = sia.polarity_scores(text)

            vader_result_rename = {}
            for key, value in vader_result.items(): # rename vader
                vader_result_rename[f"vader_{key}"] = value

            roberta_result = polarity_scores_roberta(text, tokenizer, model)
            both = {**vader_result_rename, **roberta_result}
            res[myid] = both

            results_df = pd.DataFrame(res).T
            results_df = results_df.reset_index().rename(columns={'index' : 'Id'})
            results_df = results_df.merge(df, how='left')

        except RuntimeError: # stating that the roberta model breaks for examples where text is too long -> print in some way
            print(f'Broke for id {myid}')

    return results_df

In [22]:
# input: url to course, modelType (huggingface auto, built one, etc.)
# output: average compound score for course, list of review_texts, list of ratings
def general_TCF_Sentiment_analyzer(url):
    all_reviews_list = []
    all_ratings_list = []
    all_reviews_list, all_ratings_list = scrape_all_reviews(url)
    # print(len(all_ratings_list), len(all_reviews_list))

    future_id = list(range(len(all_reviews_list)))
    data = {'Id': future_id, 'review_text': all_reviews_list, 'rating': all_ratings_list}
    df = pd.DataFrame(data)

    sia = SentimentIntensityAnalyzer()
    
    results_df_passed = train_roberta(df, sia)

    list_compounds = results_df_passed['vader_compound']
    average_compound_score = list_compounds.mean()


    # return average_compound_score, all_reviews_list, all_ratings_list
    return average_compound_score, all_reviews_list, all_ratings_list # TEMP

average_compound_score, all_reviews_list, all_ratings_list = general_TCF_Sentiment_analyzer(url)
print(average_compound_score) # 0.6078586206896552 should be for cso1


  0%|          | 0/29 [00:00<?, ?it/s]

0.6078586206896552


## Application

In [23]:
url = "https://thecourseforum.com/course/15227/4710/"
average_compound_score_dmt2, new_reviews_dmt2, new_ratings_dmt2 = general_TCF_Sentiment_analyzer(url)
print(average_compound_score_dmt2)

# for each in new_reviews_dmt2:
#     print(each)

  0%|          | 0/13 [00:00<?, ?it/s]

Broke for id 9
0.34205833333333335


In [24]:
url = "https://thecourseforum.com/course/15790/13128/"
average_compound_score_compt_prob, new_reviews_comp_prob, new_ratings_comp_prob = general_TCF_Sentiment_analyzer(url)
print(average_compound_score_compt_prob)

# for each in new_ratings_comp_prob:
#     print(each)


error on second


  0%|          | 0/1 [00:00<?, ?it/s]

-0.7747


In [25]:
url = "https://thecourseforum.com/course/748/691/"
average_compound_score_econ, new_reviews_econ, new_ratings_econ = general_TCF_Sentiment_analyzer(url)
print(average_compound_score_econ)

  0%|          | 0/413 [00:00<?, ?it/s]

0.5659292978208232


In [28]:
# url = "https://thecourseforum.com/course/15909/11449/"
# average_compound_score_ml1, new_reviews_ml1, new_ratings_ml1 = general_TCF_Sentiment_analyzer(url)
# print(average_compound_score_ml1)

## ISSUES

**1.**

[RESOLVED: error in not providing local soup to get_last_page_num()] So, the compound scores were found to be accurate through the courseForumWebscrape.ipynb file. However it's duplicating some of the reviews and ratings in the list. The important thing is that the compound scores (average and individuals) are accurate, and that there are equal number of ratings and reviews per course. 

I suspect this issue of duplicating stuff, as seen if you uncomment the last code cell above from comp prob, is that sometimes people rate courses, but don't leave a review. So the loop in scrape_all_reviews will not detect a review, and use the last found review on the rating found. This is due to, then, how we web scrape and not the sentiment analysis model itself.

That scrape_all_reviews method could be simplified and "debugged" in the future. However, for the purposes of this project, we've compounded the sentiment scores and can list the individual review sentiment analysis scores if we needed.

<br><br>

**2.**

If you uncomment the last cell about ml1, there'll be an error. Currently, this course doesn't have any reviews or ratings. I could easily go in and fix this, but I haven't gotten the chance.

I suspect I'd provide get_last_page_num a check for if there are any reviews or ratings. If either don't exist (len(ratings)==0 | len(reviews)==0), just return or print for debugging. This might break something that was intentional in this project, so I'm leaving it for now and to fix later when the chance arises.
<br><br>
