In [None]:
import re

import pandas as pd
import praw

In [None]:
# Build the PRAW client used by every Reddit request further down.
# NOTE(review): client_id, secret_key, password and user_agent are not defined
# in the visible cells -- they must already exist in the kernel; confirm where
# they are loaded from (and that they are not hardcoded in the notebook).
# NOTE(review): a password is supplied without a username -- confirm whether a
# username comes from praw.ini or whether read-only access is intended.
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=secret_key,
    password=password,
    user_agent=user_agent,
)

In [None]:
# Empty frame with one column per submission attribute collected below.
submission_fields = ("id", "title", "created", "created_utc", "score")
df = pd.DataFrame(columns=list(submission_fields))

In [None]:
#getting top submissions in the cycling subreddit
# Records are gathered in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration. Rebuilding `df` from scratch also means that
# re-running this cell does not pile up duplicate rows.
# limit=None asks PRAW for as many entries as the listing will return.
submission_records = [
    {
        "id": submission.id,
        "title": submission.title,
        "created": submission.created,
        "created_utc": submission.created_utc,
        "score": submission.score,
    }
    for submission in reddit.subreddit("cycling").top(limit=None)
]
# Explicit columns keep the expected schema even when nothing was returned.
df = pd.DataFrame(
    submission_records,
    columns=["id", "title", "created", "created_utc", "score"],
)

In [None]:
# Keep only the first row seen for each submission id.
df = df.drop_duplicates(subset="id", keep="first")

In [None]:
#turning the epoch-seconds 'created' column into datetime
# Parse from numeric values: calling to_datetime(..., unit='s') on strings
# relies on pandas silently re-parsing them as floats, which recent pandas
# releases deprecate. to_numeric also copes with the column already being
# text, so the cell can be re-run safely.
created_seconds = pd.to_numeric(df['created'])
df['Converted_Date_2'] = pd.to_datetime(created_seconds, unit='s')
# The 'created' column is still stored as text afterwards, as before.
df['created'] = df['created'].astype('str')

In [None]:
#Iterating through the submission ids in order to get all comments attached to each one
#and collecting those comments into a new dataframe
# Read the ids BEFORE `df` is replaced by the comments frame. Previously `df`
# was overwritten with an empty frame first and the loop iterated over its
# column names ("id", "Comment") instead of over submission ids.
# drop_duplicates keeps each submission to a single fetch, including when the
# cell is re-run on the comments frame (whose "id" column repeats per comment).
submission_ids = df["id"].drop_duplicates().tolist()

comment_records = []
for submission_id in submission_ids:
    submission = reddit.submission(submission_id)
    # Resolve every "load more comments" placeholder so the forest is complete.
    submission.comments.replace_more(limit=None)
    # .list() flattens the forest breadth-first: top-level comments first, then
    # their replies -- the same order as a manual pop(0)/extend queue.
    for comment in submission.comments.list():
        comment_records.append({"id": submission_id, "Comment": comment.body})

# Build the frame once (DataFrame.append was removed in pandas 2.0); explicit
# columns keep the schema when no comments were found.
df = pd.DataFrame(comment_records, columns=["id", "Comment"])

In [None]:
#making a list of bike brands
# Each brand appears once; repeated entries ("Fuji", "Scott", "Yeti",
# "Felt"/"felt") were removed, keeping the first occurrence and its spelling.
manufacturers = [
    "Giant", "Trek", "Specialized", "Cannondale", "Scott",
    "Santa Cruz", "Bianchi", "Merida", "GT", "BMC", "Cube",
    "Kona", "Fuji", "Orbea", "Colnago", "Pinarello", "Cervélo", "Focus",
    "Rocky Mountain", "Wilier", "Raleigh", "Felt", "Yeti",
    "Marin", "Norco", "Pivot", "Salsa", "Ibis", "Diamondback",
    "Devinci", "Jamis", "Ghost", "Canyon",
    "Gazelle", "Look", "Ridley", "Santana", "Surly", "Breezer",
    "Cinelli", "De Rosa", "Litespeed", "Masi", "Time",
    "Van Nicholas", "Eddy Merckx", "huffy", "schwinn",
    "transition", "sixthreezero", "electra", "evil", "niner", "mongoose", "6ku",
]

In [None]:
#setting brands to lowercase and dropping duplicates
# dict.fromkeys removes repeats while keeping first-seen order. list(set(...))
# produced an order that can change from one interpreter run to the next
# (string hashing is randomised), and the brand lookup further down returns
# the FIRST brand in this list that matches, so the order must be stable.
manufacturers = list(
    dict.fromkeys(word.lower() for word in manufacturers if isinstance(word, str))
)


In [None]:
# Lowercase every comment so it can be compared with the lowercase brand list.
def _lowercase_comment(value):
    """Lowercase each whitespace-separated word and re-join with single spaces.

    Non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    return ' '.join(token.lower() for token in str(value).split())


df['Comment'] = df['Comment'].apply(_lowercase_comment)

In [None]:
#extracting comments that contain any of the bike brands
# One compiled pattern per brand. The lookarounds require the brand to stand
# alone as a word, so short names such as "gt" no longer match inside words
# like "length". Ordinary words that are also brand names ("time", "look",
# "felt") will still match.
brand_patterns = [
    (brand, re.compile(r'(?<!\w)' + re.escape(brand) + r'(?!\w)'))
    for brand in manufacturers
]


def _first_brand(text):
    """Return the first brand (in `manufacturers` order) mentioned in `text`.

    Returns None when no brand is mentioned or when `text` is not a string,
    so non-text rows are skipped explicitly rather than via a swallowed
    TypeError.
    """
    if not isinstance(text, str):
        return None
    return next(
        (brand for brand, pattern in brand_patterns if pattern.search(text)),
        None,
    )


# One pass over the comments; the result is None wherever nothing matched.
brand_hits = df['Comment'].map(_first_brand)

# Build the result in a single step. Previously `matching_df` was appended to
# without ever being created, and DataFrame.append no longer exists in pandas 2.
# reset_index gives the matching rows a fresh 0..n-1 index.
matching_df = (
    df.assign(brand=brand_hits)
    .loc[brand_hits.notna()]
    .reset_index(drop=True)
)

In [None]:
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
import numpy as np
from tqdm import tqdm, trange

In [None]:
# Checkpoint name shared by the tokenizer and the sequence classifier, so the
# two are always loaded from the same pretrained model.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# Score every comment with polarity_scores_roberta and write each entry of the
# returned mapping into a column of the same name on that comment's row.
# NOTE(review): polarity_scores_roberta is not defined in the visible cells --
# it must already exist in the kernel; confirm where it comes from.
# NOTE(review): this scores every row of df; the brand-filtered matching_df is
# not used here -- confirm that is intended.

for row_label, record in tqdm(df.iterrows(), total=len(df)):
    try:
        scores = polarity_scores_roberta(record['Comment'])

        # One cell write per returned key; missing columns are created on the fly.
        for column, score in scores.items():
            df.at[row_label, column] = score

    except (RuntimeError, IndexError):
        # Rows that fail with either error are skipped, with a notice printed.
        print('Missed one')

In [None]:
# Write the scored comments to disk; the row index is left out of the file.
df.to_csv(path_or_buf='all_sentiments.csv', index=False)