In [1]:
import ast
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

pd.set_option('display.max_colwidth', None)

max_pages = 10
data = [('how do you get to Chad\'s Gap from the bottom of Alta?', ['I heard ski patrol blew it up', 'my ankles are broken'])]
ns_qa = pd.DataFrame(data, columns=['question', 'answers'])
new_thread_data = [('what happened to Ian Compton?', ['he\'s in TC heaven', 'he moved back east'])]
new_thread = pd.DataFrame(new_thread_data, columns=['question', 'answers'])
ns_qa = pd.concat([ns_qa, new_thread], ignore_index=True)

def scrape_thread(page_url, ns_qa):
    question = ''
    answers = []

    # scrape each thread page
    try:
        for i in range(1, max_pages+1):
            response = requests.get(page_url)

            # collect posts
            if response.status_code == 200:
                # Parse the HTML content with BeautifulSoup
                soup = BeautifulSoup(response.text, 'html.parser')

                posts = soup.find_all('div', class_='post-body')
                # grab the op
                if i == 1:
                    question = soup.find('title').get_text().split(' -')[0] + ': ' +  posts[0].get_text(separator='\n', strip=True).split('\n')[-1]

                for post in posts[1:]:
                    lines = post.get_text(separator='\n', strip=True).split('\n')
                    # if someone's quoting another user we only want the response
                    answers.append(lines[-1])

                # check for next page if its there go to the link
                next_page_regex = rf'http.*page={i+1}"'
                matches = re.findall(next_page_regex, response.text)

                if (matches):
                    page_url = matches[0][:-1]
                else:
                    break

        # save thread contents
        new_thread_data = (question, answers)
        new_thread = pd.DataFrame(data=[new_thread_data], columns=['question', 'answers'])
        ns_qa = pd.concat([ns_qa, new_thread], ignore_index=True)

    except Exception as e:
        print(e)

    return ns_qa



In [3]:
for i in range(101, 200):
    forum_page = f"https://www.newschoolers.com/forum/1/Ski-Gabber?page={i}"

    # Send an HTTP GET request to the URL
    response = requests.get(forum_page)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all thread titles and their links
    thread_elements = soup.find_all("a", class_="thread")

    # Extract thread titles and links and store them in a list of tuples
    thread_list = [(thread.h2.get_text(strip=True), thread["href"]) for thread in thread_elements]

    # Print the list of tuples (thread title, thread link)
    for index, thread in enumerate(thread_list):
        ns_qa = scrape_thread(thread[1], ns_qa)
        # print(f'thread {index}, {thread[0]} scraped.')
    
    ns_qa['question'] = ns_qa['question'].str.replace('|', '')
    ns_qa['answers'] = ns_qa['answers'].apply(lambda x: [item.replace('|', '') for item in x])
    ns_qa.to_csv('ns_qa_p101_200.csv', index=False, mode='w')
    print(f'NS dataset, {i} pages, {len(ns_qa)} threads saved.')

NS dataset, 101 pages, 4001 threads saved.
NS dataset, 102 pages, 4041 threads saved.
NS dataset, 103 pages, 4081 threads saved.
NS dataset, 104 pages, 4121 threads saved.
NS dataset, 105 pages, 4161 threads saved.
NS dataset, 106 pages, 4201 threads saved.
NS dataset, 107 pages, 4241 threads saved.
NS dataset, 108 pages, 4281 threads saved.
NS dataset, 109 pages, 4321 threads saved.
NS dataset, 110 pages, 4361 threads saved.
NS dataset, 111 pages, 4401 threads saved.
NS dataset, 112 pages, 4441 threads saved.
NS dataset, 113 pages, 4481 threads saved.
NS dataset, 114 pages, 4521 threads saved.
NS dataset, 115 pages, 4561 threads saved.
NS dataset, 116 pages, 4601 threads saved.
NS dataset, 117 pages, 4641 threads saved.
NS dataset, 118 pages, 4681 threads saved.
NS dataset, 119 pages, 4721 threads saved.
NS dataset, 120 pages, 4761 threads saved.
NS dataset, 121 pages, 4801 threads saved.
NS dataset, 122 pages, 4841 threads saved.
NS dataset, 123 pages, 4881 threads saved.
NS dataset,

In [None]:
dset = pd.read_csv('ns_qa_p1_200.csv')

def clean_strings(answer_list):
    return [sentence.replace("\\", "") for sentence in ast.literal_eval(answer_list)]

dset['answers'] = dset['answers'].apply(clean_strings)

print(dset['question'][0], dset['answers'][0])

dset['pretoken_input'] = None

for j in range(len(dset['question'])):
    message = [{"role": "user", "content": dset['question'][j]}]
    for i, resp in enumerate(dset['answers'][j]):
        # alternate between "user" and "assistant" -> "op" and "other"
        if i % 2 == 0:
            message.append({"role": "assistant", "content": resp})
        else:
            message.append({"role": "user", "content": resp})
            
    dset.at[j, 'pretoken_input'] = message

In [None]:
hf_dset = Dataset.from_pandas(dset)
hf_dset = hf_dset.add_column('formatted_pretoken_input', [tokenizer.apply_chat_template(msg, tokenize=False) for msg in hf_dset['pretoken_input']])

hf_dset['formatted_pretoken_input'][0]

In [None]:
hf_dset = Dataset.from_pandas(dset)
hf_dset = hf_dset.add_column('formatted_pretoken_input', [tokenizer.apply_chat_template(msg, tokenize=False) for msg in hf_dset['pretoken_input']])
# hf_dset = hf_dset.add_column('tokenized_input', [tokenizer.encode(msg, padding='max_length', max_length=512, truncation=True) for msg in hf_dset['formatted_pretoken_input']]) # SFTTrainer won't let me token inputs myself
hf_dset = hf_dset.select_columns('formatted_pretoken_input')
hf_dset = hf_dset.train_test_split(test_size=0.1)
# hf_dset.save_to_disk('./hf_dset') # commented to prevent overwrite
hf_dset