In [216]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

In [217]:
# Read the raw Stack Overflow posts into a DataFrame; the path is relative
# to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../data/pytorch_posts.csv')

In [218]:
# Initialize NLTK stopwords
nltk.download('stopwords')
# Re-import the corpus reader under an alias inside this cell. The name
# `stopwords` is rebound to a plain set below, which shadows the module-level
# `from nltk.corpus import stopwords`; without the alias, running this cell a
# second time would call `.words` on the set and raise AttributeError.
from nltk.corpus import stopwords as stopwords_corpus
stopwords = set(stopwords_corpus.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wgo027\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [219]:
# Function to remove code snippets from text
def remove_code_snippets(text):
    """Delete every ``<code>...</code>`` element (tags and contents) from ``text``.

    The match is non-greedy and the ``(?s)`` flag lets ``.`` cross newlines,
    so each multi-line snippet is removed individually. Only the exact
    opening tag ``<code>`` is recognised; a tag carrying attributes is left
    untouched.

    Args:
        text: String that may contain ``<code>`` elements.

    Returns:
        ``text`` with all matched elements replaced by the empty string.
    """
    code_element = re.compile(r'(?s)<code>.*?</code>')
    return code_element.sub('', text)

In [220]:
# Function to remove non-alphanumeric characters from text
def remove_non_alphanumeric(text):
    """Replace every character that is not an ASCII letter, digit or whitespace with a space.

    Each offending character is substituted one-for-one, so runs of symbols
    become runs of spaces; non-ASCII letters are also replaced.

    Args:
        text: Input string.

    Returns:
        The string with disallowed characters turned into single spaces.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub(' ', text)

In [221]:
# Function to remove hyperlinks from text
def remove_hyperlinks(text):
    """Remove URLs from ``text``.

    A URL is recognised as either ``http://`` / ``https://`` or ``www.``
    starting at a word boundary, followed by characters up to the next
    whitespace, angle bracket or quote. Stopping at those delimiters keeps
    surrounding HTML intact (e.g. ``<a href="http://x">label</a>`` keeps its
    label and closing tag), and anchoring on the scheme / literal dot avoids
    deleting ordinary words that merely start with ``http`` or ``www``.

    Args:
        text: Input string, possibly containing HTML markup.

    Returns:
        ``text`` with each recognised URL replaced by the empty string.
    """
    # The dot after "www" is escaped so it matches a literal "." only.
    cleaned_text = re.sub(
        r'\bhttps?://[^\s<>"\']*|\bwww\.[^\s<>"\']+',
        '',
        text,
    )
    return cleaned_text

In [222]:
# Function to remove HTML tags from text
def remove_html_tags(text):
    """Return the text content of an HTML fragment with all markup removed.

    The fragment is parsed with BeautifulSoup's ``html.parser`` and its text
    is extracted with ``get_text()``. Unlike re-serialising the tree and
    stripping tags with a regex, this decodes character entities
    (``&lt;``, ``&amp;`` ...) so they do not survive as stray tokens such as
    ``lt`` or ``amp`` in later cleaning steps.

    Args:
        text: HTML fragment as a string.

    Returns:
        The concatenated text nodes of the fragment; no separator is
        inserted between adjacent elements.
    """
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

In [223]:
# Function to remove punctuation from text
def remove_punctuation(text):
    """Replace every character that is neither a word character nor whitespace with a space.

    Word characters follow the regex ``\\w`` class, so underscores and
    non-ASCII letters are kept.

    Args:
        text: Input string.

    Returns:
        The string with each punctuation character turned into one space.
    """
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub(' ', text)

In [224]:
def remove_extra_spaces(text):
    """Trim ``text`` and collapse each run of whitespace into a single space.

    Args:
        text: Input string.

    Returns:
        The string without leading/trailing whitespace, where every internal
        whitespace run (spaces, tabs, newlines) is one space character.
    """
    trimmed = text.strip()
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', trimmed)

In [225]:
def extract_tags(tag_string):
    """Turn an angle-bracketed tag string into a space-separated one.

    For example ``'<python><pytorch>'`` becomes ``'python pytorch'``.

    Args:
        tag_string: Tags written as consecutive ``<tag>`` tokens.

    Returns:
        The tag names with brackets removed and whitespace runs collapsed
        to single spaces.
    """
    # Applied in order: adjacent "><" becomes a separator first, then any
    # remaining brackets are dropped, then whitespace is normalised.
    substitutions = (
        (r'><', ' '),
        (r'>', ''),
        (r'<', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        tag_string = re.sub(pattern, replacement, tag_string)
    return tag_string

In [226]:
# Apply preprocessing steps to each text field.
# Order matters: hyperlinks and <code> elements are removed while the markup
# is still present, then the remaining HTML is stripped, then symbols and
# surplus whitespace are normalised.
fields = ['Title', 'Body']
for field in fields:
    cleaned_fields = []
    for post in data[field]:
        # Missing values become '' rather than the literal text 'nan' that
        # str() would produce for NaN.
        post = '' if pd.isna(post) else str(post)
        post = remove_hyperlinks(post)
        post = remove_code_snippets(post)
        post = remove_html_tags(post)
        post = remove_non_alphanumeric(post)
        post = remove_extra_spaces(post)
        cleaned_fields.append(post)
    data[field] = cleaned_fields

# Convert the '<tag1><tag2>' representation into 'tag1 tag2'.
splitTags = []
for post in data['Tags']:
    # Same missing-value handling as above so that no 'nan' tag is created.
    post = '' if pd.isna(post) else str(post)
    post = extract_tags(post)
    post = remove_extra_spaces(post)
    splitTags.append(post)
data['Tags'] = splitTags

  soup = BeautifulSoup(text, 'html.parser')


In [227]:
# Write the cleaned DataFrame to a new CSV next to the raw data, omitting
# the index column.
data.to_csv(path_or_buf='../data/cleaned_pytorch_posts.csv', index=False)