In [1]:
import pandas as pd
import os
import re


def parse_file(file_path):
    """Extract the tagged review fields from a single corpus file.

    The file is decoded as ASCII and any byte that cannot be decoded is
    dropped. For each of the tags STARS, TITLE, DATE, AUTHOR, PRODUCT and
    REVIEW, the text between the first ``<TAG>`` and the nearest following
    ``</TAG>`` is taken (it may span several lines) and stripped of
    surrounding whitespace.

    Args:
        file_path: Path of the file to read.

    Returns:
        A dict keyed by the lower-cased tag names, in the order listed
        above. A tag that does not occur in the file maps to the string
        "Unknown".
    """
    with open(file_path, mode="r", encoding="ascii", errors="ignore") as handle:
        text = handle.read()

    parsed = {}
    for tag in ("STARS", "TITLE", "DATE", "AUTHOR", "PRODUCT", "REVIEW"):
        pattern = "<{}>(.*?)</{}>".format(tag, tag)
        # DOTALL lets "." cross newlines so multi-line values are captured.
        found = re.search(pattern, text, re.DOTALL)
        if found is None:
            parsed[tag.lower()] = "Unknown"
        else:
            parsed[tag.lower()] = found.group(1).strip()
    return parsed


def parse_directory(directory_path):
    """Parse every ``.txt`` file directly inside a directory into a DataFrame.

    Each matching file is handed to ``parse_file`` and its result becomes
    one row. Files are processed in sorted name order so that the row order
    is the same on every run and platform.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A ``pandas.DataFrame`` with one row per parsed file; empty when the
        directory contains no ``.txt`` files.

    Raises:
        OSError: If ``directory_path`` cannot be listed.
    """
    # os.listdir returns names in arbitrary order; sort for reproducibility.
    candidate_paths = [
        os.path.join(directory_path, name)
        for name in sorted(os.listdir(directory_path))
        if name.endswith(".txt")
    ]
    # A sub-directory whose name ends in ".txt" cannot be opened as a file,
    # so only regular files are parsed.
    data = [parse_file(path) for path in candidate_paths if os.path.isfile(path)]
    return pd.DataFrame(data)


# Corpus locations: one folder of review files per class
SARCASM_AMAZON_REVIEWS_REGULAR_DIR_PATH = "../datasets/SarcasmAmazonReviewsCorpus-master/Regular"
SARCASM_AMAZON_REVIEWS_IRONIC_DIR_PATH = "../datasets/SarcasmAmazonReviewsCorpus-master/Ironic"

# Load each folder and tag its rows with the label (1 = ironic, 0 = regular)
ironic_df = parse_directory(SARCASM_AMAZON_REVIEWS_IRONIC_DIR_PATH).assign(is_sarcastic=1)
regular_df = parse_directory(SARCASM_AMAZON_REVIEWS_REGULAR_DIR_PATH).assign(is_sarcastic=0)

# Stack both classes (ironic rows first) under a fresh 0..n-1 index
amazon_combined = pd.concat([ironic_df, regular_df], ignore_index=True)

# Persist the combined dataset as both parquet and csv
amazon_combined.to_parquet("../datasets/amazon_combined.parquet")
amazon_combined.to_csv("../datasets/amazon_combined.csv", index=False)

# Notebook display of the first rows
amazon_combined.head()

Unnamed: 0,stars,title,date,author,product,review,is_sarcastic
0,1.0,"Listening to this ""Hurt"" me!","November 8, 2007","MomKKC ""momkkc""",The Sun Also Rises (Audio CD),William Hurt cannot read. At all. The cadenc...,1
1,1.0,"40% price hike, hmm","April 15, 2010",M. Barnhart,"Heineken BT06 BeerTender Tubes, Pack of 6 (Kit...","As another reviewer noted, these used to be 10...",1
2,5.0,Don't Mess With the Lupine Trinity!!!,"June 2, 2010",Jake &#34;The Wolfman&#34; Sanchez,The Mountain Three Wolf Moon Short Sleeve Tee ...,I've read several reviews from people who have...,1
3,1.0,IT'S A BLENDER!,"June 17, 2010",S. Cashdollar,Margaritaville DM1000 Frozen Concoction Maker ...,If you pay $250 for this blender you need your...,1
4,1.0,Another movie to ignore....,"April 24, 2010","Kody ""ParisHiltonFan""",Valentine's Day (DVD),A perfect date movie: you'll miss absolutely n...,1
