In [37]:
import pandas as pd
import re
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoConfig


In [38]:
# Input parquet file that is read into `df` below.
filename = 'yelp_reviews_food_tenth.parquet'

# Load the dataset into a Pandas DataFrame
df = pd.read_parquet(filename)

In [39]:
# Discard every row that has a missing value in at least one column.
df = df.dropna(axis='index', how='any')

In [40]:
def _normalize_review(raw):
    """Apply the notebook's text normalisation steps to one review string."""
    # Drop URLs first, then @mentions.
    without_urls = re.sub(r"http\S+", "", raw)
    without_mentions = re.sub(r"@\S+", "", without_urls)
    # Lowercase, then delete everything that is not a letter, digit or whitespace.
    alnum_only = re.sub(r"[^a-zA-Z0-9\s]", "", without_mentions.lower())
    # Trim leading and trailing whitespace.
    return alnum_only.strip()


# Single pass over the column instead of one pass per step.
df['text'] = df['text'].apply(_normalize_review)



In [41]:
# Show the first remaining review. Positional access is used because rows were
# dropped earlier without resetting the index, so the label 0 is not
# guaranteed to exist any more.
df['text'].iloc[0]

'love going here for happy hour or dinner  great patio with fans to beat the stl heat   alsovery accomodating at this location  i like the veal milanese but with mixed greens instead of pasta  theyll modify the menu to suit your taste'

In [42]:
stop_words = stopwords.words('english')
# Hashed copy used for membership tests so each lookup is O(1); `stop_words`
# itself is left as the original list.
_stop_word_lookup = frozenset(stop_words)


def clean_text(text):
    """Remove English stopwords from ``text``.

    Parameters
    ----------
    text : str or iterable of str
        Text to filter. An iterable of string fragments is concatenated
        (without a separator) before filtering; a plain string is used as is.

    Returns
    -------
    str
        The remaining whitespace-separated words joined by single spaces.

    Notes
    -----
    Matching is exact and case-sensitive against the NLTK list.
    NOTE(review): the sample output in this notebook still contains 'theyll',
    so contractions whose apostrophe was stripped upstream are presumably not
    matched -- confirm whether that is intended.
    """
    # ''.join() leaves a str unchanged and concatenates an iterable of pieces.
    text = ''.join(text)

    # split() with no argument also collapses runs of whitespace.
    kept_words = [word for word in text.split() if word not in _stop_word_lookup]

    # Join back the words
    return " ".join(kept_words)

In [43]:
# Try the function on the first remaining review (positional access, because
# the index label 0 may have been dropped earlier).
clean_text(df['text'].iloc[0])

'love going happy hour dinner great patio fans beat stl heat alsovery accomodating location like veal milanese mixed greens instead pasta theyll modify menu suit taste'

In [44]:
# Derive the stopword-free column from the normalised text, one review at a time.
df['text_cleaned'] = df['text'].map(clean_text)


In [45]:
# Save as checkpoint: write the frame (now including 'text_cleaned') to
# parquet, leaving the row index out of the file.
df.to_parquet('yelp_food_reviews_cleaned.parquet', index=False)

In [46]:
# Define preprocessing functions

def remove_stopwords(tokens):
    """Return the tokens that are not NLTK English stopwords.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter; comparison is exact and case-sensitive.

    Returns
    -------
    list of str
        The surviving tokens in their original order.
    """
    # Fetch the stopword list once per call (it used to be re-evaluated for
    # every token) and hash it for constant-time membership tests.
    english_stopwords = set(stopwords.words('english'))
    return [token for token in tokens if token not in english_stopwords]

def lemmatize(tokens):
    """Lemmatize every token with a fresh ``WordNetLemmatizer``.

    Each token is passed to ``lemmatize`` with no part-of-speech argument.
    Returns a new list with one lemma per input token, in order.
    """
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, tokens))

def stem(tokens):
    """Stem every token with a fresh ``PorterStemmer``.

    Returns a new list with one stem per input token, in order.
    """
    porter = PorterStemmer()
    stems = []
    for token in tokens:
        stems.append(porter.stem(token))
    return stems

# Only the dependency parser is switched off. The entity recogniser must stay
# enabled, because ner() below reads doc.ents, which is always empty when the
# 'ner' component is disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser'])


def pos_tagger(tokens):
    """Tag already-tokenised words with spaCy part-of-speech labels.

    Parameters
    ----------
    tokens : iterable of str
        The words of one document, in order. spaCy's own tokenizer is not
        applied; the words are used exactly as given.

    Returns
    -------
    list of (str, str)
        One ``(token text, token.pos_)`` pair per input word.
    """
    words = list(tokens)
    if not words:
        return []

    doc = spacy.tokens.Doc(nlp.vocab, words=words)
    # A Doc built straight from words carries no annotations (pos_ would be
    # '' for every token), so it has to go through the enabled pipeline
    # components. Entity recognition is skipped because only POS is needed.
    for component_name, component in nlp.pipeline:
        if component_name != 'ner':
            doc = component(doc)

    return [(token.text, token.pos_) for token in doc]


def ner(text):
    """Run the spaCy pipeline on ``text`` and return its named entities.

    Parameters
    ----------
    text : str
        Raw text; it is tokenised by spaCy.

    Returns
    -------
    list of (str, str)
        One ``(entity text, entity label)`` pair per entity in ``doc.ents``.
    """
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]


In [48]:
# Build the "bert-base-uncased" tokenizer, passing it the config loaded from
# the "potatobunny/results-yelp" checkpoint.
yelp_config = AutoConfig.from_pretrained("potatobunny/results-yelp")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", config=yelp_config)

# Tokenize every normalised review. NOTE(review): the warning printed by this
# cell shows at least one review exceeds the 512-token model limit; the tokens
# are only used for the linguistic features below -- confirm no truncation is needed.
df["tokens"] = df['text'].map(tokenizer.tokenize)

Token indices sequence length is longer than the specified maximum sequence length for this model (587 > 512). Running this sequence through the model will result in indexing errors


In [49]:
# Replace each token list with its lemmatized version (overwrites 'tokens').
df['tokens'] = df['tokens'].map(lemmatize)

In [50]:
# Replace each (already lemmatized) token list with its stemmed version.
# Overwrites 'tokens', so re-running this cell stems the stems again.
df['tokens'] = df['tokens'].map(stem)

In [51]:
# One list of (token, POS) pairs per review; the function is passed directly
# instead of being wrapped in a lambda.
df['pos_tags'] = df['tokens'].apply(pos_tagger)

In [52]:
def _ner_from_tokens(token_list):
    """Re-join a token list with single spaces and run ner() on the result."""
    return ner(' '.join(token_list))


# One list of (entity text, label) pairs per review.
df['ner'] = df['tokens'].apply(_ner_from_tokens)

In [53]:
# Star rating -> sentiment category, built by pairing ratings 1..5 with their labels.
# NOTE(review): 2 and 4 stars both land in 'Neutral' -- confirm this is the intended split.
sentiment_map = dict(
    zip(range(1, 6), ['Negative', 'Neutral', 'Neutral', 'Neutral', 'Positive'])
)

# Look up every 'stars' value in the mapping; ratings absent from it become NaN.
df['label'] = df['stars'].map(sentiment_map)

In [54]:
# Save the preprocessed data to a new CSV file (row index left out).
# NOTE(review): 'tokens', 'pos_tags' and 'ner' hold Python lists; presumably
# CSV stores them as plain text -- confirm no consumer needs them as lists.
df.to_csv('sentiment_data_preprocessed.csv', index=False)


In [None]:
import pandas as pd

# Reload only the two columns needed for training.
# keep_default_na=False stops read_csv from turning an empty cleaned review
# (or a review that is literally "null", "nan", "na", ...) into a float NaN,
# which would put non-string values into the text column.
df = pd.read_csv(
    'sentiment_data_preprocessed.csv',
    usecols=['text_cleaned', 'stars'],
    keep_default_na=False,
)

# Fix the column order explicitly (usecols keeps file order) and rename to the
# names used downstream, without mutating in place.
df = df[['text_cleaned', 'stars']].rename(
    columns={'text_cleaned': 'text', 'stars': 'labels'}
)
df

In [None]:
# Write the current `df` (reduced above to the 'text' and 'labels' columns) to parquet.
df.to_parquet('sentiment_data_preprocessed.parquet')