# Yelp Sentiment CSV
266 Summer 2024
Kevin Kuc, 

The data is publicly available here:
*   https://huggingface.co/datasets/Yelp/yelp_review_full


**Data Dictionary**

1.   'text': The review text. Each review is enclosed in double quotes ("), and any internal double quote is escaped by two double quotes (""). New lines are escaped as a backslash followed by an "n" character, that is "\n".
2.   'label': Corresponds to the score associated with the review (between 1 and 5).

## Step 1: Import packages

In [1]:
import pandas as pd
import numpy as np
import csv
import copy
from datasets import load_dataset

# data preprocessing
from sklearn import preprocessing
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import IncrementalPCA
from numpy.random.mtrand import binomial
import random
import string
from nltk.corpus import stopwords
from contractions import fix

# exploratory analysis
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import mlxtend
from mlxtend.plotting import scatterplotmatrix
from mlxtend.plotting import heatmap
import seaborn as sns
from IPython.display import Image
from textblob import TextBlob
from wordcloud import WordCloud

# model fit
import statsmodels.api as sm
#import tensorflow as tf
#from tensorflow import keras
#from keras import metrics
#from tensorflow.keras import initializers


# Suppress every warning for the session (libraries are rapidly changing),
# then register an explicit ignore rule for matplotlib's UserWarnings.
import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")

# Pandas display settings: print up to 1000 rows, never truncate columns,
# and render floats with a thousands separator and two decimal places.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = None
pd.set_option('display.float_format', '{:,.2f}'.format)
# Disable the chained-assignment check (the pandas default is 'warn').
pd.set_option('mode.chained_assignment', None)

  from .autonotebook import tqdm as notebook_tqdm


## Step 2: Read data

In [2]:
# Fetch the Yelp review dataset and convert its 'train' and 'test' splits
# into pandas DataFrames.
dataset = load_dataset('yelp_review_full')
df_train, df_test = (
    dataset[split_name].to_pandas() for split_name in ('train', 'test')
)

#### Step 3.1: Preprocess Text

In [3]:
# Translation table that deletes every ASCII punctuation character in one
# pass; built once so it is not recreated for each review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_review(review):
    """Normalize a review into lowercase, space-separated alphabetic words.

    Steps, in order: join list input with single spaces, lowercase the text,
    delete ASCII punctuation, then keep only the whitespace-delimited tokens
    that consist entirely of letters.

    Args:
        review: The review text as a string, a list of strings (joined with
            single spaces), or ``None`` (treated as an empty review).

    Returns:
        The cleaned text with tokens separated by single spaces, or an empty
        string when no alphabetic token remains.
    """
    if review is None:
        return ''
    if isinstance(review, list):
        review = ' '.join(review)
    review = review.lower().translate(_PUNCTUATION_TABLE)
    # NOTE: only the characters in string.punctuation are stripped above, so a
    # token that still contains any other non-letter (a digit, a typographic
    # quote, ...) fails isalpha() and is dropped as a whole.
    # Splitting and re-joining also collapses all whitespace runs, so no
    # separate whitespace-normalization step is needed afterwards.
    return ' '.join(word for word in review.split() if word.isalpha())

# Clean the 'text' column of both splits with preprocess_review, overwriting
# the raw review text in place.
df_train['text'], df_test['text'] = (
    frame['text'].map(preprocess_review) for frame in (df_train, df_test)
)

#### Step 3.1.1: Adjust text length

In [4]:
#sampled_df['text_length'] = sampled_df['text'].apply(len)

# Calculate the average text length
#average_length = sampled_df['text_length'].mean()
#sampled_df = sampled_df[(sampled_df['text_length'] > (average_length-300)) & (sampled_df['text_length'] < (average_length+300))]
#sampled_df["label"].value_counts().sort_index()

#### Step 3.5: Export Preprocessed Data

In [5]:
# Write the preprocessed train and test splits to CSV files in the working
# directory, omitting the DataFrame index column.
df_train.to_csv(path_or_buf='train_allstars.csv', index=False)
df_test.to_csv(path_or_buf='test_allstars.csv', index=False)