## Import libraries

In [1]:
import gc
import ast
import json
import pickle
import ray
import numpy as np
import modin.pandas as mpd
import pandas as pd

from textblob import TextBlob
from tqdm import tqdm
from transformers import pipeline
from torch.utils.data import Dataset

In [2]:
# Start Ray before any modin.pandas work is done.
# NOTE(review): Ray is presumably the execution engine behind modin here, and
# ignore_reinit_error=True presumably lets this cell be re-run without raising
# when Ray is already initialised -- confirm against the Ray documentation.
ray.init(ignore_reinit_error=True)

## Define feature engineering process
- Read data 
- Calculate features
    - Word / Sentence tokenization
    - Sentiment (Polarity / Subjectivity)
    - Emotion classification
- Export the data together with the calculated features

In [3]:
def literal_eval(text):
    """Parse a Python literal, mapping unparsable input to ``np.nan``.

    Thin wrapper around :func:`ast.literal_eval` meant for use with
    ``Series.apply``: instead of raising on a bad cell it returns a missing
    value, so the affected rows can be removed afterwards with ``dropna()``.

    Args:
        text: String holding a Python literal (dict, list, number, string...).

    Returns:
        The parsed object, or ``np.nan`` when ``text`` is malformed or is not
        a string at all (for example a float NaN coming from an empty cell).
    """
    try:
        return ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Exactly the errors ast.literal_eval documents for malformed input.
        # A bare ``except`` would also swallow KeyboardInterrupt/SystemExit
        # and make a long-running ``apply`` impossible to interrupt.
        return np.nan
        

def _extract_content_features(content):
    """Return the TextBlob features of one review text as a JSON string."""
    # Build the blob once and reuse it for every feature.
    blob = TextBlob(content)
    sentiment = blob.sentiment
    return json.dumps({
        "polarity": np.round(sentiment.polarity, 3),
        "subjectivity": np.round(sentiment.subjectivity, 3),
        "words": list(map(str, blob.words)),
        "sentences": list(map(str, blob.sentences)),
    })


def _parse_content_features(text):
    """Decode one JSON feature string; missing or undecodable cells become NaN."""
    try:
        # The features are produced by json.dumps, so decode them with
        # json.loads. A Python-literal parser would turn the "\ud83d\ude00"
        # style escapes json.dumps emits for emoji into lone surrogates.
        return json.loads(text)
    except (TypeError, ValueError):
        # TypeError: non-string cell (e.g. float NaN); ValueError: invalid JSON.
        return np.nan


def process_calculation(ds_name):
    """Compute the text features of a review dataset and export them to Excel.

    Steps:
      1. Read ``../datasets/processed/{ds_name}_reviews_preprocessed.csv``
         with modin.
      2. For every ``content`` value compute polarity, subjectivity, words and
         sentences with TextBlob, stored as one JSON string per row.
      3. Round-trip the frame through a tab-separated dump so the rest of the
         work is done with plain pandas.
      4. Expand the JSON strings into ``content_*`` columns and write
         ``../datasets/processed/{ds_name}_reviews_calculated.xlsx``.

    Args:
        ds_name: Dataset prefix used in the file names (e.g. ``"yelpchi"``).

    Returns:
        None. The result is written to disk.
    """
    source_path = "../datasets/processed/{0}_reviews_preprocessed.csv".format(ds_name)
    output_path = "../datasets/processed/{0}_reviews_calculated.xlsx".format(ds_name)

    mdf = mpd.read_csv(source_path)
    # Merging the frame with the Series (also named "content") on the index
    # yields "content_x" (raw text) and "content_y" (JSON features).
    mdf = mdf.merge(
        mdf.content.apply(_extract_content_features),
        left_index=True,
        right_index=True,
    ).rename(columns={"content_x": "content_text", "content_y": "content_features"})
    # NOTE(review): this intermediate dump is tab-separated text even though it
    # is written to the final ".xlsx" path; it is replaced by the real workbook
    # at the end of this function.
    mdf.to_csv(output_path, index=False, sep="\t")
    del mdf
    gc.collect()

    df = pd.read_csv(output_path, sep="\t")
    df["content_features"] = df["content_features"].apply(_parse_content_features)
    # Renumber the rows after dropping incomplete ones. json_normalize returns
    # a fresh 0..n-1 index and concat(axis=1) aligns on index labels, so gaps
    # left by dropna() would shift features onto the wrong reviews and
    # introduce NaN-padded rows.
    df = df.dropna().reset_index(drop=True)
    features = pd.json_normalize(df["content_features"].tolist()).add_prefix("content_")
    df = pd.concat([df.drop(columns=["content_features"]), features], axis=1)
    content_columns = [col for col in df.columns if "content_" in col]
    df = df[["user", "product", "rating", "label", "date", *content_columns]]
    df.to_excel(output_path, index=False)

### Process emotion predictions

In [4]:
def create_emotion_predictions(
    ds_name,
    data_dir="./drive/MyDrive/CMP1044",
    device=0,
    batch_size=64,
    checkpoint_every=1600,
):
    """Classify the emotion of every review text and pickle the raw outputs.

    Reads ``{data_dir}/{ds_name}_reviews_calculated.xlsx``, runs the
    ``michellejieli/emotion_text_classifier`` pipeline over its
    ``content_text`` column and writes the list of pipeline outputs to
    ``{data_dir}/{ds_name}_emotion_predictions.pkl``.

    Args:
        ds_name: Dataset prefix used in the file names (e.g. ``"yelpchi"``).
        data_dir: Directory holding the workbook and receiving the pickle.
            Defaults to the Google Drive folder used originally.
        device: Forwarded to ``transformers.pipeline``. Defaults to ``0``.
        batch_size: Number of texts sent to the model per batch.
        checkpoint_every: Re-write the pickle each time this many outputs
            have been collected, so a crash loses at most that many
            predictions. A falsy value disables intermediate checkpoints.

    Returns:
        None. The predictions are written to disk.
    """
    df = pd.read_excel("{}/{}_reviews_calculated.xlsx".format(data_dir, ds_name))
    predictions_path = "{}/{}_emotion_predictions.pkl".format(data_dir, ds_name)

    class ReviewTexts(Dataset):
        """Expose the review texts as a torch ``Dataset`` for the pipeline."""

        def __init__(self, texts):
            self.texts = texts

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, i):
            # str() keeps non-string cells (e.g. NaN) usable as model input.
            return str(self.texts[i])

    def save_outputs(outputs):
        with open(predictions_path, "wb") as f:
            pickle.dump(outputs, f)

    pipe = pipeline(
        "sentiment-analysis",
        model="michellejieli/emotion_text_classifier",
        device=device,
    )
    dataset = ReviewTexts(df.content_text.values.tolist())
    total = len(dataset)

    model_outputs = []
    predictions = pipe(dataset, batch_size=batch_size, truncation=True, padding=True, max_length=512)
    for out in tqdm(predictions, total=total):
        model_outputs.append(out)
        done = len(model_outputs)
        # Intermediate checkpoint; the final write happens after the loop.
        if checkpoint_every and done < total and done % checkpoint_every == 0:
            save_outputs(model_outputs)

    # Always write the final result, so even an empty dataset produces a file.
    save_outputs(model_outputs)


def get_emotion_predictions(ds_name, data_dir="../datasets/processed"):
    """Load the predicted emotion label of every review of a dataset.

    Args:
        ds_name: Dataset prefix used in the file name (e.g. ``"yelpchi"``).
        data_dir: Directory containing ``{ds_name}_emotion_predictions.pkl``.
            Defaults to the processed-datasets folder.

    Returns:
        list: The ``"label"`` entry of each pickled prediction, in file order.
    """
    predictions_path = "{}/{}_emotion_predictions.pkl".format(data_dir, ds_name)
    # SECURITY: pickle.load can execute arbitrary code. Only open prediction
    # files produced by this project, never files from an untrusted source.
    with open(predictions_path, "rb") as f:
        model_outputs = pickle.load(f)
    return [output["label"] for output in model_outputs]

## Run the calculations and check the results

### YelpChi

In [5]:
# Select the YelpChi dataset (ds_name is reused by the cells below) and build
# its calculated-features workbook.
ds_name = "yelpchi"
process_calculation(ds_name)

In [7]:
# Reload the calculated-features workbook for the current dataset and preview
# its first rows.
df = pd.read_excel("../datasets/processed/{0}_reviews_calculated.xlsx".format(ds_name))
df.head()

Unnamed: 0,user,product,rating,label,date,content_text,content_polarity,content_subjectivity,content_words,content_sentences,content_emotion
0,yelpchi_user_00000,yelpchi_product_000,5,organic,2011-06-08,Let me begin by saying that there are two kind...,0.025,0.57,"['Let', 'me', 'begin', 'by', 'saying', 'that',...","[""Let me begin by saying that there are two ki...",neutral
1,yelpchi_user_00001,yelpchi_product_000,3,organic,2011-08-30,The only place inside the Loop that you can st...,-0.047,0.625,"['The', 'only', 'place', 'inside', 'the', 'Loo...",['The only place inside the Loop that you can ...,neutral
2,yelpchi_user_00002,yelpchi_product_000,5,organic,2009-06-26,I have walked by the Tokyo Hotel countless tim...,0.172,0.615,"['I', 'have', 'walked', 'by', 'the', 'Tokyo', ...",['I have walked by the Tokyo Hotel countless t...,neutral
3,yelpchi_user_00003,yelpchi_product_000,1,organic,2010-09-16,"If you are considering staying here, watch thi...",0.25,0.333,"['If', 'you', 'are', 'considering', 'staying',...","['If you are considering staying here, watch t...",neutral
4,yelpchi_user_00004,yelpchi_product_000,3,organic,2010-02-05,"This place is disgusting, absolutely horrible,...",-0.435,0.505,"['This', 'place', 'is', 'disgusting', 'absolut...","['This place is disgusting, absolutely horribl...",disgust


In [7]:
# Use ds_name for both calls so the prediction step and the loading step can
# never target different datasets.
# NOTE(review): by default create_emotion_predictions writes its pickle under
# ./drive/MyDrive/CMP1044 while get_emotion_predictions reads it from
# ../datasets/processed -- confirm the file is available in the latter place.
create_emotion_predictions(ds_name)
df["content_emotion"] = get_emotion_predictions(ds_name)
df = df.dropna()
df.head()

Unnamed: 0,user,product,rating,label,date,content_text,content_polarity,content_subjectivity,content_words,content_sentences,content_emotion
0,yelpchi_user_00000,yelpchi_product_000,5,organic,2011-06-08,Let me begin by saying that there are two kind...,0.025,0.57,"['Let', 'me', 'begin', 'by', 'saying', 'that',...","[""Let me begin by saying that there are two ki...",neutral
1,yelpchi_user_00001,yelpchi_product_000,3,organic,2011-08-30,The only place inside the Loop that you can st...,-0.047,0.625,"['The', 'only', 'place', 'inside', 'the', 'Loo...",['The only place inside the Loop that you can ...,neutral
2,yelpchi_user_00002,yelpchi_product_000,5,organic,2009-06-26,I have walked by the Tokyo Hotel countless tim...,0.172,0.615,"['I', 'have', 'walked', 'by', 'the', 'Tokyo', ...",['I have walked by the Tokyo Hotel countless t...,neutral
3,yelpchi_user_00003,yelpchi_product_000,1,organic,2010-09-16,"If you are considering staying here, watch thi...",0.25,0.333,"['If', 'you', 'are', 'considering', 'staying',...","['If you are considering staying here, watch t...",neutral
4,yelpchi_user_00004,yelpchi_product_000,3,organic,2010-02-05,"This place is disgusting, absolutely horrible,...",-0.435,0.505,"['This', 'place', 'is', 'disgusting', 'absolut...","['This place is disgusting, absolutely horribl...",disgust


In [8]:
# Summarise the enriched frame, then overwrite the calculated-features
# workbook (same path it was read from) without writing the index.
df.info()
df.to_excel("../datasets/processed/{0}_reviews_calculated.xlsx".format(ds_name), index=None)

<class 'pandas.core.frame.DataFrame'>
Index: 67384 entries, 0 to 67384
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   user                  67384 non-null  object 
 1   product               67384 non-null  object 
 2   rating                67384 non-null  int64  
 3   label                 67384 non-null  object 
 4   date                  67384 non-null  object 
 5   content_text          67384 non-null  object 
 6   content_polarity      67384 non-null  float64
 7   content_subjectivity  67384 non-null  float64
 8   content_words         67384 non-null  object 
 9   content_sentences     67384 non-null  object 
 10  content_emotion       67384 non-null  object 
dtypes: float64(2), int64(1), object(8)
memory usage: 6.2+ MB


### YelpNYC

In [8]:
# Switch to the YelpNYC dataset (ds_name is reused by the cells below) and
# build its calculated-features workbook.
ds_name = "yelpnyc"
process_calculation(ds_name)

In [9]:
# Reload the calculated-features workbook for the current dataset and preview
# its first rows.
df = pd.read_excel("../datasets/processed/{0}_reviews_calculated.xlsx".format(ds_name))
df.head()

Unnamed: 0,user,product,rating,label,date,content_text,content_polarity,content_subjectivity,content_words,content_sentences
0,yelpnyc_user_000000,yelpnyc_product_000,3,fraud,2014-12-08,The food at snack is a selection of popular Gr...,0.196,0.396,"['The', 'food', 'at', 'snack', 'is', 'a', 'sel...",['The food at snack is a selection of popular ...
1,yelpnyc_user_000001,yelpnyc_product_000,3,fraud,2013-05-16,This little place in Soho is wonderful. I had ...,0.025,0.65,"['This', 'little', 'place', 'in', 'Soho', 'is'...","['This little place in Soho is wonderful.', 'I..."
2,yelpnyc_user_000002,yelpnyc_product_000,4,fraud,2013-07-01,ordered lunch for 15 from Snack last Friday. ...,0.22,0.329,"['ordered', 'lunch', 'for', '15', 'from', 'Sna...",['ordered lunch for 15 from Snack last Friday....
3,yelpnyc_user_000003,yelpnyc_product_000,4,fraud,2011-07-28,This is a beautiful quaint little restaurant o...,0.555,0.777,"['This', 'is', 'a', 'beautiful', 'quaint', 'li...",['This is a beautiful quaint little restaurant...
4,yelpnyc_user_000004,yelpnyc_product_000,4,fraud,2010-11-01,Snack is great place for a casual sit down lu...,0.139,0.538,"['Snack', 'is', 'great', 'place', 'for', 'a', ...",['Snack is great place for a \xa0casual sit do...


In [10]:
# Use ds_name for both calls so the prediction step and the loading step can
# never target different datasets.
# NOTE(review): by default create_emotion_predictions writes its pickle under
# ./drive/MyDrive/CMP1044 while get_emotion_predictions reads it from
# ../datasets/processed -- confirm the file is available in the latter place.
create_emotion_predictions(ds_name)
df["content_emotion"] = get_emotion_predictions(ds_name)
df = df.dropna()
df.head()

Unnamed: 0,user,product,rating,label,date,content_text,content_polarity,content_subjectivity,content_words,content_sentences,content_emotion
0,yelpnyc_user_000000,yelpnyc_product_000,3,fraud,2014-12-08,The food at snack is a selection of popular Gr...,0.196,0.396,"['The', 'food', 'at', 'snack', 'is', 'a', 'sel...",['The food at snack is a selection of popular ...,neutral
1,yelpnyc_user_000001,yelpnyc_product_000,3,fraud,2013-05-16,This little place in Soho is wonderful. I had ...,0.025,0.65,"['This', 'little', 'place', 'in', 'Soho', 'is'...","['This little place in Soho is wonderful.', 'I...",joy
2,yelpnyc_user_000002,yelpnyc_product_000,4,fraud,2013-07-01,ordered lunch for 15 from Snack last Friday. ...,0.22,0.329,"['ordered', 'lunch', 'for', '15', 'from', 'Sna...",['ordered lunch for 15 from Snack last Friday....,neutral
3,yelpnyc_user_000003,yelpnyc_product_000,4,fraud,2011-07-28,This is a beautiful quaint little restaurant o...,0.555,0.777,"['This', 'is', 'a', 'beautiful', 'quaint', 'li...",['This is a beautiful quaint little restaurant...,joy
4,yelpnyc_user_000004,yelpnyc_product_000,4,fraud,2010-11-01,Snack is great place for a casual sit down lu...,0.139,0.538,"['Snack', 'is', 'great', 'place', 'for', 'a', ...",['Snack is great place for a \xa0casual sit do...,neutral


In [12]:
# Summarise the enriched frame, then overwrite the calculated-features
# workbook (same path it was read from) without writing the index.
df.info()
df.to_excel("../datasets/processed/{0}_reviews_calculated.xlsx".format(ds_name), index=None)

<class 'pandas.core.frame.DataFrame'>
Index: 358951 entries, 0 to 358956
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   user                  358951 non-null  object 
 1   product               358951 non-null  object 
 2   rating                358951 non-null  int64  
 3   label                 358951 non-null  object 
 4   date                  358951 non-null  object 
 5   content_text          358951 non-null  object 
 6   content_polarity      358951 non-null  float64
 7   content_subjectivity  358951 non-null  float64
 8   content_words         358951 non-null  object 
 9   content_sentences     358951 non-null  object 
 10  content_emotion       358951 non-null  object 
dtypes: float64(2), int64(1), object(8)
memory usage: 32.9+ MB
