In [16]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import seaborn as sns

In [2]:
# Load the exported message table; the path is relative to the kernel's
# working directory.
messages_path = 'messages.parquet'
all_messages = pd.read_parquet(messages_path)

# Preview the first rows (rich display, since this is the cell's last expression).
all_messages.head()

Unnamed: 0,rowid,ThreadId,IsFromMe,FromPhoneNumber,ToPhoneNumber,Service,TextDate,MessageText,AttributedBody,RoomName,ContactName,AssociatedMessageType,GUID,AssociatedMessageGUID
0,1,13147370530,1,Me,+13147370530,iMessage,2021-12-12 01:41:58,Emphasized “YOOOOO LETS GOOOO ”,b'\x04\x0bstreamtyped\x81\xe8\x03\x84\x01@\x84...,,Rhythm Garg,2004,3F41AE4F-2EC4-4C12-BFF7-E244634E025E,p:0/0F4F286F-C802-470F-AECE-9ED95B7A1C78
1,2,13147370530,1,Me,+13147370530,iMessage,2021-12-12 01:42:18,Noooo bro if it gets home after u return to st...,b'\x04\x0bstreamtyped\x81\xe8\x03\x84\x01@\x84...,,Rhythm Garg,0,CE5945ED-AE95-417E-9402-D825B825D72D,
2,3,13147370530,1,Me,+13147370530,iMessage,2021-12-12 01:42:31,That’s really surprising actually bc mine came...,b'\x04\x0bstreamtyped\x81\xe8\x03\x84\x01@\x84...,,Rhythm Garg,0,FABE020C-D59E-464B-9FD3-47A9AB023582,
3,4,13147370530,1,Me,+13147370530,iMessage,2021-12-12 01:43:10,��I wanted to get 16 inch but since I just bou...,b'\x04\x0bstreamtyped\x81\xe8\x03\x84\x01@\x84...,,Rhythm Garg,0,5D183EDA-1ADA-4CFA-AD8D-BC9025414CDA,
4,5,13147370530,0,+13147370530,Me,iMessage,2021-12-12 10:50:59,��I called them and they said bc of how much I...,b'\x04\x0bstreamtyped\x81\xe8\x03\x84\x01@\x84...,,Rhythm Garg,0,503B1EC7-AB59-4D8B-87B3-917C948EBD65,


In [3]:
def preproc_table(df, only_dms=False, no_reactions=False):
    """Return a filtered copy of the message table with only the columns used downstream.

    The result keeps ``rowid``, ``MessageText``, ``AssociatedMessageType`` and
    ``RoomName`` and drops every row that has no usable text (an empty string
    or a missing value).

    Parameters
    ----------
    df : pandas.DataFrame
        Message table. Must contain the four columns listed above; any other
        columns are ignored.
    only_dms : bool, default False
        If True, keep only rows whose ``RoomName`` is null.
        NOTE(review): presumably a null ``RoomName`` marks a one-to-one
        conversation rather than a group chat -- confirm against the export.
    no_reactions : bool, default False
        If True, keep only rows whose ``AssociatedMessageType`` equals 0.
        NOTE(review): presumably non-zero types are reactions to another
        message -- confirm against the export.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` is not modified. The original index labels of the
        surviving rows are preserved.

    Raises
    ------
    KeyError
        If any of the four required columns is absent from ``df``.
    """
    cols_to_keep = [
        "rowid",
        "MessageText",
        "AssociatedMessageType",
        "RoomName"
    ]

    processed_table = df[cols_to_keep].copy()

    # A plain `!= ''` comparison is True for None/NaN, so missing text would
    # slip through the "empty message" filter; require a non-null value too.
    message_text = processed_table["MessageText"]
    has_text = message_text.notna() & (message_text != '')
    processed_table = processed_table[has_text]

    if only_dms:
        processed_table = processed_table[processed_table["RoomName"].isna()]

    if no_reactions:
        processed_table = processed_table[processed_table["AssociatedMessageType"] == 0]

    return processed_table

In [4]:
# Row counts with no optional filters (t1) versus with both the DM-only and
# no-reaction filters enabled (t2).
t1 = preproc_table(all_messages)
t2 = preproc_table(all_messages, no_reactions=True, only_dms=True)
print(*(len(table) for table in (t1, t2)))

567349 300580


In [5]:
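# VADER baseline (disabled): the commented-out code below scores every message
# with VADER and writes the result to 'vader_sentiment_scores.parquet'.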
# analyzer = SentimentIntensityAnalyzer()

# vader_sentiment_table = preproc_table(all_messages)

# vader_sentiment_table['sentiment_scores'] = vader_sentiment_table['MessageText'].apply(analyzer.polarity_scores)

# vader_sentiment_table['compound'] = vader_sentiment_table['sentiment_scores'].apply(lambda x: x['compound'])
# vader_sentiment_table['positive'] = vader_sentiment_table['sentiment_scores'].apply(lambda x: x['pos'])
# vader_sentiment_table['negative'] = vader_sentiment_table['sentiment_scores'].apply(lambda x: x['neg'])
# vader_sentiment_table['neutral'] = vader_sentiment_table['sentiment_scores'].apply(lambda x: x['neu'])

# vader_sentiment_table.to_parquet('vader_sentiment_scores.parquet')

# vader_sentiment_table.sample(n=10)

In [None]:
# Score only direct messages that are not reactions.
roberta_sentiment_table = preproc_table(all_messages, only_dms=True, no_reactions=True)

# top_k=None makes the pipeline return a score for every label, not just the best one.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    top_k=None,
    max_length=512,
    truncation=True,
)

# Hand the texts to the pipeline in slices of `batch_size`, collecting one
# list of {label, score} dicts per message.
texts = roberta_sentiment_table['MessageText'].tolist()
batch_size = 64

results = []
for start in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
    chunk = texts[start:start + batch_size]
    results.extend(sentiment_pipeline(list(chunk), padding=True, truncation=True))

# Flatten each message's label/score list into a dict keyed by raw label;
# any label the pipeline did not return keeps the default score of 0.
processed_results = []
for per_message in results:
    scores = dict.fromkeys(('LABEL_0', 'LABEL_1', 'LABEL_2'), 0)
    scores.update((entry['label'], entry['score']) for entry in per_message)
    processed_results.append(scores)

# One score column per raw label, added in the order negative, neutral, positive.
for column, raw_label in (
    ('negative', 'LABEL_0'),
    ('neutral', 'LABEL_1'),
    ('positive', 'LABEL_2'),
):
    roberta_sentiment_table[column] = [scores[raw_label] for scores in processed_results]

# Human-readable name for the highest-scoring label of each message.
label_map = {
    'LABEL_0': 'Negative',
    'LABEL_1': 'Neutral',
    'LABEL_2': 'Positive',
}
roberta_sentiment_table['sentiment'] = [
    label_map[max(per_message, key=lambda entry: entry['score'])['label']]
    for per_message in results
]

In [8]:
# Write the scored table to 'roberta_sentiment_scores.parquet' (path relative
# to the kernel's working directory); the following cell reads this file back.
roberta_sentiment_table.to_parquet('roberta_sentiment_scores.parquet')

In [7]:
# Reload the saved scores from disk, replacing any in-memory
# `roberta_sentiment_table`.
# NOTE(review): the recorded execution counts (In [7] for this read, In [8] for
# the cell that writes the file) show the read ran before the write in the
# saved session -- confirm the file on disk matches the current scoring code.
roberta_sentiment_table = pd.read_parquet('roberta_sentiment_scores.parquet')
