## Loading data

In [1]:
# import important libraries
import xml.etree.ElementTree as ET
import re
import pandas as pd
import os
import html

In [2]:
# clean up text
def clean_text(text):
    """Normalize one raw chat message.

    Steps, in order: decode HTML entities, drop markup tags, drop URLs,
    collapse every whitespace run to a single space, then trim and lowercase.

    Args:
        text: Raw message string, or None.

    Returns:
        The cleaned, lowercased string; "" when ``text`` is None.
    """
    if text is None:
        return ""
    # remove HTML encodings
    text = html.unescape(text)
    # Remove only spans shaped like a tag: '<', optional '/', a letter, then
    # anything up to the closing '>'. A blanket '<.*?>' would also delete plain
    # chat text that merely sits between a '<' and a later '>' (for example
    # "<3 ... >:)" or "<- ... ->"), which is common once entities are decoded.
    text = re.sub(r'</?[a-zA-Z][^<>]*>', '', text)
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'\s+', ' ', text)  # remove extra spaces
    return text.strip().lower()

In [3]:
# parse xml file
def parse_xml(xml_file):
    """Flatten a conversation XML file into one DataFrame row per message.

    The root element is expected to hold ``conversation`` children (with an
    ``id`` attribute), each holding ``message`` children (with a ``line``
    attribute and ``author``, ``time`` and ``text`` sub-elements).

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.

    Returns:
        DataFrame with columns ``conversation_id``, ``line``, ``author``,
        ``time`` and ``text``. ``text`` is passed through ``clean_text``; the
        other values are kept as the raw strings read from the XML (or None
        when the attribute or element is absent).
    """
    columns = ['conversation_id', 'line', 'author', 'time', 'text']

    def _child_text(element, tag):
        # Text of the first direct child named `tag`; None when the child is
        # missing, so a malformed message does not raise AttributeError.
        child = element.find(tag)
        return None if child is None else child.text

    root = ET.parse(xml_file).getroot()

    data = []

    # iterate through each conversation
    for conversation in root.findall('conversation'):
        conversation_id = conversation.get('id')

        # iterate through each message
        for message in conversation.findall('message'):
            data.append({
                'conversation_id': conversation_id,
                'line': message.get('line'),
                'author': _child_text(message, 'author'),
                'time': _child_text(message, 'time'),
                # clean_text maps a missing/empty text element to ""
                'text': clean_text(_child_text(message, 'text')),
            })

    # Passing the column list keeps the schema stable even when no message
    # was found (otherwise the frame would have no columns at all).
    return pd.DataFrame(data, columns=columns)

In [4]:
# load train file
# Path of the training-corpus XML, relative to the notebook's working directory.
xml_file = 'PAN12/pan12-sexual-predator-identification-training-corpus-2012-05-01.xml'
# One row per message: conversation_id, line, author, time, cleaned text.
train_df = parse_xml(xml_file)

In [5]:
train_df.tail()

Unnamed: 0,conversation_id,line,author,time,text
903602,4ed6b02ae537fdfd6078597b706292a8,101,74bfc043bd5ce9c17b37ffae6e0ba2fa,22:36,oh ok
903603,4ed6b02ae537fdfd6078597b706292a8,102,8cd850ea4215ee7c4b94b6bcc0bae593,22:36,i will look for you tomorrow
903604,4ed6b02ae537fdfd6078597b706292a8,103,74bfc043bd5ce9c17b37ffae6e0ba2fa,22:36,ok
903605,4ed6b02ae537fdfd6078597b706292a8,104,8cd850ea4215ee7c4b94b6bcc0bae593,22:36,bye lissa
903606,4ed6b02ae537fdfd6078597b706292a8,105,74bfc043bd5ce9c17b37ffae6e0ba2fa,22:37,bye jake


In [6]:
# load test file
# NOTE: rebinds xml_file, which previously held the training-corpus path.
xml_file = 'PAN12/pan12-sexual-predator-identification-test-corpus-2012-05-17.xml'
# Same row-per-message layout as train_df.
test_df = parse_xml(xml_file)

In [7]:
test_df.tail()

Unnamed: 0,conversation_id,line,author,time,text
2058776,8deed7c66340728e6863f3d931d4cca7,53,105ca6d0fd6c2c3e136980d3548f16ab,02:32,"come the revolution, i'll worry about it"
2058777,8deed7c66340728e6863f3d931d4cca7,54,48a4fba2c4916bbc8aae694a3877dca3,02:33,no pasaran!
2058778,8deed7c66340728e6863f3d931d4cca7,55,48a4fba2c4916bbc8aae694a3877dca3,02:33,:)
2058779,8deed7c66340728e6863f3d931d4cca7,56,e1e3d026bb7ee9ee264316da10378aeb,02:35,25mhz is slow :(
2058780,8deed7c66340728e6863f3d931d4cca7,57,542ebcbcf6ee466a1780913e2c800716,02:42,"<- 3,400,000,000+ users can't be wrong"")"


In [8]:
# function to load the identified groomers
def load_groomers(file_path):
    """Read a set of author IDs from a text file holding one ID per line.

    Surrounding whitespace is stripped from every line, and lines that are
    empty after stripping are skipped so that "" never ends up in the set.

    Args:
        file_path: Path of the text file to read.

    Returns:
        Set of the distinct non-empty stripped lines.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        positive_authors = {author_id for author_id in stripped_lines if author_id}
    return positive_authors

In [9]:
# load training groomers
# Text file listing the author IDs flagged in the training corpus (read line by line).
training_groomers = "PAN12/pan12-sexual-predator-identification-training-corpus-predators-2012-05-01.txt"
# Set of author IDs used below to label the training messages.
train_groomers = load_groomers(training_groomers)

In [10]:
# load testing groomers
# Ground-truth file for the test corpus, read the same way as the training list.
testing_groomers = "PAN12/pan12-sexual-predator-identification-groundtruth-problem1.txt"
# Set of author IDs used below to label the test messages.
test_groomers = load_groomers(testing_groomers)

In [11]:
# function to label grooming conversations based on author IDs
def label_author_conversations(df, positive_authors):
    """Add a conversation-level ``label`` column to ``df``.

    A conversation is positive when at least one of its rows has an author in
    ``positive_authors``; every row of that conversation then gets label 1,
    all other rows get 0.

    Args:
        df: Message-level DataFrame with ``author`` and ``conversation_id``
            columns. It is modified in place (the ``label`` column is added or
            overwritten).
        positive_authors: Collection of author IDs treated as positive.

    Returns:
        The same ``df`` object, now carrying the ``label`` column.
    """
    # Step 1: Identify conversations with at least one positive author
    authored_by_positive = df['author'].isin(positive_authors)
    positive_conversations = df.loc[authored_by_positive, 'conversation_id'].unique()
    # Step 2: Label entire conversations. isin builds one hash lookup instead
    # of scanning the id array once per row as `cid in positive_conversations` did.
    df['label'] = df['conversation_id'].isin(positive_conversations).astype('int64')
    return df

In [12]:
# label train and test datasets
# Adds the conversation-level 'label' column (1 when any author of the conversation
# is in the groomer set). The function writes the column onto the frame it receives
# and returns that same frame, so the reassignment keeps the names unchanged.
train_df = label_author_conversations(train_df, train_groomers)
test_df = label_author_conversations(test_df, test_groomers)

In [13]:
test_df

Unnamed: 0,conversation_id,line,author,time,text,label
0,affc2df0951b733d14ba92d19d9b7695,1,0a39f78bcb297ab0ebe8a29c28bfed89,15:24,bugmail: [bug 6978] new: mark eof-terminated s...,0
1,affc2df0951b733d14ba92d19d9b7695,2,60659cfda992013e610f285c46692d28,15:32,"henri, can i ask you a firefox build question ...",0
2,affc2df0951b733d14ba92d19d9b7695,3,b8810fee2f4a71f849f3f7409546d1d9,15:34,"60659cfda992013e610f285c46692d28: sure, but i ...",0
3,affc2df0951b733d14ba92d19d9b7695,4,60659cfda992013e610f285c46692d28,15:35,"it appears the build runs through, it creates ...",0
4,affc2df0951b733d14ba92d19d9b7695,5,60659cfda992013e610f285c46692d28,15:35,"when i start it, i get my standard install of ...",0
...,...,...,...,...,...,...
2058776,8deed7c66340728e6863f3d931d4cca7,53,105ca6d0fd6c2c3e136980d3548f16ab,02:32,"come the revolution, i'll worry about it",0
2058777,8deed7c66340728e6863f3d931d4cca7,54,48a4fba2c4916bbc8aae694a3877dca3,02:33,no pasaran!,0
2058778,8deed7c66340728e6863f3d931d4cca7,55,48a4fba2c4916bbc8aae694a3877dca3,02:33,:),0
2058779,8deed7c66340728e6863f3d931d4cca7,56,e1e3d026bb7ee9ee264316da10378aeb,02:35,25mhz is slow :(,0


In [14]:
# combine train and test datasets
# Stack train on top of test and renumber the rows 0..n-1 in the same call.
combined_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [15]:
combined_df

Unnamed: 0,conversation_id,line,author,time,text,label
0,e621da5de598c9321a1d505ea95e6a2d,1,97964e7a9e8eb9cf78f2e4d7b2ff34c7,03:20,hola.,0
1,e621da5de598c9321a1d505ea95e6a2d,2,0158d0d6781fc4d493f243d4caa49747,03:20,hi.,0
2,e621da5de598c9321a1d505ea95e6a2d,3,0158d0d6781fc4d493f243d4caa49747,03:20,whats up?,0
3,e621da5de598c9321a1d505ea95e6a2d,4,97964e7a9e8eb9cf78f2e4d7b2ff34c7,03:20,not a ton.,0
4,e621da5de598c9321a1d505ea95e6a2d,5,97964e7a9e8eb9cf78f2e4d7b2ff34c7,03:20,you?,0
...,...,...,...,...,...,...
2962383,8deed7c66340728e6863f3d931d4cca7,53,105ca6d0fd6c2c3e136980d3548f16ab,02:32,"come the revolution, i'll worry about it",0
2962384,8deed7c66340728e6863f3d931d4cca7,54,48a4fba2c4916bbc8aae694a3877dca3,02:33,no pasaran!,0
2962385,8deed7c66340728e6863f3d931d4cca7,55,48a4fba2c4916bbc8aae694a3877dca3,02:33,:),0
2962386,8deed7c66340728e6863f3d931d4cca7,56,e1e3d026bb7ee9ee264316da10378aeb,02:35,25mhz is slow :(,0


In [16]:
# Corpus-level counts over the combined train + test messages.
print('Total conversations:', combined_df['conversation_id'].nunique())
print('Total authors:', combined_df['author'].nunique())
# Message rows belonging to conversations labelled 1; the next cell exports their ids.
predatory = combined_df[combined_df['label']==1]

Total conversations: 222055
Total authors: 307693


In [17]:
# One entry per grooming conversation id (drop_duplicates keeps the first occurrence).
predatory_ids = predatory['conversation_id'].drop_duplicates()
# to_csv does not create missing parent directories; this is the first write
# into 'data/', so make sure the folder exists before exporting.
os.makedirs('data', exist_ok=True)
predatory_ids.to_csv('data/groomer_conversations.csv', index=False)

## Extract groomer messages

In [18]:
# function to label each message by whether its author is an identified groomer
def label_authors(df, positive_authors):
    """Add an author-level ``author_label`` column to ``df``.

    Each row gets 1 when its ``author`` value is in ``positive_authors`` and 0
    otherwise.

    Args:
        df: Message-level DataFrame with an ``author`` column. It is modified
            in place (the ``author_label`` column is added or overwritten).
        positive_authors: Collection of author IDs treated as positive.

    Returns:
        The same ``df`` object, now carrying the ``author_label`` column.
    """
    # Label individual messages by author (not whole conversations).
    df['author_label'] = df['author'].isin(positive_authors).astype('int64')
    return df

In [19]:
# label authors 
# Adds the per-message 'author_label' column to both frames. combined_df was
# concatenated earlier, so it does not receive this column.
train_df = label_authors(train_df, train_groomers)
test_df = label_authors(test_df, test_groomers)

In [20]:
# combine train and test datasets
# Same stacking as combined_df, but now both frames carry 'author_label' too;
# ignore_index renumbers the rows 0..n-1.
combined_auth_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [21]:
combined_auth_df

Unnamed: 0,conversation_id,line,author,time,text,label,author_label
0,e621da5de598c9321a1d505ea95e6a2d,1,97964e7a9e8eb9cf78f2e4d7b2ff34c7,03:20,hola.,0,0
1,e621da5de598c9321a1d505ea95e6a2d,2,0158d0d6781fc4d493f243d4caa49747,03:20,hi.,0,0
2,e621da5de598c9321a1d505ea95e6a2d,3,0158d0d6781fc4d493f243d4caa49747,03:20,whats up?,0,0
3,e621da5de598c9321a1d505ea95e6a2d,4,97964e7a9e8eb9cf78f2e4d7b2ff34c7,03:20,not a ton.,0,0
4,e621da5de598c9321a1d505ea95e6a2d,5,97964e7a9e8eb9cf78f2e4d7b2ff34c7,03:20,you?,0,0
...,...,...,...,...,...,...,...
2962383,8deed7c66340728e6863f3d931d4cca7,53,105ca6d0fd6c2c3e136980d3548f16ab,02:32,"come the revolution, i'll worry about it",0,0
2962384,8deed7c66340728e6863f3d931d4cca7,54,48a4fba2c4916bbc8aae694a3877dca3,02:33,no pasaran!,0,0
2962385,8deed7c66340728e6863f3d931d4cca7,55,48a4fba2c4916bbc8aae694a3877dca3,02:33,:),0,0
2962386,8deed7c66340728e6863f3d931d4cca7,56,e1e3d026bb7ee9ee264316da10378aeb,02:35,25mhz is slow :(,0,0


In [22]:
# Persist the full message-level table (both label columns included), without the row index.
combined_auth_df.to_csv('data/dataset.csv', index=False)

In [23]:
# Every message row that belongs to a conversation labelled as grooming.
# NOTE(review): this filters on the conversation-level 'label', so it keeps all
# participants of those conversations, not only rows with author_label == 1 —
# confirm that is the intent.
predatory_auth = combined_auth_df[combined_auth_df['label']==1]

In [24]:
# Distinct author ids that appear in predatory_auth.
# NOTE(review): not referenced again in the visible part of the notebook — confirm it is still needed.
auth_ids = predatory_auth['author'].drop_duplicates()

In [25]:
# filter the groomers out
# Keep only the messages written by an identified groomer (author_label == 1).
groomers_train_df = train_df.loc[train_df['author_label'].eq(1)]
groomers_test_df = test_df.loc[test_df['author_label'].eq(1)]

In [26]:
# combine test and train groomers
# Stack the groomer-authored messages of both splits and renumber rows 0..n-1.
groomers_only_df = pd.concat([groomers_train_df, groomers_test_df], axis=0, ignore_index=True)

In [27]:
# Counts over groomer-authored messages only: conversations containing at least
# one such message, and the number of distinct groomer authors.
print('Total predator conversations:', groomers_only_df['conversation_id'].nunique())
print('Total authors:', groomers_only_df['author'].nunique())

Total predator conversations: 5753
Total authors: 394


In [28]:
# group by 'conversation_id' to represent whole conversation
# One row per conversation: the groomer-authored messages joined with single
# spaces, in the row order of groomers_only_df.
groomer_conversations = (
    groomers_only_df
    .groupby('conversation_id')['text']
    .agg(' '.join)
    .reset_index()
)
groomer_conversations

Unnamed: 0,conversation_id,text
0,0013449f76e98b51b85505c5bd98d61a,hi.. let me know when you are on
1,0014c1b4278df4bf8ea8a20b7abdb13c,hi how r ya? was out with a buddy u miss me hu...
2,001642060867dc1119343316fc21926c,hi wanna play on phone ok ya i did and im supe...
3,0027f42308aad202fcb7224597a7ac1a,hey baby kool it wont take me two hours baby o...
4,002cac6e3f890fdc2ff5aa13e2c51e12,yeah i am on pretty late at night now :( don't...
...,...,...
5748,ffd257ae045d84529fe6b8b8797f91b2,hi my sweet julia.. im sorry but i cant get of...
5749,ffd55257c660e70e17cd38834037a46c,"how are you? i'm good missed you too, what ar..."
5750,ffdc2e08b13f1d05c2888e84c50da6b4,"i'm leaving for work now my precious princess,..."
5751,ffeb61465be684db5e17e2c3d84582cd,ill be on here in like 3 hours ok its 10am you...


## Sentiment analysis groomers

In [29]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())

2.4.1+cu121
True


In [30]:
# import transformers pipeline
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm
2024-11-25 11:34:12.392306: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-25 11:34:12.785404: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [31]:
from torch.nn import DataParallel
# check if GPU available
# 0 when CUDA is usable, otherwise -1.
device = 0 if torch.cuda.is_available() else -1
print("Using device:", device)
# Indices of every visible CUDA device (an empty list when there is none).
device_count = torch.cuda.device_count()
device_ids = list(range(device_count))
# Bare expression so the notebook displays the list.
device_ids

Using device: 0


[0]

### BERT

In [33]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn import DataParallel
import numpy as np

# check if GPUs are available
device_count = torch.cuda.device_count()  # Get the number of GPUs
device_ids = list(range(device_count))  # List of GPU IDs (e.g., [0, 1, 2, 3] for 4 GPUs)

# load model and tokenizer
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Wrap in DataParallel and move to CUDA only when a GPU exists. On a CPU-only
# machine the plain model is kept, since `.cuda()` would raise there;
# classify_sentiment_batch already copes with an unwrapped model.
if device_count > 0:
    model = DataParallel(model, device_ids=device_ids).cuda()

# Function to classify sentiment for a batch of texts
def classify_sentiment_batch(batch):
    """Predict a class index for every text in ``batch``.

    Relies on the notebook-level ``tokenizer`` and ``model``.

    Args:
        batch: List of strings to classify together.

    Returns:
        NumPy array holding the argmax class index of each input, in order.
        (This notebook maps index 1 to 'Positive' and anything else to
        'Negative'.)
    """
    # A DataParallel wrapper exposes the real network as `.module`; an
    # unwrapped model carries `.device` itself.
    network = model.module if hasattr(model, 'module') else model
    device = network.device

    # Pad to the longest text in the batch and truncate over-long inputs.
    # NOTE(review): with no max_length given, truncation presumably cuts at the
    # tokenizer's model limit, so long conversations are only partly scored — confirm.
    encoded = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
    encoded = encoded.to(device)

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        outputs = model(**encoded)
        logits = outputs.logits

    return torch.argmax(logits, dim=-1).cpu().numpy()

# function to process batches of texts in parallel across multiple GPUs
def classify_batch_in_parallel(texts, batch_size=16):
    """Classify ``texts`` in consecutive chunks of ``batch_size``.

    The chunks are handed to ``classify_sentiment_batch`` one after another;
    any multi-GPU parallelism happens inside the model call, not in this loop.

    Args:
        texts: List of strings to classify.
        batch_size: Number of texts per model call; must be at least 1.

    Returns:
        Flat list with one predicted class index per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (a negative size used
            to produce an empty result silently; zero used to divide by zero).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    predictions = []
    # Step through the list in strides of batch_size; the last chunk may be shorter.
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        predictions.extend(classify_sentiment_batch(batch))

    return predictions

# Run the classifier over the joined groomer-authored text of each conversation.
texts = groomer_conversations['text'].tolist() 
predictions = classify_batch_in_parallel(texts)

# assign predictions to the DataFrame (Positive/Negative labels)
# Class index 1 becomes 'Positive'; every other index becomes 'Negative'.
groomer_conversations['BERT'] = ['Positive' if p == 1 else 'Negative' for p in predictions]

print(groomer_conversations)


                       conversation_id  \
0     0013449f76e98b51b85505c5bd98d61a   
1     0014c1b4278df4bf8ea8a20b7abdb13c   
2     001642060867dc1119343316fc21926c   
3     0027f42308aad202fcb7224597a7ac1a   
4     002cac6e3f890fdc2ff5aa13e2c51e12   
...                                ...   
5748  ffd257ae045d84529fe6b8b8797f91b2   
5749  ffd55257c660e70e17cd38834037a46c   
5750  ffdc2e08b13f1d05c2888e84c50da6b4   
5751  ffeb61465be684db5e17e2c3d84582cd   
5752  fff6daff608e7b65936e42279bf0e13d   

                                                   text      BERT  
0                      hi.. let me know when you are on  Positive  
1     hi how r ya? was out with a buddy u miss me hu...  Negative  
2     hi wanna play on phone ok ya i did and im supe...  Negative  
3     hey baby kool it wont take me two hours baby o...  Negative  
4     yeah i am on pretty late at night now :( don't...  Positive  
...                                                 ...       ...  
5748  hi my sweet j

In [34]:
# print how many conversations are positive/negative
# Count rows per sentiment by summing the boolean comparison instead of filtering the frame.
bert_pos_count = int((groomer_conversations['BERT'] == 'Positive').sum())
print(f"BERT: Number of conversations labeled as 'Positive': {bert_pos_count}")
bert_neg_count = int((groomer_conversations['BERT'] == 'Negative').sum())
print(f"BERT: Number of conversations labeled as 'Negative': {bert_neg_count}")

BERT: Number of conversations labeled as 'Positive': 2695
BERT: Number of conversations labeled as 'Negative': 3058


In [35]:
# save conversation ids of groomer conversations with positive tone
# Keep (in memory, nothing is written to disk here) the groomer conversations
# classified 'Positive', then just their ids as a single-column frame.
pos_groomer_bert_conversations = groomer_conversations.loc[groomer_conversations['BERT'] == 'Positive']
pos_bert_conv_id_groomer = pos_groomer_bert_conversations.loc[:, ['conversation_id']]

In [36]:
# save conversation ids of groomer conversations with negative tone
# Keep (in memory, nothing is written to disk here) the groomer conversations
# classified 'Negative', then just their ids as a single-column frame.
neg_groomer_bert_conversations = groomer_conversations.loc[groomer_conversations['BERT'] == 'Negative']
neg_bert_conv_id_groomer = neg_groomer_bert_conversations.loc[:, ['conversation_id']]

## Extract non-grooming conversations

In [41]:
# dataframe with only non-predatory messages
# Message rows of every conversation whose conversation-level label is 0.
non_predatory = combined_df.loc[combined_df['label'].eq(0)]

In [42]:
# Distinct conversations labelled 0, and distinct authors who wrote in them.
print('Total non-predator conversations:', non_predatory['conversation_id'].nunique())
print('Total authors:', non_predatory['author'].nunique())

Total non-predator conversations: 216302
Total authors: 307235


In [43]:
# group by 'conversation_id' 
# One row per non-grooming conversation: all of its messages joined with single
# spaces, in the row order of non_predatory.
non_groomer_conversations = (
    non_predatory
    .groupby('conversation_id')['text']
    .agg(' '.join)
    .reset_index()
)
non_groomer_conversations

Unnamed: 0,conversation_id,text
0,000049c4530615e68b898b3e0306630d,hi hi fr?
1,0000604306a283600b730276a2039471,b8810fee2f4a71f849f3f7409546d1d9 - do you have...
2,000133dbd971ffb8f723fc61ba977ca0,hey heyy hej din fjant
3,0001347c00d419eb537c0692e6e58eba,say asl and i'll rip your heart out asl grrr -...
4,000161e288cf8dfc468fe86d6d4af2d4,h hi heeeyy asl ?
...,...,...
216297,fffe4d1b08952afb8627a9b594f913c7,do you want to chat? here not on msn i don't h...
216298,ffff2d0e314610b1df596482d806ada9,haiiiiiiiii. can you help me? ): sure.. with w...
216299,ffff38287b6013960b9e96e08f85526a,hi asl? hello kjlkj
216300,ffff74f40b58182a2521235b9db901d4,hey hi lookingfor girl? r u girl? yes u? what ...


## Sentiment analysis non-groomers

### BERT

In [45]:
# NOTE: `texts` and `predictions` are rebound here; they previously held the
# groomer-conversation inputs and outputs.
texts = non_groomer_conversations['text'].tolist() 
predictions = classify_batch_in_parallel(texts)

# assign predictions to the DataFrame (Positive/Negative labels)
# Class index 1 becomes 'Positive'; every other index becomes 'Negative'.
non_groomer_conversations['BERT'] = ['Positive' if p == 1 else 'Negative' for p in predictions]

print(non_groomer_conversations)

                         conversation_id  \
0       000049c4530615e68b898b3e0306630d   
1       0000604306a283600b730276a2039471   
2       000133dbd971ffb8f723fc61ba977ca0   
3       0001347c00d419eb537c0692e6e58eba   
4       000161e288cf8dfc468fe86d6d4af2d4   
...                                  ...   
216297  fffe4d1b08952afb8627a9b594f913c7   
216298  ffff2d0e314610b1df596482d806ada9   
216299  ffff38287b6013960b9e96e08f85526a   
216300  ffff74f40b58182a2521235b9db901d4   
216301  ffffe01fc5b03a8d6b8c929d595644d9   

                                                     text      BERT  
0                                               hi hi fr?  Positive  
1       b8810fee2f4a71f849f3f7409546d1d9 - do you have...  Negative  
2                                  hey heyy hej din fjant  Positive  
3       say asl and i'll rip your heart out asl grrr -...  Positive  
4                                       h hi heeeyy asl ?  Negative  
...                                                

In [46]:
# Count rows per sentiment by summing the boolean comparison (rebinds the two
# counters used earlier for the groomer conversations).
bert_pos_count = int((non_groomer_conversations['BERT'] == 'Positive').sum())
print(f"BERT: Number of conversations labeled as 'Positive': {bert_pos_count}")
bert_neg_count = int((non_groomer_conversations['BERT'] == 'Negative').sum())
print(f"BERT: Number of conversations labeled as 'Negative': {bert_neg_count}")

BERT: Number of conversations labeled as 'Positive': 69739
BERT: Number of conversations labeled as 'Negative': 146563


In [47]:
# save conversation ids of non-groomer conversations with positive tone
# Keep (in memory only) the non-grooming conversations classified 'Positive',
# then just their ids as a single-column frame.
pos_non_groomer_bert_conversations = non_groomer_conversations.loc[non_groomer_conversations['BERT'] == 'Positive']
pos_bert_conv_id_non_groomer = pos_non_groomer_bert_conversations.loc[:, ['conversation_id']]

In [48]:
# save conversation ids of non-groomer conversations with negative tone
# Keep (in memory only) the non-grooming conversations classified 'Negative',
# then just their ids as a single-column frame.
neg_non_groomer_bert_conversations = non_groomer_conversations.loc[non_groomer_conversations['BERT'] == 'Negative']
neg_bert_conv_id_non_groomer = neg_non_groomer_bert_conversations.loc[:, ['conversation_id']]

## One big dataframe

In [54]:
# group by conversation_id
# Build one "author: message" transcript string per conversation.
# Only the two columns the lambda reads are selected, so apply no longer
# operates on the grouping column (recent pandas versions warn about that).
# The unnamed result becomes a column called 0 after reset_index.
dataset_grouped = (
    combined_df
    .groupby('conversation_id')[['author', 'text']]
    .apply(
        lambda x: ' '.join(f"{speaker}: {message}" for speaker, message in zip(x['author'], x['text']))
    )
    .reset_index()
)

In [55]:
dataset_grouped

Unnamed: 0,conversation_id,0
0,000049c4530615e68b898b3e0306630d,53a66119381d887197c67ccfe3ef6670: hi 1c8edb8bf...
1,0000604306a283600b730276a2039471,a9b326df4e6da61c5b6f5e1058be83a2: b8810fee2f4a...
2,000133dbd971ffb8f723fc61ba977ca0,8f1d151f40bd785177dec682f5407c4e: hey 3b8f9119...
3,0001347c00d419eb537c0692e6e58eba,e2bd430b29412d9267886e187ba28075: say asl and ...
4,000161e288cf8dfc468fe86d6d4af2d4,b035925d950f4a032b68dd0844ff8413: h b035925d95...
...,...,...
222050,fffe4d1b08952afb8627a9b594f913c7,e5a96ed432ed5041be76d3fb1784fb95: do you want ...
222051,ffff2d0e314610b1df596482d806ada9,eccc65c89e622a83cfec5827c16391de: haiiiiiiiii....
222052,ffff38287b6013960b9e96e08f85526a,a9343d850a27be6ed37f176bc2ce589b: hi a9343d850...
222053,ffff74f40b58182a2521235b9db901d4,7bc167d759d9c56d43d1d46575433d35: hey 169b2106...


In [56]:
# Stack the 'Positive' conversation ids of both groups into one single-column frame.
positive_sentiments_bert = pd.concat([pos_bert_conv_id_non_groomer, pos_bert_conv_id_groomer], axis=0)
# Plain Python list of those ids, used for the membership test in the next cell.
positive_bert = positive_sentiments_bert['conversation_id'].tolist()

In [57]:
# Tag every conversation 'Positive' when its id is in positive_bert, else 'Negative'.
# isin hashes the id list once; the previous `cid in positive_bert` scanned the
# whole list again for every row.
dataset_grouped['sentiment'] = (
    dataset_grouped['conversation_id']
    .isin(positive_bert)
    .map({True: 'Positive', False: 'Negative'})
)

In [58]:
dataset_grouped

Unnamed: 0,conversation_id,0,sentiment
0,000049c4530615e68b898b3e0306630d,53a66119381d887197c67ccfe3ef6670: hi 1c8edb8bf...,Positive
1,0000604306a283600b730276a2039471,a9b326df4e6da61c5b6f5e1058be83a2: b8810fee2f4a...,Negative
2,000133dbd971ffb8f723fc61ba977ca0,8f1d151f40bd785177dec682f5407c4e: hey 3b8f9119...,Positive
3,0001347c00d419eb537c0692e6e58eba,e2bd430b29412d9267886e187ba28075: say asl and ...,Positive
4,000161e288cf8dfc468fe86d6d4af2d4,b035925d950f4a032b68dd0844ff8413: h b035925d95...,Negative
...,...,...,...
222050,fffe4d1b08952afb8627a9b594f913c7,e5a96ed432ed5041be76d3fb1784fb95: do you want ...,Negative
222051,ffff2d0e314610b1df596482d806ada9,eccc65c89e622a83cfec5827c16391de: haiiiiiiiii....,Negative
222052,ffff38287b6013960b9e96e08f85526a,a9343d850a27be6ed37f176bc2ce589b: hi a9343d850...,Positive
222053,ffff74f40b58182a2521235b9db901d4,7bc167d759d9c56d43d1d46575433d35: hey 169b2106...,Positive


In [59]:
# Positional rename: the transcript column produced by groupby/apply is called 0.
dataset_grouped.columns = ['conversation_id', 'text', 'sentiment']

In [63]:
# Conversations per final sentiment tag, counted by summing the boolean comparison.
print(f"Bert:Number of conversations labeled as 'Positive': {(dataset_grouped['sentiment'] == 'Positive').sum()}")
print(f"Bert:Number of conversations labeled as 'Negative': {(dataset_grouped['sentiment'] == 'Negative').sum()}")

Bert:Number of conversations labeled as 'Positive': 72434
Bert:Number of conversations labeled as 'Negative': 149621


In [65]:
# Persist the conversation-level table (conversation_id, text, sentiment) without the row index.
dataset_grouped.to_csv('data/dataset_w_sentiments.csv', index=False)

In [66]:
dataset_grouped

Unnamed: 0,conversation_id,text,sentiment
0,000049c4530615e68b898b3e0306630d,53a66119381d887197c67ccfe3ef6670: hi 1c8edb8bf...,Positive
1,0000604306a283600b730276a2039471,a9b326df4e6da61c5b6f5e1058be83a2: b8810fee2f4a...,Negative
2,000133dbd971ffb8f723fc61ba977ca0,8f1d151f40bd785177dec682f5407c4e: hey 3b8f9119...,Positive
3,0001347c00d419eb537c0692e6e58eba,e2bd430b29412d9267886e187ba28075: say asl and ...,Positive
4,000161e288cf8dfc468fe86d6d4af2d4,b035925d950f4a032b68dd0844ff8413: h b035925d95...,Negative
...,...,...,...
222050,fffe4d1b08952afb8627a9b594f913c7,e5a96ed432ed5041be76d3fb1784fb95: do you want ...,Negative
222051,ffff2d0e314610b1df596482d806ada9,eccc65c89e622a83cfec5827c16391de: haiiiiiiiii....,Negative
222052,ffff38287b6013960b9e96e08f85526a,a9343d850a27be6ed37f176bc2ce589b: hi a9343d850...,Positive
222053,ffff74f40b58182a2521235b9db901d4,7bc167d759d9c56d43d1d46575433d35: hey 169b2106...,Positive
