In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


# SemEval-2016 Task 6

[Website](https://alt.qcri.org/semeval2016/task6/index.php?id=data-and-tools), [Data visualization](https://www.saifmohammad.com/WebPages/StanceDataset.htm)

In [41]:
# semeval-2016
!wget http://alt.qcri.org/semeval2016/task6/data/uploads/stancedataset.zip
!unzip /content/stancedataset.zip

--2023-12-04 19:47:20--  http://alt.qcri.org/semeval2016/task6/data/uploads/stancedataset.zip
Resolving alt.qcri.org (alt.qcri.org)... 80.76.166.231
Connecting to alt.qcri.org (alt.qcri.org)|80.76.166.231|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://alt.qcri.org/semeval2016/task6/data/uploads/stancedataset.zip [following]
--2023-12-04 19:47:20--  https://alt.qcri.org/semeval2016/task6/data/uploads/stancedataset.zip
Connecting to alt.qcri.org (alt.qcri.org)|80.76.166.231|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 262993 (257K) [application/zip]
Saving to: ‘stancedataset.zip.1’


2023-12-04 19:47:22 (443 KB/s) - ‘stancedataset.zip.1’ saved [262993/262993]

Archive:  /content/stancedataset.zip
replace StanceDataset/test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [42]:
csv_file = 'StanceDataset/train.csv'
col_names = ['tweet', 'target', 'stance', 'opinion towards', 'sentiment']
stance2id = {'AGAINST': 0, 'FAVOR': 1, 'NONE': 2}

df = pd.read_csv(csv_file, encoding='unicode_escape', lineterminator='\r', header=0, names=col_names)
df['stance'] = df['stance'].apply(lambda x: stance2id[x])
df.head()

Unnamed: 0,tweet,target,stance,opinion towards,sentiment
0,"@tedcruz And, #HandOverTheServer she wiped cle...",Hillary Clinton,0,1. The tweet explicitly expresses opinion abo...,neg
1,Hillary is our best choice if we truly want to...,Hillary Clinton,1,1. The tweet explicitly expresses opinion abo...,pos
2,@TheView I think our country is ready for a fe...,Hillary Clinton,0,1. The tweet explicitly expresses opinion abo...,neg
3,I just gave an unhealthy amount of my hard-ear...,Hillary Clinton,0,1. The tweet explicitly expresses opinion abo...,neg
4,@PortiaABoulger Thank you for adding me to you...,Hillary Clinton,2,3. The tweet is not explicitly expressing opi...,pos


In [43]:
df.shape

(2914, 5)

In [44]:
class SemEval2016Dataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length):
        stance2id = {
            'AGAINST': 0,
            'FAVOR': 1,
            'NONE': 2
        }
        target2id = {
            'Atheism': 0,
            'Climate Change is a Real Concern': 1,
            'Donald Trump': 2,
            'Feminist Movement': 3,
            'Hillary Clinton': 4,
            'Legalization of Abortion': 5
        }
        col_names = ['tweet', 'target', 'stance', 'opinion towards', 'sentiment']

        df = pd.read_csv(csv_file, encoding='unicode_escape', lineterminator='\r',
                         header=0, names = col_names)
        df['stance'] = df['stance'].apply(lambda x: stance2id[x])
        df['target'] = df['target'].apply(lambda x: target2id[x])
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, ix):
        tweet = self.df['tweet'].iloc[ix]
        target = self.df['target'].iloc[ix]
        stance = self.df['stance'].iloc[ix]
        encoding = self.tokenizer(tweet,
                                  return_tensors='pt',
                                  max_length=self.max_length,
                                  padding='max_length',
                                  truncation=True)
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        return input_ids, attention_mask, target, stance

In [45]:
def get_semeval_dataset(tokenizer='vinai/bertweet-large', max_length=128, train=True):
    if train:
        csv_file = 'StanceDataset/train.csv'
    else:
        csv_file = 'StanceDataset/test.csv'
    tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    return SemEval2016Dataset(csv_file, tokenizer, max_length)

In [46]:
dataset = get_semeval_dataset()
dataset[0]

(tensor([    0,  1039,  5357, 32023,   329,   178,     6,   849, 21292, 10777,
           133, 47576,    79, 16492,  2382,  2055,   389,   330, 13908,  5575,
             6,  4529, 25872, 38068,     9,  4053,    73, 15124,   769,   849,
         17521, 49050,     6, 42624,   849, 42988,  1242,     2,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,  

In [47]:
def get_semeval_data_loader(batch_size=128, shuffle=True, tokenizer='vinai/bertweet-large', max_length=128, train=True):
    data = get_semeval_dataset(tokenizer, max_length, train)
    return DataLoader(data, batch_size, shuffle)

In [50]:
loader = get_semeval_data_loader(batch_size=4)
for ix, (input_ids, attention_mask, target, stance) in enumerate(loader):
    print(f'{input_ids=}')
    print(f'{attention_mask=}')
    print(f'{target=}')
    print(f'{stance=}')
    if ix == 5:
        break

input_ids=tensor([[    0,   100,    40,    45,  1762,  4402,    13,   127,   301,     6,
            13,    19,  4402,     5,  5736,    40, 21104,   162,    25,    10,
         15978,   111, 13438,     4,   195,    35,  1092,   849, 37504,  4014,
             2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1

# COVID-19 Stance Detection

[github](https://github.com/kglandt/stance-detection-in-covid-19-tweets/tree/main)

In [49]:
base_url = 'https://raw.githubusercontent.com/kglandt/stance-detection-in-covid-19-tweets/main/dataset/'

train_urls = [
    'face_masks_train.csv',
    'fauci_train.csv',
    'school_closures_train.csv',
    'stay_at_home_orders_train.csv',
]

train_noisy = [
    'face_masks_train_noisy.csv',
    'fauci_train_noisy.csv',
    'school_closures_train_noisy.csv',
    'stay_at_home_orders_train_noisy.csv',
]

val_urls = [
    'face_masks_val.csv',
    'fauci_val.csv',
    'school_closures_val.csv',
    'stay_at_home_orders_val.csv',
]

test_urls = [
    'face_masks_test.csv',
    'fauci_test.csv',
    'school_closures_test.csv',
    'stay_at_home_orders_test.csv',
]

# Testing

In [3]:
df = pd.read_csv('original_test.csv')
df.head()

Unnamed: 0,Tweet,Target,Stance
0,"@tedcruz And, #HandOverTheServer she wiped cle...",Hillary Clinton,AGAINST
1,Hillary is our best choice if we truly want to...,Hillary Clinton,FAVOR
2,@TheView I think our country is ready for a fe...,Hillary Clinton,AGAINST
3,I just gave an unhealthy amount of my hard-ear...,Hillary Clinton,AGAINST
4,@PortiaABoulger Thank you for adding me to you...,Hillary Clinton,NONE


In [5]:
stance2id = {
            'AGAINST': 0,
            'FAVOR': 1,
            'NONE': 2
        }

df['Stance'] = df['Stance'].apply(lambda x: stance2id[x])
df.head()
df.to_csv('original_test.csv')

KeyError: 0

In [8]:
df.to_csv('original_test.csv', index=False)

In [12]:
llama = '../augment/data/llama_augment.tsv'
llama_df = pd.read_csv(llama, header=0, names=['Tweet', 'Target', 'Stance'])
llama_df.to_csv('../augment/data/llama_augment.csv', index=False)

In [13]:
df = pd.read_csv(f'../augment/data/cmlm/abortion_cmlm.tsv', sep='\t', header=0, names=['Tweet', 'Target', 'Stance'])
df.head()

Unnamed: 0,Tweet,Target,Stance
0,Where is the childcare program @joanburton whi...,Legalization of Abortion,0
1,I get several requests with petitions to save ...,Legalization of Abortion,0
2,"we must always see others as Christ sees us,we...",Legalization of Abortion,0
3,PRAYERS FOR BABIES Urgent prayer one in Lexing...,Legalization of Abortion,0
4,I do not want to bring a child into this dange...,Legalization of Abortion,0


In [19]:
df = pd.DataFrame()

for t in ['abortion', 'atheism', 'climate', 'donald', 'feminism', 'hillary']:
    # df = pd.read_csv(f'../augment/data/cmlm/{t}_cmlm.tsv', sep='\t', header=0, names=['Tweet', 'Target', 'Stance'])
    # df.to_csv(f'../augment/data/cmlm/{t}_cmlm.csv', index=False)
    t_df = pd.read_csv(f'../augment/data/cmlm/{t}_cmlm.csv')
    df = pd.concat((df, t_df), axis=0)

df.to_csv('../augment/data/cmlm/cmlm_augmented.csv', index=False)

In [33]:
df = pd.read_csv('../augment/data/llama_augmented.csv')

for ix, row in enumerate(df.iterrows()):
    print(row[1]['Tweet'])
    if ix == 5: break

No other president has risen to the level of Obama's leadership since JFK, #HandOverTheServer she wiped clean + 30k deleted emails, explains dereliction of duty/lies re #Benghazi, etc #tcot.
It's crucial that we improve our efforts in encouraging people to get vaccinated, #HandOverTheServer she wiped clean + 30k deleted emails, explains dereliction of duty/lies re #Benghazi, etc #tcot.
Hillary is our best choice if we truly want to continue being a progressive nation. #Ohio,
Hillary embodies the values of a progressive nation, and her leadership will undoubtedly advance our country in the right direction. #Ohio.
By selecting Hillary as our leader, we can ensure a continued commitment to progress and social justice. #Ohio.
Hillary's dedication to advancing social and economic equality makes her the ideal candidate for progressive Americans. #Ohio.


In [34]:
from datasets import load_dataset

In [35]:
tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')

In [49]:
ds_csv = '../augment/data/llama_augmented.csv'

ds = load_dataset('csv', data_files=ds_csv, split='train')

def preprocess_data(batch_tweet):
    encoding = tokenizer(batch_tweet['Tweet'],
                         return_tensors='pt',
                         truncation=True,
                         max_length=128,
                         padding='max_length')
    encoding['label'] = batch_tweet['Stance']
    return encoding

encoded_split_ds = ds.map(preprocess_data,
                                batched=True,
                                batch_size=1)
encoded_split_ds
# 768, 3738

Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 4877.10it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 472.76it/s]
Generating train split: 4535 examples [00:00, 215030.51 examples/s]
Map: 100%|██████████| 4535/4535 [00:08<00:00, 534.68 examples/s]


Dataset({
    features: ['Tweet', 'Target', 'Stance', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 4535
})