In [40]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from transformers import AutoTokenizer

# SemEval-2016 Task 6

[Website](https://alt.qcri.org/semeval2016/task6/index.php?id=data-and-tools), [Data visualization](https://www.saifmohammad.com/WebPages/StanceDataset.htm)

In [41]:
# semeval-2016
!wget http://alt.qcri.org/semeval2016/task6/data/uploads/stancedataset.zip
!unzip /content/stancedataset.zip

--2023-12-04 19:47:20--  http://alt.qcri.org/semeval2016/task6/data/uploads/stancedataset.zip
Resolving alt.qcri.org (alt.qcri.org)... 80.76.166.231
Connecting to alt.qcri.org (alt.qcri.org)|80.76.166.231|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://alt.qcri.org/semeval2016/task6/data/uploads/stancedataset.zip [following]
--2023-12-04 19:47:20--  https://alt.qcri.org/semeval2016/task6/data/uploads/stancedataset.zip
Connecting to alt.qcri.org (alt.qcri.org)|80.76.166.231|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 262993 (257K) [application/zip]
Saving to: ‘stancedataset.zip.1’


2023-12-04 19:47:22 (443 KB/s) - ‘stancedataset.zip.1’ saved [262993/262993]

Archive:  /content/stancedataset.zip
replace StanceDataset/test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [42]:
# Exploratory load of the training split: read the CSV under fixed column names,
# turn the stance labels into integer ids, and preview the first rows.
csv_file = 'StanceDataset/train.csv'
col_names = ['tweet', 'target', 'stance', 'opinion towards', 'sentiment']
stance2id = dict(AGAINST=0, FAVOR=1, NONE=2)

df = pd.read_csv(
    csv_file,
    header=0,              # the file's own header row is consumed and replaced by col_names
    names=col_names,
    encoding='unicode_escape',
    lineterminator='\r',
)
# A label missing from stance2id raises KeyError here instead of silently becoming NaN.
df['stance'] = df['stance'].apply(lambda label: stance2id[label])
df.head()

Unnamed: 0,tweet,target,stance,opinion towards,sentiment
0,"@tedcruz And, #HandOverTheServer she wiped cle...",Hillary Clinton,0,1. The tweet explicitly expresses opinion abo...,neg
1,Hillary is our best choice if we truly want to...,Hillary Clinton,1,1. The tweet explicitly expresses opinion abo...,pos
2,@TheView I think our country is ready for a fe...,Hillary Clinton,0,1. The tweet explicitly expresses opinion abo...,neg
3,I just gave an unhealthy amount of my hard-ear...,Hillary Clinton,0,1. The tweet explicitly expresses opinion abo...,neg
4,@PortiaABoulger Thank you for adding me to you...,Hillary Clinton,2,3. The tweet is not explicitly expressing opi...,pos


In [43]:
# Dimensions of the loaded training frame as (rows, columns).
df.shape

(2914, 5)

In [44]:
class SemEval2016Dataset(Dataset):
    """Torch dataset over one SemEval-2016 Task 6 stance CSV file.

    Indexing yields ``(input_ids, attention_mask, target, stance)``: the two
    flattened tokenizer outputs for the tweet (padded/truncated to
    ``max_length``) followed by the integer ids of the row's target and stance.
    """

    # Label vocabularies. A label in the CSV that is missing here raises
    # KeyError while the file is being loaded.
    _STANCE_IDS = {'AGAINST': 0, 'FAVOR': 1, 'NONE': 2}
    _TARGET_IDS = {
        'Atheism': 0,
        'Climate Change is a Real Concern': 1,
        'Donald Trump': 2,
        'Feminist Movement': 3,
        'Hillary Clinton': 4,
        'Legalization of Abortion': 5,
    }
    # Column names imposed on the CSV (its own header row is discarded).
    _COLUMNS = ['tweet', 'target', 'stance', 'opinion towards', 'sentiment']

    def __init__(self, csv_file, tokenizer, max_length):
        """Read ``csv_file`` and encode its stance/target columns as integer ids.

        Args:
            csv_file: Path of the CSV file to read.
            tokenizer: Callable applied to each tweet in ``__getitem__``; it must
                accept the keyword arguments used there and return a mapping
                with ``'input_ids'`` and ``'attention_mask'`` tensors.
            max_length: Length passed to the tokenizer for padding/truncation.
        """
        frame = pd.read_csv(
            csv_file,
            header=0,
            names=self._COLUMNS,
            encoding='unicode_escape',
            lineterminator='\r',
        )
        label_tables = (('stance', self._STANCE_IDS), ('target', self._TARGET_IDS))
        for column, table in label_tables:
            # Bind the table per iteration so each column uses its own vocabulary.
            frame[column] = frame[column].apply(lambda label, table=table: table[label])

        self.df = frame
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the loaded frame."""
        return len(self.df)

    def __getitem__(self, ix):
        """Tokenize the tweet at position ``ix`` and return it with its label ids."""
        tweet, target, stance = (
            self.df[column].iloc[ix] for column in ('tweet', 'target', 'stance')
        )
        encoded = self.tokenizer(
            tweet,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # The tokenizer output is flattened to one dimension per example.
        input_ids = torch.flatten(encoded['input_ids'])
        attention_mask = torch.flatten(encoded['attention_mask'])
        return input_ids, attention_mask, target, stance

In [45]:
def get_semeval_dataset(tokenizer='vinai/bertweet-large', max_length=128, train=True):
    """Build a ``SemEval2016Dataset`` for the train or test split.

    Args:
        tokenizer: Either a model name/path, which is loaded through
            ``AutoTokenizer.from_pretrained``, or an already-loaded tokenizer
            (any callable), which is used as-is. Passing a loaded tokenizer lets
            the train and test splits share one instance instead of reloading it
            on every call.
        max_length: Padding/truncation length forwarded to the dataset.
        train: Truthy selects ``StanceDataset/train.csv``; falsy selects
            ``StanceDataset/test.csv``.

    Returns:
        The ``SemEval2016Dataset`` built from the selected CSV file.
    """
    csv_file = 'StanceDataset/train.csv' if train else 'StanceDataset/test.csv'

    # Names and paths are not callable, loaded tokenizers are; only the former
    # need to go through from_pretrained.
    if not callable(tokenizer):
        tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    return SemEval2016Dataset(csv_file, tokenizer, max_length)

In [46]:
# Build the dataset with all defaults (training split, default tokenizer and
# max length) and display its first item as a sanity check.
dataset = get_semeval_dataset()
dataset[0]

(tensor([    0,  1039,  5357, 32023,   329,   178,     6,   849, 21292, 10777,
           133, 47576,    79, 16492,  2382,  2055,   389,   330, 13908,  5575,
             6,  4529, 25872, 38068,     9,  4053,    73, 15124,   769,   849,
         17521, 49050,     6, 42624,   849, 42988,  1242,     2,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,  

In [47]:
def get_semeval_data_loader(batch_size=128, shuffle=True, tokenizer='vinai/bertweet-large', max_length=128, train=True):
    """Wrap the selected SemEval split in a ``DataLoader``.

    ``tokenizer``, ``max_length`` and ``train`` are forwarded to
    ``get_semeval_dataset``; ``batch_size`` and ``shuffle`` configure the loader.
    """
    split = get_semeval_dataset(tokenizer=tokenizer, max_length=max_length, train=train)
    return DataLoader(split, batch_size=batch_size, shuffle=shuffle)

In [50]:
# Preview: print the four components of the first six batches (indices 0-5)
# from a loader with batch size 4.
loader = get_semeval_data_loader(batch_size=4)
for ix, batch in enumerate(loader):
    input_ids, attention_mask, target, stance = batch
    print(
        f'{input_ids=}',
        f'{attention_mask=}',
        f'{target=}',
        f'{stance=}',
        sep='\n',
    )
    if ix == 5:
        break

input_ids=tensor([[    0,   100,    40,    45,  1762,  4402,    13,   127,   301,     6,
            13,    19,  4402,     5,  5736,    40, 21104,   162,    25,    10,
         15978,   111, 13438,     4,   195,    35,  1092,   849, 37504,  4014,
             2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1

# COVID-19 Stance Detection

[GitHub repository](https://github.com/kglandt/stance-detection-in-covid-19-tweets/tree/main)

In [49]:
# Root of the raw dataset files in the COVID-19 stance detection repository.
# NOTE(review): the lists below hold bare file names, presumably to be joined
# with base_url; that usage is not shown in this part of the notebook.
base_url = 'https://raw.githubusercontent.com/kglandt/stance-detection-in-covid-19-tweets/main/dataset/'

# Every split provides one CSV per topic, named '<topic>_<split>.csv'.
_covid_topics = ('face_masks', 'fauci', 'school_closures', 'stay_at_home_orders')

train_urls = [f'{topic}_train.csv' for topic in _covid_topics]

train_noisy = [f'{topic}_train_noisy.csv' for topic in _covid_topics]

val_urls = [f'{topic}_val.csv' for topic in _covid_topics]

test_urls = [f'{topic}_test.csv' for topic in _covid_topics]