In [1]:
import pandas as pd
import re
import torch
from google.colab import drive, files
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip3 install transformers
!cp /content/drive/MyDrive/MLCYBER/utils_fake_news.py .

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 37.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 71.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 69.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [3]:
%run utils_fake_news.py

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
negate_dict = {"isn't":"is",
    "isn\\'t":"is",
    "is not ":"is ",
    "is ":"is not ",
    "didn't":"did",
    "didn\\'t":"did",
    "did not ":"did",
    "does not have":"has",
    "doesn't have":"has",
    "doesn\\'t have":"has",
    "has ":"does not have ",
    "shouldn't":"should",
    "shouldn\\'t":"should",
    "should not":"should",
    "should":"should not",
    "wouldn't":"would",
    "wouldn\\'t":"would",
    "would not":"would",
    "would":"would not",
    "mustn't":"must",
    "mustn\\'t":"must",
    "must not":"must",
    "must ":"must not ",
    "can't":"can",
    "can\\'t":"can",
    "cannot":"can",
    " can ":" cannot "}

IRREGULAR_ES_VERB_ENDINGS = ["ss", "x", "ch", "sh", "o"]

def negate(sentence):

  for key in negate_dict.keys():
    if sentence.find(key) > -1:
      return sentence.replace(key, negate_dict[key])
      #breakbreak #不知道需不需要加个break

  # doesn't work -> works
  doesnt_regex = r'(doesn\'t|doesn\\\'t|does not) (?P<verb>\w+)'
  #r的意思是后面的string无需转义

  #正则表达式
  if re.search(doesnt_regex, sentence):
    print(re.search(doesnt_regex, sentence).groupdict())
    return re.sub(doesnt_regex, replace_doesnt, sentence, 1)

  return None

def __is_consonant(letter):
  return letter not in ['a', 'e', 'i', 'o', 'u', 'y']

#下面这个函数就是去掉does not之后需要考虑后面动词三单的情况
def replace_doesnt(matchobj):
  verb = matchobj.group(2)
  #verb是doesn't什么后面的动词

  if verb.endswith("y") and __is_consonant(verb[-2]):
    return "{0}ies".format(verb[0:-1])

  for ending in IRREGULAR_ES_VERB_ENDINGS:
    if verb.endswith(ending):
      return "{0}es".format(verb)

  return "{0}s".format(verb)

def replace_verb(matchobj):
  subject = matchobj.group(1)
  verb = matchobj.group(2)
  whitespace = matchobj.group(3)

  # flies -> fly, but not die -> dy
  if verb.endswith("ie") and len(verb) > 3:
    verb = "{0}y".format(verb[0:-2])

  # stresses -> stress
  for ending in IRREGULAR_ES_VERB_ENDINGS:
    if verb.endswith("{0}e".format(ending)):
      verb = verb[0:-1]

  return "{0}does not {1}{2}".format(subject, verb, whitespace)

## Fake News

In [15]:
# Read in data
df = pd.read_csv("/content/drive/MyDrive/MLCYBER/Data/Fake-News/test.csv")
df.columns = ['statement','label']

# Clean and negate
df['statement'] = df['statement'].apply(lambda x: x.lower().replace('’',"'"))
df_neg = df.copy()
df_neg['statement'] = df_neg['statement'].apply(negate)

df_neg = df_neg.loc[~df_neg.statement.isnull()]
df_pos = df.loc[df_neg.index].reset_index(drop=True)
df_neg = df_neg.reset_index(drop=True)

# Relabel
df_neg['label'] = df_neg['label'].apply(lambda x: 0 if x==1 else 1)

# Save encoded versions for FakeBERT
# torch.save(encode_dataframe(df['statement'], df['label']),
#            '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/fake_news/training/fake_news.pt')
# torch.save(encode_dataframe(df_pos['statement'], df_pos['label']),
#            '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/fake_news/evaluation/fake_news_pos.pt')
# torch.save(encode_dataframe(df_neg['statement'], df_neg['label']),
#            '/content/drive/MyDrive/fake-news-explainability/Data/Encoded/fake_news/evaluation/fake_news_neg.pt')

# Save CSV versions for FakeBERT TF-IDF
df_pos.to_csv('/content/drive/MyDrive/MLCYBER/Data/Fake-News/fake_news_unattacked.csv', index=False)
df_neg.to_csv('/content/drive/MyDrive/MLCYBER/Data/Fake-News/fake_news_attacked.csv', index=False)

{'verb': 'recognize'}
{'verb': 'come'}
{'verb': 'heed'}
{'verb': 'believe'}


### LIAR Dataset

In [None]:
# Read in train data
df = pd.read_csv("/content/drive/MyDrive/MLCYBER/Data/LIAR/train.tsv", 
                 delimiter='\t', 
                 header=None)
df.columns = ['ID','label','statement','subject','speaker',
              'job_title','state','party','barely_true_count',
              'false_count','half_true_count','mostly_true_count',
              'pants_on_fire_count','context']
df = df.dropna(subset=['statement']).reset_index(drop=True)
df = df[['ID','statement','label']]

# Label
liar_encode = {'pants-fire':0, 'false':0, 'barely-true':0, 'half-true':1, 'mostly-true':1, 'true':1}
df['label'] = df['label'].apply(lambda x: liar_encode[x])

# Save
torch.save(encode_dataframe(df['statement'], df['label']),
           '/content/drive/MyDrive/MLCYBER/Data/Encoded/liar/training/liar_train.pt')



In [None]:
# Read in test data
import csv
df = pd.read_csv("/content/drive/MyDrive/MLCYBER/Data/LIAR/test.tsv", 
                 delimiter='\t', 
                 header=None)
df.columns = ['ID','label','statement','subject','speaker',
              'job_title','state','party','barely_true_count',
              'false_count','half_true_count','mostly_true_count',
              'pants_on_fire_count','context']
df = df.dropna(subset=['statement']).reset_index(drop=True)
df = df[['ID','statement','label']]

# Label
liar_encode = {'pants-fire':0, 'false':0, 'barely-true':0, 'half-true':1, 'mostly-true':1, 'true':1}
df['label'] = df['label'].apply(lambda x: liar_encode[x])

# Clean and negate
df['statement'] = df['statement'].apply(lambda x: x.lower().replace('’',"'"))
df_neg = df.copy()
df_neg['statement'] = df_neg['statement'].apply(negate)

df_neg = df_neg.loc[~df_neg.statement.isnull()]
df_pos = df.loc[df_neg.index].reset_index(drop=True)
df_neg = df_neg.reset_index(drop=True)

# Relabel
df_neg['label'] = df_neg['label'].apply(lambda x: 0 if x==1 else 1)

# Save
with open("/content/drive/MyDrive/MLCYBER/Data/LIAR/test_neg_attacked.tsv", 'wt') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    for i in range(len(df_neg)):
        tsv_writer.writerow([df_neg['statement'][i], df_neg['label'][i]])

with open("/content/drive/MyDrive/MLCYBER/Data/LIAR/test_neg_unattacked.tsv", 'wt') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    for i in range(len(df_neg)):
        tsv_writer.writerow([df_pos['statement'][i], df_pos['label'][i]])

{'verb': 'cost'}
{'verb': 'want'}
{'verb': 'contain'}
{'verb': 'require'}
{'verb': 'address'}


In [None]:
# Read in validation data
import csv
df = pd.read_csv("/content/drive/MyDrive/MLCYBER/Data/LIAR/valid.tsv", 
                 delimiter='\t', 
                 header=None)
df.columns = ['ID','label','statement','subject','speaker',
              'job_title','state','party','barely_true_count',
              'false_count','half_true_count','mostly_true_count',
              'pants_on_fire_count','context']
df = df.dropna(subset=['statement']).reset_index(drop=True)
df = df[['ID','statement','label']]

# Label
liar_encode = {'pants-fire':0, 'false':0, 'barely-true':0, 'half-true':1, 'mostly-true':1, 'true':1}
df['label'] = df['label'].apply(lambda x: liar_encode[x])

# Clean and negate
df['statement'] = df['statement'].apply(lambda x: x.lower().replace('’',"'"))
df_neg = df.copy()
df_neg['statement'] = df_neg['statement'].apply(negate)

df_neg = df_neg.loc[~df_neg.statement.isnull()]
df_pos = df.loc[df_neg.index].reset_index(drop=True)
df_neg = df_neg.reset_index(drop=True)

# Relabel
df_neg['label'] = df_neg['label'].apply(lambda x: 0 if x==1 else 1)

#print(len(df_neg))
#print(df_neg['statement'][1])
# Save
with open("/content/drive/MyDrive/MLCYBER/Data/LIAR/valid_neg_attacked.tsv", 'wt') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    for i in range(len(df_neg)):
        tsv_writer.writerow([df_neg['statement'][i], df_neg['label'][i]])

with open("/content/drive/MyDrive/MLCYBER/Data/LIAR/valid_neg_unattacked.tsv", 'wt') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    for i in range(len(df_neg)):
        tsv_writer.writerow([df_pos['statement'][i], df_pos['label'][i]])

{'verb': 'want'}
{'verb': 'demand'}
{'verb': 'discriminate'}
{'verb': 'affect'}
{'verb': 'show'}
