Train & Test: https://www.kaggle.com/alankuan/nlp-final-project-train-test/edit

In [68]:
# !pip install sentence-transformers

In [69]:
import os
import re
import math
import json
import numpy as np
import pandas as pd
import torch
import random

from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer, util

In [70]:
# PROJECT_BASE = '/kaggle/input/2022-inlp-final'
PROJECT_BASE = './'
# update if needed
BALANCED_TRAIN_PATH = './nlp-final-train-balanced/balanced_train1.json'
DATASET = 'train'  # train, valid, test

In [71]:
random.seed(6666)
has_label = DATASET != 'test'

In [72]:
# os.mkdir('attention_mask')
# os.mkdir('input_ids')

# Import Data

In [73]:
with open(f'{PROJECT_BASE}/{DATASET}.json') as f:
    data = json.load(f)

# if DATASET == 'train':
#     with open(BALANCED_TRAIN_PATH) as f:
#         filtered_train_data = json.load(f)
#     balanced_claim_ids = set([item['claim_id'] for item in filtered_train_data])
#     data = [
#         sample
#         for sample in data
#         if sample['metadata']['id'] in balanced_claim_ids
#     ]

In [74]:
data[0]

{'metadata': {'claimant': 'Faisal Al Qasimi, Carolina Monteiro',
  'claim': "OpIndia claimed Greta Thunberg's real name is Ghazala bhat",
  'id': 1,
  'premise_articles': {'https://web.archive.org/web/20210206135409/https://twitter.com/omar_quraishi/status/1357926247414845441': '1_1.json',
   'https://web.archive.org/web/20210206083718/https://twitter.com/runcaralisarun/status/1357714907249086465': '1_2.json',
   'https://www.facebook.com/search/photos/?q=opindia%20greta%20ghazala': '1_3.json',
   'https://twitter.com/UnSubtleDesi/status/1357723484491718659': '1_4.json'}},
 'label': {'rating': 0, 'original_rating': 'false', 'id': 1}}

In [75]:
if has_label:
    ratings = {}

    for item in data:
        rating = item['label']['rating']
        if rating in ratings:
            ratings[rating] += 1
        else:
            ratings[rating] = 1

    print(ratings)

{0: 7000, 1: 7000, 2: 2894}


# Data Preprocessing

In [76]:
sent_trans = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

In [77]:
def get_relevent_context(claim, article):
    claim_embed = sent_trans.encode(claim, convert_to_tensor=True)
    article_embed = sent_trans.encode(article, convert_to_tensor=True)
    sim = util.cos_sim(claim_embed, article_embed).squeeze(dim=0)
    
    return article[(sim > 0).cpu().numpy()]

In [78]:
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

In [79]:
def remove_stopwords(strings):
    new_strings = []
    for string in strings:
        clean_words = [word for word in string.split(' ') if not word in stop_words]
        new_strings.append(' '.join(clean_words))

    return new_strings
            

In [80]:
vectorizer = TfidfVectorizer()

def get_relevent_tfidf(claim, article):
    claim = [claim]
    clean_claim = remove_stopwords(claim)
    clean_article = remove_stopwords(article)

    if clean_article == []:
        return np.array([])
    tfidf = vectorizer.fit_transform(clean_claim + clean_article)
    encode_claim = vectorizer.transform(clean_claim)

    cos_sim = linear_kernel(encode_claim, tfidf).squeeze()[1:]
    indices = cos_sim.argsort()
    # print(indices)
    # print(type(indices))
    # print(cos_sim)
    # print(type(cos_sim))
    cos_sim = np.sort(cos_sim, axis=None)

    article = np.array(article)
    sorted_article = article[indices]
    relevent_article = np.flip(sorted_article[ cos_sim >= 0])
    print(f"claim = {claim}")
    print(relevent_article)
    # print(f"len of relevent articles = {len(relevent_article)}")
    return relevent_article
claim = 'Trump is stupid shit angry bitch'
article = [
    'Trump is stupid shit angry',
    'trump is happy bitch',
    'obama does not eat shit angry',
    'obama does not eat shit angrily',
    'totally different sentences'
]
relevent_article = get_relevent_tfidf(claim, article)
relevent_article

claim = ['Trump is stupid shit angry bitch']
['Trump is stupid shit angry' 'trump is happy bitch'
 'obama does not eat shit angry' 'obama does not eat shit angrily'
 'totally different sentences']


array(['Trump is stupid shit angry', 'trump is happy bitch',
       'obama does not eat shit angry', 'obama does not eat shit angrily',
       'totally different sentences'], dtype='<U31')

## Filtering Rules
- remove sentences ...
    - whose length are less than 2
    - which do not start with any digit, alphabet, or `@`
    - which start with `Link:`
- remove common urls
- replace `/` and `_` with ` `
- remove characters other than alphanumeric ones, `.`, `-`, `'`, and ` `

In [81]:
def filter_sent(article):
    filtered_sent = []
    
    for sent in article:
        if len(sent) < 2 or not (sent[0].isalnum() or sent[0] == '@') or sent.startswith('Link:'):
            continue
        sent = re.sub(r'http\S+', '', sent)
        sent = re.sub(r'[/_]', ' ', sent)
        sent = re.sub(r"[^a-zA-Z0-9\.\-' ]", '', sent)
        filtered_sent.append(sent)
    
    return filtered_sent

In [82]:
def preprocess(data, batch_size, shuffle=False):
    sample_num = len(data)
    batch_num = math.ceil(sample_num / batch_size)
    
    if shuffle:
        random.shuffle(data)
    
    claim_batch = []
    label_batch = []
    
    for batch_idx in range(batch_num):
        contexts = []
        claims = []
        claim_ids = []
        labels = []
        
        beg = batch_idx * batch_size
        end = min(beg + batch_size, sample_num)
        for sample in data[beg:end]:
            metadata = sample['metadata']
            claim = metadata['claim']
            claim_id = metadata['id']
            
            relevent_contexts, original_articles = [], []
            for file_name in metadata['premise_articles'].values():
                with open(f'./articles/{file_name}') as f:
                    original_article = json.load(f)
                original_articles.append(' '.join(original_article))
                
                filtered_article = filter_sent(original_article)
                filtered_article = pd.Series(filtered_article)
                if filtered_article.shape[0] > 0:
                    relevent_context = get_relevent_tfidf(claim, filtered_article)  
                    relevent_contexts.append(' '.join(relevent_context))
            
            if len(relevent_contexts) > 0:
                contexts.append('[SEP]'.join(relevent_contexts))
            else:
                contexts.append('[SEP]'.join(original_articles))
            claims.append(claim)
            claim_ids.append(claim_id)
            if has_label:
                labels.append(sample['label']['rating'])
        
        outputs = tokenizer(
            contexts,
            claims,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        # save tokenized data
        for key in ['input_ids', 'attention_mask']:
            torch.save(outputs[key], f'{key}/{batch_idx}.pt')
            
        claim_batch.append(claim_ids)
        label_batch.append(labels)
            
    # save batch information
    output = {
        'batch_num': batch_num,
        'claim_batch': claim_batch,
        'label_batch': label_batch
    }
    with open('manifest.json', 'w') as f:
        json.dump(output, f, indent=2)

In [83]:
is_train = DATASET == 'train'
preprocess(data, 4, shuffle=is_train)

claim = ['Dr. Manmohan Singh declared strongest PM in the world']
['Something went wrong but dont fret  lets give it another shot.'
 'Twitter Inc.'
 'Terms of Service Privacy Policy Cookie Policy Imprint Ads info  2021'
 'Help Center'
 'You can see a list of supported browsers in our Help Center.'
 'JavaScript or switch to a supported browser to continue using twitter.com.'
 'Weve detected that JavaScript is disabled in this browser. Please enable'
 'JavaScript is not available.' 'Twitter']
claim = ['Dr. Manmohan Singh declared strongest PM in the world']
['Privacy Policy' 'Domain-Inhaber oder Sedo in keiner Beziehung.'
 'bereitgestellten Werbeanzeigen kommen von dritter Seite und stehen mit'
 'Domain Parking Programm nutzt. Die auf dieser Seite automatisiert'
 'Diese Webseite wurde vom Domain Inhaber dynamisch generiert der das Sedo'
 'Click here to buy this domain.' 'Domain erwerben' 'headline24hindi.com']
claim = ['Dr. Manmohan Singh declared strongest PM in the world']
['Facebook  

  filtered_article = pd.Series(filtered_article)


claim = ['Antifa provoked the shooting of Ashli Babbitt at the Capitol.']
['the Capitol.'
 'The shooting in Boulder Colo. was a false flag like most of the'
 'Capitol Jan. 6 2021' 'before he was seen at the Capitol.'
 'Capitol posts in pro-Trump Facebook groups claimed antifa activists were'
 'claiming that antifa activists stormed the Capitol has provided legitimate'
 'Trump supporters at the Capitol Jan. 7 2021'
 'suggest antifa is to blame for pro-Trump mob rioting into Capitol Jan.'
 'Theres no proof antifa stormed the Capitol. The rumor spread quickly'
 'Capitol riots Jan. 7 2021' 'leader of antifa.'
 'Violence at The U.S. Capitol Jan. 7 2021'
 'Capitol on Jan. 6 2021 in Washington. AP'
 'the U.S. Capitol on Jan. 6 2021 in Washington. AP'
 'Theres no proof antifa stormed the Capitol. The rumor spread quickly anyway'
 "are actually 'antifa' Jan. 6 2021"
 'as more reporting about those who entered the Capitol.'
 'Trump and QAnon not antifa Jan. 7 2021'
 'joins storming of U.S. Capit

KeyboardInterrupt: 