In [1]:
import ast
import os
import re

import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline

# Hub identifier of the base checkpoint that this notebook tokenizes for and fine-tunes
model_id = "meta-llama/Meta-Llama-3-8B"

In [2]:
# Location of the citation data CSV. The original local path stays the default so existing
# runs are unaffected; set the CCE_DATA_PATH environment variable to load it from elsewhere.
cce_data_path = os.environ.get("CCE_DATA_PATH", "C:/Users/joelt/Downloads/full_data.csv")
cce_df = pd.read_csv(cce_data_path)

In [3]:
# Inspect one complete record (positional row 21) to see every column of the raw data
cce_df.iloc[21]

id                                        087922a2-e3ce-415a-8149-d146175ee6de
citing_title                               USST's System for AutoSimTrans 2022
citing_pub_year                                                           2022
citing_authors                                               Jiahui Zhu;Jun Yu
cited_title                  ['unknown', 'Learning to translate in real-tim...
cited_pub_year                                                ['2018', '2017']
cited_authors                ['Mingbo Ma;Liang Huang;Hao Xiong;Renjie Zheng...
citation_type                                                            group
paragraph                    Simultaneous;translation;<ref type="group">(Gu...
target_reference_location                                                    2
context_location1            [1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3...
context_location2            [1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3...
iaa_macro                                           

In [4]:
# Show the raw ';'-delimited paragraph text stored for the row labelled 21
cce_df.loc[21, "paragraph"]

'Simultaneous;translation;<ref type="group">(Gu et al., 2017; Ma et al., 2018)</ref>;consists;in;generating;a;translation;before;the;source;speaker;finishes;speaking.;It;is;widely;used;in;many;real-time;scenarios;such;as;international;conferences,;business;negotiations;and;legal;proceedings.;The;challenge;of;Simultaneous;machine;translation;is;to;find;a;read-write;policy;that;balances;translation;quality;and;latency.;The;translation;quality;will;decline;if;the;machine;translation;system;reads;insufficient;source;information.;When;reading;wider;source;text,;latency;will;increase.'

In [5]:
# Show the stringified label list stored alongside that paragraph (row labelled 21)
cce_df.loc[21, "context_location1"]

'[1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0]'

In [6]:
def build_scope_example(paragraph, context_location):
    """Turn one raw row into a ``{"Paragraph", "Scope"}`` example.

    Args:
        paragraph: ';'-delimited words; ``<ref ...>...</ref>`` spans mark citations.
        context_location: String form of a list holding one label per word,
            e.g. ``'[1,1,2,0]'``.

    Returns:
        A dict with the space-joined words under ``"Paragraph"`` and the label list
        under ``"Scope"``, or ``None`` when the number of labels differs from the
        number of words.
    """
    # Collapse each citation span into a single '[TREF]' placeholder word
    clean_paragraph = re.sub(r'<ref.*?>.*?</ref>', '[TREF]', paragraph)

    # Words are separated by ';' in the raw column
    words = [word.strip() for word in clean_paragraph.split(';')]

    # literal_eval only parses Python literals, so text coming from the CSV cannot
    # execute arbitrary code the way eval() could
    scope = ast.literal_eval(context_location)

    if len(scope) != len(words):
        return None
    return {"Paragraph": ' '.join(words), "Scope": list(scope)}


results = []
mismatched_rows = []

for index, paragraph, context_location in zip(
    cce_df.index, cce_df["paragraph"], cce_df["context_location1"]
):
    example = build_scope_example(paragraph, context_location)
    if example is None:
        # Keep unusable rows out of the training data instead of inserting a
        # placeholder sentence with a non-numeric label
        mismatched_rows.append(index)
    else:
        results.append(example)

if mismatched_rows:
    print(
        f"Skipped {len(mismatched_rows)} row(s) whose label count does not match "
        f"the word count: {mismatched_rows}"
    )

# Pin the columns so the frame has the expected schema even if every row was skipped
df = pd.DataFrame(results, columns=["Paragraph", "Scope"])
df.head()

Unnamed: 0,Paragraph,Scope
0,Neural Machine Translation (NMT) has opened se...,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
1,"As shown in Table 1, the size of the 'in-domai...","[0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
2,Automatic extraction of events has gained siza...,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
3,The subject NP 'Bill' is coindexed with the tr...,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
4,Self-training [TREF] ) uses a source-to-target...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [7]:
# Load the tokenizer for the checkpoint named by `model_id` (defined in the first cell)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# If the tokenizer has no padding token, reuse the end-of-sequence token so that
# padding="max_length" in tokenize_function has a token to pad with
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Batch tokenizer callback used with Dataset.map
def tokenize_function(examples):
    """Tokenize the ``'Paragraph'`` entries of a batch to a fixed length of 786.

    Sequences are truncated to, and padded up to, ``max_length``.

    NOTE(review): 786 may be a typo for 768 -- confirm the intended length.
    """
    paragraphs = examples['Paragraph']
    return tokenizer(paragraphs, max_length=786, truncation=True, padding="max_length")

# Wrap the preprocessed DataFrame (columns 'Paragraph' and 'Scope') in a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Run tokenize_function over the dataset; batched=True passes it batches of rows
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/1055 [00:00<?, ? examples/s]

In [8]:
# Batch callback that exposes the scope annotations under the 'labels' key
def process_labels(examples):
    """Return the batch's ``'Scope'`` values under the key ``'labels'``.

    NOTE(review): each 'Scope' entry is a per-word list built during preprocessing,
    while the model in this notebook is loaded with AutoModelForSequenceClassification.
    Confirm that list-valued labels of varying length are what the Trainer expects,
    or consider a token-classification setup.
    """
    scope_batch = examples['Scope']
    return dict(labels=scope_batch)

# Add a 'labels' entry to every example by running process_labels over the tokenized dataset
labeled_dataset = tokenized_dataset.map(process_labels, batched=True)

Map:   0%|          | 0/1055 [00:00<?, ? examples/s]

In [None]:
# Load the checkpoint named by `model_id` with a 4-label sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=4)

# The Llama sequence-classification head uses config.pad_token_id to find the last
# non-padding token and rejects batches larger than one when it is unset; training in
# this notebook uses a batch size of 8, so mirror the tokenizer's padding token here.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Hold out 10% of the examples so the per-epoch evaluation is not computed on the same
# rows the model is trained on; the fixed seed keeps the split reproducible.
dataset_splits = labeled_dataset.train_test_split(test_size=0.1, seed=42)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["test"],
)

trainer.train()

In [None]:
# Evaluate the model on the eval_dataset that was given to the Trainer
eval_results = trainer.evaluate()

print(f"Evaluation Results: {eval_results}")

# Save the trained model to ./trained_llama_model
trainer.save_model("./trained_llama_model")

In [None]:
# Create a pipeline for text classification
# NOTE(review): this pipeline is not used by the loop below, and `return_all_scores` is
# reported as deprecated by recent transformers releases -- confirm it is still needed.
classification_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)

# Define your new input paragraph
paragraph = "Neural Machine Translation (NMT) has opened several research directions to exploit as many and diverse data as possible. Massive multilingual NMT models, for instance, take advantage of several language-pair datasets in a single system [TREF] . This offers several advantages, such as a simple training process and enhanced performance of the language-pairs with little data (although sometimes detrimental to the high-resource language-pairs). However, massive models of dozens of languages are not necessarily the best outcome, as it is demonstrated that smaller clusters still offer the same benefits [TREF] ."

# Split the paragraph into whitespace-separated words
words = paragraph.split()

# Switch off dropout and other training-only behaviour before predicting
model.eval()

word_predictions = []

# Classify each word on its own (no surrounding context is given to the model)
for word in words:
    # Tokenize the word and move the tensors to the device the model lives on;
    # after training the model may be on GPU while fresh tokenizer output is on CPU
    tokenized_word = tokenizer(word, return_tensors='pt', truncation=True, max_length=512).to(model.device)

    with torch.no_grad():
        outputs = model(**tokenized_word)

    # Convert logits to probabilities; .cpu() is required before .numpy() for GPU tensors
    scores = torch.softmax(outputs.logits, dim=1).cpu().numpy()

    # Keep the label with the highest probability and that probability
    max_score_idx = scores.argmax(axis=1)[0]
    max_score = scores[0, max_score_idx]
    label = f"{max_score_idx}"

    word_predictions.append({'word': word, 'scope': label, 'score': max_score})

for prediction in word_predictions:
    print(f"Word: {prediction['word']}, Predicted Scope: {prediction['scope']}, Score: {prediction['score']:.4f}")