In [1]:
!pip install datasets
!pip install evaluate
!pip install tokenizers
!pip install transformers
!pip install bs4
!pip install lxml



In [2]:
import utils as utils
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from keras.preprocessing.text import Tokenizer
import os
from datasets import Dataset
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback
import numpy as np
import evaluate
from transformers import create_optimizer, AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding
import nltk
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshuasegal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joshuasegal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# dataset_path = 'dataset'
# papers_path = 'papers'
# presentations_path = 'presentations'
#
# utils.move_xml_files(dataset_path, papers_path, presentations_path)

In [4]:
sample_xml_pres_path = "sample_data/presentations/slide.clean_tika.xml"
sample_xml_pres = utils.read_file(sample_xml_pres_path)
sample_xml_pres

sample_xml_paper_path = "sample_data/papers/Paper_BRM.tei.xml"
sample_xml_paper = utils.read_file(sample_xml_paper_path)
sample_xml_paper

'<?xml version="1.0" encoding="UTF-8"?>\n<TEI xmlns="http://www.tei-c.org/ns/1.0" \nxmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" \nxsi:schemaLocation="http://www.tei-c.org/ns/1.0 /Users/atharsefid/Desktop/grobid-0.5.3/grobid-home/schemas/xsd/Grobid.xsd"\n xmlns:xlink="http://www.w3.org/1999/xlink">\n\t<teiHeader xml:lang="en">\n\t\t<encodingDesc>\n\t\t\t<appInfo>\n\t\t\t\t<application version="0.5.3" ident="GROBID" when="2019-03-26T16:26+0000">\n\t\t\t\t\t<ref target="https://github.com/kermitt2/grobid">GROBID - A machine learning software for extracting information from scholarly documents</ref>\n\t\t\t\t</application>\n\t\t\t</appInfo>\n\t\t</encodingDesc>\n\t\t<fileDesc>\n\t\t\t<titleStmt>\n\t\t\t\t<title level="a" type="main">Best-Response Mechanisms</title>\n\t\t\t</titleStmt>\n\t\t\t<publicationStmt>\n\t\t\t\t<publisher/>\n\t\t\t\t<availability status="unknown"><licence/></availability>\n\t\t\t</publicationStmt>\n\t\t\t<sourceDesc>\n\t\t\t\t<biblStruct>\n\t\t\t\t\t<analy

In [5]:
sample_pres_text = utils.parse_presentation_xml(sample_xml_pres)
sample_pres_text[0]

'Noam Nisan, Michael Schapira, Gregory Valiant, and Aviv Zohar'

In [6]:
sample_paper_text = utils.parse_paper_xml(sample_xml_paper)
sample_paper_text[0]

'The basic object of study in game theory and in economics is the equilibrium: a "stable" state from which none of the players wish to deviate. Equilibrium is a static concept that often abstracts away the question of how it is reached. Once we start looking at dynamics, or at algorithms for finding equilibria, we cannot escape questions of the form "How is an equilibrium reached?". While there can be different formalizations of this question, in most cases, a truly satisfactory answer would have each player performing only simple "locally rational" actions and yet, mysteriously, the system would reach a global equilibrium. The simplest example of such phenomena is repeated best-response dynamics: each player selects the best (locally optimal) response to what others are currently doing, and this process goes on "for a while" until it "converges" to what must be a (pure Nash) equilibrium. Convergence of repeated bestresponse is, unfortunately, not guaranteed in general, and is the subj

In [7]:
sample_paper_title = utils.parse_title(sample_xml_paper)
print(sample_paper_title)

Best-Response Mechanisms


In [8]:
sample_pres_preprocessed = utils.preprocess_text(sample_pres_text)

In [9]:
sample_paper_preprocessed = utils.preprocess_text(sample_paper_text)

In [10]:
print(sample_pres_preprocessed[0])
print(sample_paper_preprocessed[0])

['noam', 'nisan', 'michael', 'schapira', 'gregori', 'valiant', 'aviv', 'zohar']
['basic', 'object', 'studi', 'game', 'theori', 'econom', 'equilibrium', 'stabl', 'state', 'none', 'player', 'wish', 'deviat', 'equilibrium', 'static', 'concept', 'often', 'abstract', 'away', 'question', 'reach', 'start', 'look', 'dynam', 'algorithm', 'find', 'equilibria', 'escap', 'question', 'form', 'equilibrium', 'reach', 'differ', 'formal', 'question', 'case', 'truli', 'satisfactori', 'answer', 'would', 'player', 'perform', 'simpl', 'local', 'ration', 'action', 'yet', 'mysteri', 'system', 'would', 'reach', 'global', 'equilibrium', 'simplest', 'exampl', 'phenomena', 'repeat', 'bestrespons', 'dynam', 'player', 'select', 'best', 'local', 'optim', 'respons', 'other', 'current', 'process', 'goe', 'converg', 'must', 'pure', 'nash', 'equilibrium', 'converg', 'repeat', 'bestrespons', 'unfortun', 'guarante', 'gener', 'subject', 'much', 'research', 'converg', 'sophist', 'locallyr', 'dynam', 'eg', 'fictiti', 'play'

In [11]:
presentation_word_model = Word2Vec(sentences = sample_pres_preprocessed, vector_size = 50, window = 5, min_count = 1, workers = 3, sg = 1)
paper_word_model = Word2Vec(sentences = sample_paper_preprocessed, vector_size = 50, window = 5, min_count = 1, workers = 3, sg = 1)

In [12]:
print(presentation_word_model)
print(paper_word_model)

Word2Vec<vocab=187, vector_size=50, alpha=0.025>
Word2Vec<vocab=829, vector_size=50, alpha=0.025>


In [13]:
EMBEDDING_PRES_MODEL_FILE = "pres_word_model.txt"
EMBEDDING_PAPER_MODEL_FILE = "paper_word_model.txt"

presentation_word_model.wv.save_word2vec_format(EMBEDDING_PRES_MODEL_FILE, binary=False)
paper_word_model.wv.save_word2vec_format(EMBEDDING_PAPER_MODEL_FILE, binary=False)

In [14]:
presentation_folder = "sample_data/presentations"   #Original: "dataset/presentations"
paper_folder = "sample_data/papers"                 #Original: "dataset/papers"

papers_data = []
presentations_data = []
id2label = {}
label2id = {}
unknowns = 0

# Loop through presentation XML files
for presentation_file in os.listdir(presentation_folder):
    file_path = os.path.join(presentation_folder, presentation_file)
    if os.path.isfile(file_path):
      file_content = utils.read_file(file_path)
      if file_content:
          # Parse presentation XML
          presentation_data = utils.parse_presentation_xml(file_content)
          # Preprocess presentation data
          # preprocessed_presentation_data = utils.preprocess_text(presentation_data)
          # presentations_data.append(preprocessed_presentation_data)
          presentations_data.append(presentation_data)

# Loop through paper XML files
for idx, paper_file in enumerate(os.listdir(paper_folder)):
    file_path = os.path.join(paper_folder, paper_file)
    file_content = utils.read_file(file_path)
    if file_content:
        # Parse paper XML
        paper_data = utils.parse_paper_xml(file_content)
        # Preprocess paper data
        # preprocessed_paper_data = utils.preprocess_text(paper_data)
        title = utils.parse_title(file_content)
        if title is not None:
            # Check if title is already in label2id
            if title not in label2id:
                # If title is not in label2id, add it directly
                id2label[idx] = title
                label2id[title] = idx
            else:
                # If title is already in label2id, generate a unique title
                unique_title = f"{title}_{idx}"
                id2label[idx] = unique_title
                label2id[unique_title] = idx
            # Append paper data
        else:
            unknowns += 1  # Increment unknowns counter
            unique_title = f"unknown_{idx}"
            id2label[idx] = unique_title
            label2id[unique_title] = idx
        # papers_data.append(preprocessed_paper_data)
        papers_data.append(paper_data)
data = {
    "papers": papers_data,
    "presentations": presentations_data
}
# presentation_to_paper = utils.create_presentation_to_paper_mapping(presentation_folder, paper_folder)
#
# presentations_data = utils.process_presentation_folder(presentation_folder)
# papers_data = utils.process_papers_folder(paper_folder)



In [15]:
# print(presentations_data[0])
# print(papers_data[3])
# print(data)
for idx in range(5):
    paper = data["papers"][idx][0] if idx < len(data["papers"]) and data["papers"][idx] else "N/A"
    presentation = data["presentations"][idx][0] if idx < len(data["presentations"]) and data["presentations"][idx] else "N/A"
    print(f"Pair {idx+1}:")
    print("Paper:", paper)
    print("Presentation:", presentation)
    print()

Pair 1:
Paper: The basic object of study in game theory and in economics is the equilibrium: a "stable" state from which none of the players wish to deviate. Equilibrium is a static concept that often abstracts away the question of how it is reached. Once we start looking at dynamics, or at algorithms for finding equilibria, we cannot escape questions of the form "How is an equilibrium reached?". While there can be different formalizations of this question, in most cases, a truly satisfactory answer would have each player performing only simple "locally rational" actions and yet, mysteriously, the system would reach a global equilibrium. The simplest example of such phenomena is repeated best-response dynamics: each player selects the best (locally optimal) response to what others are currently doing, and this process goes on "for a while" until it "converges" to what must be a (pure Nash) equilibrium. Convergence of repeated bestresponse is, unfortunately, not guaranteed in general, a

In [16]:
labels = []
for key, value in label2id.items():
    labels.append(value)

In [17]:
# print(label2id)
# print(labels)
print(len(presentations_data))
print(len(papers_data))
print(len(label2id))
print(len(labels))
print(unknowns)

6
6
6
6
0


In [18]:
## distilBERT tokenizer to preprocess
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [19]:
## split into train and test sets with labels
#presentation_data = utils.stringify(data["presentations"])
#print(data["presentations"])
#print(presentation_data)
#print(len(presentation_data))
X_train, X_test, y_train, y_test = train_test_split(data["presentations"], labels, test_size=0.2, train_size=0.8, random_state=42)

train_text = []
test_text = []
train_label = []
test_label = []
max_length = 511

for text, label in zip(X_train, y_train):
    # Tokenize the text
    tokenized_text = tokenizer(' '.join(text), truncation=True, max_length=max_length, return_overflowing_tokens=False)

    # Convert the token IDs back to tokens
    truncated_text = tokenizer.decode(tokenized_text["input_ids"], skip_special_tokens=True)

    # Append the truncated text to the list
    train_text.append(truncated_text)
    train_label.append(label)

for text, label in zip(X_test, y_test):
    # Tokenize the text
    tokenized_text = tokenizer(' '.join(text), truncation=True, max_length=max_length, return_overflowing_tokens=False)

    # Convert the token IDs back to tokens
    truncated_text = tokenizer.decode(tokenized_text["input_ids"], skip_special_tokens=True)

    # Append the truncated text to the list
    test_text.append(truncated_text)
    test_label.append(label)
    
#print(train_text)

#train_text = utils.stringify(train_data["text"])
#print(train_text[0])

#test_text = utils.stringify(test_data["text"])
#print(test_text)
#print(len(test_text))
#print(len(test_label))

train_dict = {
    "label": train_label,
    "text": train_text
}

test_dict = {
    "label": test_label,
    "text": test_text
}

train_data = Dataset.from_dict(train_dict)
test_data = Dataset.from_dict(test_dict)

print(test_data[1])
print(train_data[1])


{'label': 1, 'text': 'dianed : time - aware named entity disambiguation for diachronic corpora prabal agarwal1, jannik strotgen1, 2, luciano del corro3, johannes hoffart3, ger hard weikum1 july 18, 2018 bush to stress domestic issues in speech. ( year 1989 ) george w. bush george h. w. bush bush to stress domestic issues in speech. ( year 1989 ) george w. bush george h. w. bush bush to stress domestic issues in speech. ( year 1989 ) george w. bush george h. w. bush bush to stress domestic issues in speech. ( year 1989 ) george w. bush george h. w. bush table of contents introduction problem description given : set of entity mentions m in a document. entities : entries in a knowledge base ( kb ). task : link each m, where m m, to its correct entry in kb, if available. predict as an ookbe, otherwise. named entity disambiguation in 1959, david pearson exhibited as part of the young contemporaries exhibition in london. ( en. wikipedia. org / wiki / dave pearson ( painter ) ) in 1981, with 

In [20]:
## preprocessing function to apply tokenizer over whole dataset
def preprocess_function(data):
    return tokenizer(data["text"],padding="max_length", truncation=True)

In [21]:
## batch to process multiple at once for faster compute
#print(train_data)
#print(test_data)

#print(len(train_data["text"][0]))
#print(len(test_data["text"][0]))
#preprocessed_data = [preprocess_function(item) for item in train_data["text"][0]]
tokenized_train_data = train_data.map(preprocess_function, batched=True)
print(tokenized_train_data)

tokenized_test_data = test_data.map(preprocess_function, batched=True)
print(tokenized_test_data)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dataset({
    features: ['label', 'text', 'input_ids', 'attention_mask'],
    num_rows: 4
})


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'text', 'input_ids', 'attention_mask'],
    num_rows: 2
})


In [22]:
## padding dynamically
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [23]:
accuracy = evaluate.load("accuracy")

In [24]:
## metrics function that passes preds and labels to compute metrics
def compute_metrics(eval_preds):
    predictions, labels = eval_preds
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [25]:
batch_size = 4
num_epochs = 5
batches_per_epoch = len(train_data) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
# try 3e-5
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)



In [26]:
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=len(labels), id2label=id2label, label2id=label2id)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [27]:
print(train_data)
#print(tokenized_train_data["label"])
#print(tokenized_train_data["text"])
#print(tokenized_train_data["input_ids"])
#print(tokenized_train_data["attention-mask"])
#print(train_data["attention-mask"])
tf_train_set = model.prepare_tf_dataset(
    tokenized_train_data,
    shuffle=True,
    batch_size=1,
    collate_fn=data_collator,
)

tf_test_set = model.prepare_tf_dataset(
    tokenized_test_data,
    shuffle=False,
    batch_size=1,
    collate_fn=data_collator,
)
#print(tf_train_set)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Dataset({
    features: ['label', 'text'],
    num_rows: 4
})


In [39]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [40]:
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)

In [41]:
!pip install huggingface_hub
import huggingface_hub
#huggingface_hub.login()

push_to_hub_callback = PushToHubCallback(
    output_dir="CS4120final",
    tokenizer=tokenizer,
)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/Users/joshuasegal/Coding/Jupyter/NLP/final/CS4120final is already a clone of https://huggingface.co/Joshua-Segal/CS4120final. Make sure you pull the latest changes with `repo.git_pull()`.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [42]:
callbacks = [metric_callback, push_to_hub_callback]

In [43]:
print(len(tokenized_train_data["text"][3]))

2491


In [44]:
print(tf_train_set)
# model.fit(tf_train_set)
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks)

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(1, 511), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(1, 511), dtype=tf.int64, name=None)}, TensorSpec(shape=(1,), dtype=tf.int64, name=None))>
Epoch 1/3

2024-04-09 21:02:43.682898: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int64 and shape [1,511]
	 [[{{node Placeholder/_1}}]]
2024-04-09 21:02:44.193775: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int64 and shape [1,511]
	 [[{{node Placeholder/_1}}]]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2024-04-09 21:02:52.656018: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int64 and shape [1,511]
	 [[{{node Placeholder/_1}}]]
2024-04-09 21:02:52.822227: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int64 and shape [1,511]
	 [[{{node Placeholder/_1}}]]


Epoch 3/3

2024-04-09 21:02:55.383137: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int64 and shape [1,511]
	 [[{{node Placeholder/_1}}]]
2024-04-09 21:02:55.552910: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int64 and shape [1,511]
	 [[{{node Placeholder/_1}}]]




<keras.callbacks.History at 0x29e509350>

In [34]:
# ## check if works
#
# word_tokenizer = Tokenizer()
# word_tokenizer.fit_on_texts(presentations_data)
# encoded = word_tokenizer.texts_to_sequences(presentations_data)
#
# char_tokenizer = Tokenizer()
# char_tokenizer.fit_on_texts(papers_data)
# encoded = char_tokenizer.texts_to_sequences(papers_data)

In [35]:
# ## correct implementation for LR ?
#
# word_map, index_map = utils.read_embeddings("spooky_embedding_word.txt", word_tokenizer)
# char_map, char_index_map = utils.read_embeddings("spooky_embedding_char.txt", char_tokenizer)

In [36]:
# tfidf_vectorizer = TfidfVectorizer()
# X = tfidf_vectorizer.fit_transform(presentations_data)
#
# y = list(range(len(presentations_data)))
#
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
# logreg_model = LogisticRegression(max_iter=1000)
# logreg_model.fit(X_train, y_train)
#
# y_pred = logreg_model.predict(X_test)
#
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)

In [37]:
# # print(y_pred[0])
# # print(y_test[0])
# # print(presentations_data)
# print(X_train)
# print(X_test)
# # print(len(y))