In [1]:
import utils as utils
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from keras.preprocessing.text import Tokenizer
import os
from datasets import Dataset
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback
import numpy as np
import evaluate
from transformers import create_optimizer, AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshuasegal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# dataset_path = 'dataset'
# papers_path = 'papers'
# presentations_path = 'presentations'
#
# utils.move_xml_files(dataset_path, papers_path, presentations_path)

In [3]:
sample_xml_pres_path = "dataset/sample_data/presentations/slide.clean_tika.xml"
sample_xml_pres = utils.read_file(sample_xml_pres_path)
sample_xml_pres

sample_xml_paper_path = "dataset/sample_data/papers/Paper_BRM.tei.xml"
sample_xml_paper = utils.read_file(sample_xml_paper_path)
sample_xml_paper

'<?xml version="1.0" encoding="UTF-8"?>\n<TEI xmlns="http://www.tei-c.org/ns/1.0" \nxmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" \nxsi:schemaLocation="http://www.tei-c.org/ns/1.0 /Users/atharsefid/Desktop/grobid-0.5.3/grobid-home/schemas/xsd/Grobid.xsd"\n xmlns:xlink="http://www.w3.org/1999/xlink">\n\t<teiHeader xml:lang="en">\n\t\t<encodingDesc>\n\t\t\t<appInfo>\n\t\t\t\t<application version="0.5.3" ident="GROBID" when="2019-03-26T16:26+0000">\n\t\t\t\t\t<ref target="https://github.com/kermitt2/grobid">GROBID - A machine learning software for extracting information from scholarly documents</ref>\n\t\t\t\t</application>\n\t\t\t</appInfo>\n\t\t</encodingDesc>\n\t\t<fileDesc>\n\t\t\t<titleStmt>\n\t\t\t\t<title level="a" type="main">Best-Response Mechanisms</title>\n\t\t\t</titleStmt>\n\t\t\t<publicationStmt>\n\t\t\t\t<publisher/>\n\t\t\t\t<availability status="unknown"><licence/></availability>\n\t\t\t</publicationStmt>\n\t\t\t<sourceDesc>\n\t\t\t\t<biblStruct>\n\t\t\t\t\t<analy

In [4]:
sample_pres_text = utils.parse_presentation_xml(sample_xml_pres)
sample_pres_text[0]

'Noam Nisan, Michael Schapira, Gregory Valiant, and Aviv Zohar'

In [5]:
sample_paper_text = utils.parse_paper_xml(sample_xml_paper)
sample_paper_text[0]

'The basic object of study in game theory and in economics is the equilibrium: a "stable" state from which none of the players wish to deviate. Equilibrium is a static concept that often abstracts away the question of how it is reached. Once we start looking at dynamics, or at algorithms for finding equilibria, we cannot escape questions of the form "How is an equilibrium reached?". While there can be different formalizations of this question, in most cases, a truly satisfactory answer would have each player performing only simple "locally rational" actions and yet, mysteriously, the system would reach a global equilibrium. The simplest example of such phenomena is repeated best-response dynamics: each player selects the best (locally optimal) response to what others are currently doing, and this process goes on "for a while" until it "converges" to what must be a (pure Nash) equilibrium. Convergence of repeated bestresponse is, unfortunately, not guaranteed in general, and is the subj

In [6]:
sample_paper_title = utils.parse_title(sample_xml_paper)
print(sample_paper_title)

Best-Response Mechanisms


In [7]:
sample_pres_preprocessed = utils.preprocess_text(sample_pres_text)

In [8]:
sample_paper_preprocessed = utils.preprocess_text(sample_paper_text)

In [9]:
print(sample_pres_preprocessed[0])
print(sample_paper_preprocessed[0])

['noam', 'nisan', 'michael', 'schapira', 'gregori', 'valiant', 'aviv', 'zohar']
['basic', 'object', 'studi', 'game', 'theori', 'econom', 'equilibrium', 'stabl', 'state', 'none', 'player', 'wish', 'deviat', 'equilibrium', 'static', 'concept', 'often', 'abstract', 'away', 'question', 'reach', 'start', 'look', 'dynam', 'algorithm', 'find', 'equilibria', 'escap', 'question', 'form', 'equilibrium', 'reach', 'differ', 'formal', 'question', 'case', 'truli', 'satisfactori', 'answer', 'would', 'player', 'perform', 'simpl', 'local', 'ration', 'action', 'yet', 'mysteri', 'system', 'would', 'reach', 'global', 'equilibrium', 'simplest', 'exampl', 'phenomena', 'repeat', 'bestrespons', 'dynam', 'player', 'select', 'best', 'local', 'optim', 'respons', 'other', 'current', 'process', 'goe', 'converg', 'must', 'pure', 'nash', 'equilibrium', 'converg', 'repeat', 'bestrespons', 'unfortun', 'guarante', 'gener', 'subject', 'much', 'research', 'converg', 'sophist', 'locallyr', 'dynam', 'eg', 'fictiti', 'play'

In [10]:
presentation_word_model = Word2Vec(sentences = sample_pres_preprocessed, vector_size = 50, window = 5, min_count = 1, workers = 3, sg = 1)
paper_word_model = Word2Vec(sentences = sample_paper_preprocessed, vector_size = 50, window = 5, min_count = 1, workers = 3, sg = 1)

In [11]:
print(presentation_word_model)
print(paper_word_model)

Word2Vec<vocab=187, vector_size=50, alpha=0.025>
Word2Vec<vocab=829, vector_size=50, alpha=0.025>


In [12]:
EMBEDDING_PRES_MODEL_FILE = "pres_word_model.txt"
EMBEDDING_PAPER_MODEL_FILE = "paper_word_model.txt"

presentation_word_model.wv.save_word2vec_format(EMBEDDING_PRES_MODEL_FILE, binary=False)
paper_word_model.wv.save_word2vec_format(EMBEDDING_PAPER_MODEL_FILE, binary=False)

In [13]:
presentation_folder = "dataset/presentations"
paper_folder = "dataset/papers"

papers_data = []
presentations_data = []
id2label = {}
label2id = {}
unknowns = 0

# Loop through presentation XML files
for presentation_file in os.listdir(presentation_folder):
    file_path = os.path.join(presentation_folder, presentation_file)
    file_content = utils.read_file(file_path)
    if file_content:
        # Parse presentation XML
        presentation_data = utils.parse_presentation_xml(file_content)
        # Preprocess presentation data
        # preprocessed_presentation_data = utils.preprocess_text(presentation_data)
        # presentations_data.append(preprocessed_presentation_data)
        presentations_data.append(presentation_data)

# Loop through paper XML files
for idx, paper_file in enumerate(os.listdir(paper_folder)):
    file_path = os.path.join(paper_folder, paper_file)
    file_content = utils.read_file(file_path)
    if file_content:
        # Parse paper XML
        paper_data = utils.parse_paper_xml(file_content)
        # Preprocess paper data
        # preprocessed_paper_data = utils.preprocess_text(paper_data)
        title = utils.parse_title(file_content)
        if title is not None:
            # Check if title is already in label2id
            if title not in label2id:
                # If title is not in label2id, add it directly
                id2label[idx] = title
                label2id[title] = idx
            else:
                # If title is already in label2id, generate a unique title
                unique_title = f"{title}_{idx}"
                id2label[idx] = unique_title
                label2id[unique_title] = idx
            # Append paper data
        else:
            unknowns += 1  # Increment unknowns counter
            unique_title = f"unknown_{idx}"
            id2label[idx] = unique_title
            label2id[unique_title] = idx
        # papers_data.append(preprocessed_paper_data)
        papers_data.append(paper_data)
data = {
    "papers": papers_data,
    "presentations": presentations_data
}
# presentation_to_paper = utils.create_presentation_to_paper_mapping(presentation_folder, paper_folder)
#
# presentations_data = utils.process_presentation_folder(presentation_folder)
# papers_data = utils.process_papers_folder(paper_folder)



In [14]:
# print(presentations_data[0])
# print(papers_data[3])
# print(data)
for idx in range(5):
    paper = data["papers"][idx][0] if idx < len(data["papers"]) and data["papers"][idx] else "N/A"
    presentation = data["presentations"][idx][0] if idx < len(data["presentations"]) and data["presentations"][idx] else "N/A"
    print(f"Pair {idx+1}:")
    print("Paper:", paper)
    print("Presentation:", presentation)
    print()

Pair 1:
Paper: We analyze in this paper the performance of TCP (Transmission Control Protocol), the widely-used transport protocol of the Internet [15,30]. TCP is a reliable window-based flow control protocol where the window is increased until a packet loss is detected. Here, the source assumes that the network is congested and reduces its window. Once the lost packets are recovered, the source resumes its window increase. As a performance measure, we consider the throughput of a long time TCP connection having an infinite amount of data to send. A mathematical model is presented to find a closed form expression for the throughput of the connection. We assume that the reader is familiar with basic mechanisms of TCP such as Slow Start and Congestion Avoidance algorithms, the two methods for loss detection: Duplicate ACKs and TimeOut, the Delay ACK mechanism, the limitation on the congestion window due to receiver buffer, etc. (see [5] for a survey on TCP issues). * The work of this aut

In [15]:
labels = []
for key, value in label2id.items():
    labels.append(value)

In [16]:
# print(label2id)
# print(labels)
print(len(presentations_data))
print(len(papers_data))
print(len(label2id))
print(len(labels))
print(unknowns)

4984
4984
4984
4984
25


In [17]:
## distilBERT tokenizer to preprocess
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [18]:
## split into train and test sets with labels
X_train, X_test, y_train, y_test = train_test_split(data["presentations"], labels, test_size=0.2, random_state=42)

train_text = []
test_text = []
train_label = []
test_label = []

for text, label in zip(X_train, y_train):
    train_text.append(text)
    train_label.append(label)

for text, label in zip(X_test, y_test):
    test_text.append(text)
    test_label.append(label)

train_dict = {
    "label": train_label,
    "text": train_text
}

test_dict = {
    "label": test_label,
    "text": test_text
}

train_data = Dataset.from_dict(train_dict)
test_data = Dataset.from_dict(test_dict)

In [19]:
## preprocessing function to apply tokenizer over whole dataset
def preprocess_function(data):
    return tokenizer(data["text"], truncation=True)

In [20]:
## batch to process multiple at once for faster compute
tokenized_train_data = train_data.map(preprocess_function, batched=True)
tokenized_test_data = test_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/3987 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [None]:
## padding dynamically
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [None]:
accuracy = evaluate.load("accuracy")

In [None]:
## metrics function that passes preds and labels to compute metrics
def compute_metrics(eval_preds):
    predictions, labels = eval_preds
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [None]:
batch_size = 16
num_epochs = 5
batches_per_epoch = len(train_data) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
# try 3e-5
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [None]:
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=len(labels), id2label=id2label, label2id=label2id)


In [None]:
tf_train_set = model.prepare_tf_dataset(
    train_data,
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_test_set = model.prepare_tf_dataset(
    test_data,
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [None]:
model.compile(optimizer=optimizer) #Transformer has default task-relevant loss function


In [None]:
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)

In [None]:
push_to_hub_callback = PushToHubCallback(
    output_dir="CS4120final",
    tokenizer=tokenizer,
)



In [None]:
callbacks = [metric_callback, push_to_hub_callback]

In [None]:
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks)

In [None]:
# ## check if works
#
# word_tokenizer = Tokenizer()
# word_tokenizer.fit_on_texts(presentations_data)
# encoded = word_tokenizer.texts_to_sequences(presentations_data)
#
# char_tokenizer = Tokenizer()
# char_tokenizer.fit_on_texts(papers_data)
# encoded = char_tokenizer.texts_to_sequences(papers_data)

In [None]:
# ## correct implementation for LR ?
#
# word_map, index_map = utils.read_embeddings("spooky_embedding_word.txt", word_tokenizer)
# char_map, char_index_map = utils.read_embeddings("spooky_embedding_char.txt", char_tokenizer)

In [None]:
# tfidf_vectorizer = TfidfVectorizer()
# X = tfidf_vectorizer.fit_transform(presentations_data)
#
# y = list(range(len(presentations_data)))
#
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
# logreg_model = LogisticRegression(max_iter=1000)
# logreg_model.fit(X_train, y_train)
#
# y_pred = logreg_model.predict(X_test)
#
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)

In [None]:
# # print(y_pred[0])
# # print(y_test[0])
# # print(presentations_data)
# print(X_train)
# print(X_test)
# # print(len(y))