In [1]:
!pip install datasets
!pip install evaluate
!pip install tokenizers
!pip install transformers
!pip install bs4
!pip install lxml



In [2]:
import utils as utils
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from keras.preprocessing.text import Tokenizer
from keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
import shutil
from datasets import Dataset
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback
import numpy as np
import evaluate
from transformers import create_optimizer, AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding

import nltk
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshuasegal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joshuasegal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# dataset_path = 'dataset'
# papers_path = 'papers'
# presentations_path = 'presentations'
#
# utils.move_xml_files(dataset_path, papers_path, presentations_path)


source_folder = "data/paper_slides_data/raw_data/dataset"
papers_folder = "data/paper_slides_data/raw_data/papers"
presentations_folder = "data/paper_slides_data/raw_data/presentations"

utils.organize_xml_folders(source_folder, papers_folder, presentations_folder)

In [4]:
sample_xml_pres_path = "data/paper_slides_data/sample_data/presentations/slide.clean_tika.xml"
sample_xml_pres = utils.read_file(sample_xml_pres_path)
sample_xml_pres

sample_xml_paper_path = "data/paper_slides_data/sample_data/papers/Paper_BRM.tei.xml"
sample_xml_paper = utils.read_file(sample_xml_paper_path)
sample_xml_paper

'<?xml version="1.0" encoding="UTF-8"?>\n<TEI xmlns="http://www.tei-c.org/ns/1.0" \nxmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" \nxsi:schemaLocation="http://www.tei-c.org/ns/1.0 /Users/atharsefid/Desktop/grobid-0.5.3/grobid-home/schemas/xsd/Grobid.xsd"\n xmlns:xlink="http://www.w3.org/1999/xlink">\n\t<teiHeader xml:lang="en">\n\t\t<encodingDesc>\n\t\t\t<appInfo>\n\t\t\t\t<application version="0.5.3" ident="GROBID" when="2019-03-26T16:26+0000">\n\t\t\t\t\t<ref target="https://github.com/kermitt2/grobid">GROBID - A machine learning software for extracting information from scholarly documents</ref>\n\t\t\t\t</application>\n\t\t\t</appInfo>\n\t\t</encodingDesc>\n\t\t<fileDesc>\n\t\t\t<titleStmt>\n\t\t\t\t<title level="a" type="main">Best-Response Mechanisms</title>\n\t\t\t</titleStmt>\n\t\t\t<publicationStmt>\n\t\t\t\t<publisher/>\n\t\t\t\t<availability status="unknown"><licence/></availability>\n\t\t\t</publicationStmt>\n\t\t\t<sourceDesc>\n\t\t\t\t<biblStruct>\n\t\t\t\t\t<analy

In [5]:
sample_pres_text = utils.parse_presentation_xml(sample_xml_pres)
print(len(sample_pres_text))
print(sample_pres_text[:3])

21
['Noam Nisan, Michael Schapira, Gregory Valiant, and Aviv Zohar', 'Motivation Equilibrium is the basic object of study in game theory. Question: How is an equilibrium reached? In a truly satisfactory answer each players rule of behavior is simple and locally rational repeated best-response repeated better-response regret-minimization', 'Motivation Repeated best-response is often employed in practice e.g., Internet routing We ask: When is such locallyrational behavior really rational?']


In [6]:
sample_paper_text = utils.parse_paper_xml(sample_xml_paper)
print(len(sample_paper_text))
print(sample_paper_text[:3])

385
['The basic object of study in game theory and in economics is the equilibrium: a "stable" state from which none of the players wish to deviate.', 'Equilibrium is a static concept that often abstracts away the question of how it is reached.', 'Once we start looking at dynamics, or at algorithms for finding equilibria, we cannot escape questions of the form "How is an equilibrium reached?".']


In [7]:
sample_paper_title = utils.parse_title(sample_xml_paper)
print(sample_paper_title)
sample_pres_title = utils.parse_title(sample_xml_pres)
print(sample_pres_title)

Best-Response Mechanisms
None


In [8]:
sample_pres_preprocessed = utils.preprocess_text(sample_pres_text)

In [9]:
sample_paper_preprocessed = utils.preprocess_text(sample_paper_text)

In [10]:
print(sample_pres_preprocessed[:3])
print(sample_paper_preprocessed[:3])

[['noam nisan  michael schapira  gregori valiant  aviv zohar'], ['motiv equilibrium basic object studi game theori ', 'question  equilibrium reach ', 'truli satisfactori answer player rule behavior simpl local ration repeat bestrespons repeat betterrespons regretminim'], ['motiv repeat bestrespons often employ practic eg  internet rout ask  locallyr behavior realli ration ']]
[['basic object studi game theori econom equilibrium   stabl  state none player wish deviat '], ['equilibrium static concept often abstract away question reach '], ['start look dynam  algorithm find equilibria  escap question form  equilibrium reach ', ' ']]


In [11]:
print(len(sample_paper_preprocessed))
print(len(sample_pres_preprocessed))

385
21


In [15]:
def process_folder(folder_path, parse_func, preprocess_func):
    data_list = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r') as file:
            file_content = file.read()
            parsed_data = parse_func(file_content)
            preprocessed_data = preprocess_func(parsed_data)
            data_list.append(preprocessed_data)
    return data_list

def combine_data(papers_folder, presentations_folder):
    papers_data = process_folder(papers_folder, utils.parse_paper_xml, utils.preprocess_text)
    presentations_data = process_folder(presentations_folder, utils.parse_presentation_xml, utils.preprocess_text)

    combined_data = {"papers": papers_data,
                     "presentations": presentations_data}
    return combined_data

# Example usage:
papers_folder = "data/paper_slides_data/raw_data/papers"
presentations_folder = "data/paper_slides_data/raw_data/presentations"

combined_data = combine_data(papers_folder, presentations_folder)

In [16]:
# Zip presentations and papers together
zipped_data = zip(combined_data["presentations"][:3], combined_data["papers"][:3])

# Print the zipped data
for i, (presentation, paper) in enumerate(zipped_data, start=1):
    print("preso sentences", len(presentation))
    print(f"Pair {i}: \n Presentation - {presentation[:10]}")
    print("")
    print("paper sentences", len(paper))
    print(f"Paper - {paper[:10]}")
    print("-----------------------------------------------------------------------------------------------------------------------------")

preso sentences 36
Pair 1: 
 Presentation - [['tolerang filesystem mistak envyf lakshmi n bairavasundaram netapp  inc swaminathan sundararaman andrea c arpacidusseau remzi h arpacidusseau univers wisconsin madison'], ['file system today world modern file system complex ten thousand line code  eg  xf 45k loc  storag stack also gevng deeper hypervisor  network  logic volum manag need handl gamut failur memori allocaon  disk fault  bit flip  system crash preserv integr metadata user data'], ['file system bug bug report linux 26 seri bugzilla ext3  64  jf  17  reiserf  38 fs corrupon caus perman data loss fs bug broadli classifi two categori failstop  system immedi crash soluon  nook  swi 04   curio  david08  failsil  accident corrupt ondisk state mani bug uncov  prabhakaran05  gunawi08  yang04  yang06b '], ['bug inevit file system challeng  cope '], ['base nversion program  avizienis77  nf server  rodrigues01   databas  vandiver07   secur  cox06  nversion file system envyf  simpl solwar l

In [17]:
def find_most_similar_sentence(query_sentence, sentences):
    # Combine query sentence with the list of sentences
    all_sentences = [query_sentence] + sentences

    # Initialize TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Compute TF-IDF vectors for all sentences
    tfidf_matrix = vectorizer.fit_transform(all_sentences)

    # Calculate cosine similarity between query sentence and all sentences
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

    # Find the index of the most similar sentence
    most_similar_index = similarity_scores.argmax()

    # Return the most similar sentence and its similarity score
    most_similar_sentence = sentences[most_similar_index]
    similarity_score = similarity_scores[most_similar_index]

    return most_similar_sentence, similarity_score

presentation_paper_pairs = []
for presentation, paper in zip(combined_data["presentations"], combined_data["papers"]):
    presentation_flat = [sentence for sublist in presentation for sentence in sublist]
    paper_flat = [sentence for sublist in paper for sentence in sublist]
    presentation_sentence_pairs = []
    for sentence in presentation_flat:
        most_similar_sentence, similarity_score = find_most_similar_sentence(sentence, paper_flat)
        presentation_sentence_pairs.append([sentence, most_similar_sentence, similarity_score])
        sorted_presentation_sentence_pairs = sorted(presentation_sentence_pairs, key=lambda x: x[2], reverse=True)
    presentation_paper_pairs.append(sorted_presentation_sentence_pairs)


In [18]:
print(len(presentation_paper_pairs))
print(presentation_paper_pairs[:3])

1192
[[['thank ', '3 thank adam paul suggest featur class ', 0.4072465115358739], ['futur work debug tool develop run older newer version file system compar result older version file system repair simpl repair  copi data file system complex repair  recreat en file system tree micro repair ', 'compar two system use treetotre grammar ', 0.19439958985854655], ['hard work alreadi done us 30 differ disk base file system linux 26 file system use ', 'compar two system use treetotre grammar ', 0.1722803126553838], ['subsist  singl instanc store variant singl instanc store selecv merg data block block address si export virtual disk fse manag map  free space info ', ' x n singl substitut site x 1      x n ', 0.16731210441726532], ['corrupon data singl fs due bug  bit flip  storag stack corrupt data block merg n1 data block merg corrupt data block fix next read corrupon data block insid disk singl copi data differ code path differ ondisk structur envyf layer fs 2 fs n applicaon vf layer vdisk 1 v

In [19]:
average_similarity = []
for sentence_pairs in presentation_paper_pairs:
    similarity_score_sum = sum(pair[2] for pair in sentence_pairs)
    similarity_score_average = similarity_score_sum / len(sentence_pairs)
    average_similarity.append(similarity_score_average)

print(average_similarity[:5])

print(sum(average for average in average_similarity) / len(average_similarity))


[0.10473271582810344, 0.1626123030226938, 0.08837021936706536, 0.08777198569090229, 0.15362456899990776]
0.12297329954399107


In [20]:
presentation_sentences_list = []
paper_sentences_list = []

for pair in presentation_paper_pairs:
    presentation_sentences = [sentences[0] for sentences in pair]
    paper_sentences = [sublist[1] for sublist in pair]
    presentation_sentences_list.append(presentation_sentences)
    paper_sentences_list.append(paper_sentences)

# Print the separated lists for each pair
for i in range(3):
    print("Presentation sentences for pair", i+1, ":", presentation_sentences_list[i])
    print()
    print("Paper sentences for pair", i+1, ":", paper_sentences_list[i])
    print()

Presentation sentences for pair 1 : ['thank ', 'futur work debug tool develop run older newer version file system compar result older version file system repair simpl repair  copi data file system complex repair  recreat en file system tree micro repair ', 'hard work alreadi done us 30 differ disk base file system linux 26 file system use ', 'subsist  singl instanc store variant singl instanc store selecv merg data block block address si export virtual disk fse manag map  free space info ', 'corrupon data singl fs due bug  bit flip  storag stack corrupt data block merg n1 data block merg corrupt data block fix next read corrupon data block insid disk singl copi data differ code path differ ondisk structur envyf layer fs 2 fs n applicaon vf layer vdisk 1 vdisk 2 vdisk n read cach chash layer free space manag su b si st', 'summari result robust tradion file system vulner corrupon envyfs3 toler almost mistak one fs perform desktop workload  envyfs3 compar perform io intens workload  regul

In [21]:
print(len(presentation_sentences_list[5]))
print(len(presentation_sentences_list[5]))

18
18


In [23]:
model = load_model("best_model.h5")

2024-04-11 20:38:37.390660: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-04-11 20:38:37.391399: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-04-11 20:38:37.392197: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [30]:
print(presentation_sentences_list[0])
print(paper_sentences_list[0][0])

['thank ', 'futur work debug tool develop run older newer version file system compar result older version file system repair simpl repair  copi data file system complex repair  recreat en file system tree micro repair ', 'hard work alreadi done us 30 differ disk base file system linux 26 file system use ', 'subsist  singl instanc store variant singl instanc store selecv merg data block block address si export virtual disk fse manag map  free space info ', 'corrupon data singl fs due bug  bit flip  storag stack corrupt data block merg n1 data block merg corrupt data block fix next read corrupon data block insid disk singl copi data differ code path differ ondisk structur envyf layer fs 2 fs n applicaon vf layer vdisk 1 vdisk 2 vdisk n read cach chash layer free space manag su b si st', 'summari result robust tradion file system vulner corrupon envyfs3 toler almost mistak one fs perform desktop workload  envyfs3 compar perform io intens workload  regular operaon  envyfs3  subsist accept 

In [25]:
#TODO: tokenize and vectorize input data
import pickle

# Load the tokenizer object
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)


In [31]:
# Tokenize inference sentences using the loaded tokenizer
inference_premise_sequences = tokenizer.texts_to_sequences(presentation_sentences_list[0])
inference_hypothesis_sequences = tokenizer.texts_to_sequences(paper_sentences_list[0])

# Pad the sequences to the same maximum sequence length
inference_premise_sequences = pad_sequences(inference_premise_sequences, maxlen=45, padding='post')
inference_hypothesis_sequences = pad_sequences(inference_hypothesis_sequences, maxlen=45, padding='post')

# Print the size of inference sequences
print("Size of Premise Inference Sequences:", len(inference_premise_sequences))
print("Size of Hypothesis Inference Sequences:", len(inference_hypothesis_sequences))

Size of Premise Inference Sequences: 48
Size of Hypothesis Inference Sequences: 48


In [46]:
probabilities = model.predict([inference_premise_sequences, inference_hypothesis_sequences])
print(probabilities)

[[0.05270668 0.41092962 0.5363636 ]
 [0.36244947 0.44154504 0.19600545]
 [0.36244947 0.44154504 0.19600545]
 [0.04072573 0.5585361  0.40073818]
 [0.0357815  0.6602722  0.30394632]
 [0.44055733 0.40984273 0.14959997]
 [0.36244947 0.44154504 0.19600545]
 [0.36244947 0.44154504 0.19600545]
 [0.44055733 0.40984273 0.14959997]
 [0.06903552 0.60474473 0.32621968]
 [0.04054331 0.72048444 0.23897228]
 [0.04809627 0.7215158  0.23038794]
 [0.57638747 0.26500243 0.15861017]
 [0.22449347 0.5176156  0.25789094]
 [0.0357815  0.6602722  0.30394632]
 [0.36244947 0.44154504 0.19600545]
 [0.36244947 0.44154504 0.19600545]
 [0.04809627 0.7215158  0.2303879 ]
 [0.02816491 0.704522   0.26731312]
 [0.02625869 0.8680804  0.10566093]
 [0.36244947 0.44154504 0.19600545]
 [0.1571911  0.5417155  0.30109337]
 [0.03578271 0.66028804 0.30392927]
 [0.03577476 0.6602885  0.3039367 ]
 [0.0357875  0.6602784  0.30393407]
 [0.36244947 0.44154504 0.19600545]
 [0.0357875  0.6602784  0.30393407]
 [0.04054331 0.72048444 0.23

In [1]:
len(probabilities)

NameError: name 'probabilities' is not defined

In [41]:
predicted_classes = np.argmax(probabilities, axis=1)

# Step 3: Class Labels
class_labels = ["Entailment", "Neutral", "Contradictory"]  # Replace with your actual class labels
predicted_labels = [class_labels[idx] for idx in predicted_classes]

# Print the predicted labels
print("Predicted Labels:", predicted_labels)

Predicted Labels: ['Contradictory', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Entailment', 'Neutral', 'Neutral', 'Entailment', 'Neutral', 'Neutral', 'Neutral', 'Entailment', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Contradictory', 'Neutral', 'Contradictory', 'Contradictory', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral']


In [62]:
# Count the occurrences of each label
label_counts = {label: predicted_labels.count(label) for label in set(predicted_labels)}

# Calculate the total count of all labels
total_count = sum(label_counts.values())

# Calculate the proportion of each label
label_proportions = {label: count / total_count for label, count in label_counts.items()}

# Define weights for each label
label_weights = {
    'Contradictory': 0,
    'Neutral': 0.5,
    'Entailment': 1
}

# Calculate the weighted sum of counts for all labels
weighted_sum = sum(label_counts[label] * label_weights[label] for label in label_counts)

# Normalize the weighted sum to range from 0 to 1
normalized_weighted_sum = weighted_sum / (total_count * max(label_weights.values()))

# Print the normalized weighted sum
print("Normalized Weighted Sum of Label Counts:", normalized_weighted_sum)

# Print the label proportions
print("Label Proportions:", label_proportions)


Normalized Weighted Sum of Label Counts: 0.4895833333333333
Label Proportions: {'Neutral': 0.8541666666666666, 'Contradictory': 0.08333333333333333, 'Entailment': 0.0625}


In [70]:
# Define the thresholds
thresholds = {
    'BAD': 0.33,
    'GOOD': 0.66,
    'GREAT': 1.0
}

# Determine the category based on the normalized weighted sum
category = None
for label, threshold in thresholds.items():
    if normalized_weighted_sum <= threshold:
        category = label
        break

# Print the category
print("This presentation was a", category, "representation of this paper.")


This presentation was a GOOD representation of this paper.
