In [4]:
import re
from collections import deque

import pandas as pd
import spacy
import transformers
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [164]:

class TreeNode:
    """A node of a simple rooted tree that knows both its parent and its children."""

    def __init__(self, data):
        """Create a detached node (no parent, no children) wrapping ``data``."""
        self.children = []
        self.parent = None
        self.data = data

    def add_child(self, child):
        """Attach ``child`` beneath this node and point its ``parent`` back here."""
        self.children.append(child)
        child.parent = self

def spacy_doc_to_tree(doc):
    """Build a ``TreeNode`` tree that mirrors the dependency structure of ``doc``.

    Every token is wrapped in a ``TreeNode`` and attached as a child of the
    node wrapping its ``head``. A token whose head is itself is treated as a
    root and is left without a parent.

    Args:
        doc: An iterable of tokens exposing ``.i`` and ``.head``. ``token.head.i``
            is used directly as an index into the token list, so token indices
            are assumed to start at 0 and be contiguous (true for a whole
            document; NOTE(review): confirm before passing a sub-span).

    Returns:
        The first node (in token order) that has no parent, or ``None`` when
        ``doc`` contains no tokens. If the input has several roots (e.g. more
        than one sentence), only the first root's tree is returned.
    """
    # One wrapper node per token, in token order
    nodes = [TreeNode(token) for token in doc]

    # Wire up the parent/child links from each token's head
    for token, node in zip(doc, nodes):
        if token.head.i == token.i:  # Root token: its head is itself
            continue
        nodes[token.head.i].add_child(node)

    # Use a default so an empty doc yields None instead of raising StopIteration;
    # create_dataframe_from_tree already handles a falsy root.
    return next((node for node in nodes if node.parent is None), None)



def create_dataframe_from_tree(root):
    """Flatten a tree into a DataFrame with one row per node, ordered by depth.

    The tree is walked breadth-first starting at ``root``. Each node must expose
    ``.data.text``, ``.data.i`` and ``.children``.

    Args:
        root: The root node of the tree, or a falsy value (e.g. ``None``).

    Returns:
        A ``pandas.DataFrame`` with the columns

        * ``word`` - ``node.data.text``
        * ``word_position`` - ``node.data.i``
        * ``level`` - depth of the node (root is 0)
        * ``level_weight`` - ``1 / (level + 1)``
        * ``parent`` - text of the parent node, ``None`` for the root
        * ``sibling_positions`` - ``data.i`` of every child of the parent
          (this includes the node itself); empty list for the root

        sorted by ``level``. When ``root`` is falsy the frame is empty but still
        carries these columns, so downstream column access keeps working.
    """
    columns = [
        "word",
        "word_position",
        "level",
        "level_weight",
        "parent",
        "sibling_positions",
    ]

    if not root:
        return pd.DataFrame(columns=columns)

    # Data list to hold information about each node
    data = []
    # Each queue entry is (node, depth, parent node)
    queue = deque([(root, 0, None)])

    while queue:
        node, level, parent = queue.popleft()

        # Positions of all children of the parent (the node itself included)
        if parent is not None:
            sibling_positions = [child.data.i for child in parent.children]
        else:
            sibling_positions = []

        data.append({
            "word": node.data.text,
            "word_position": node.data.i,
            "level": level,
            "level_weight": 1 / (level + 1),
            "parent": parent.data.text if parent is not None else None,
            "sibling_positions": sibling_positions,
        })

        queue.extend((child, level + 1, node) for child in node.children)

    # Rows are already in breadth-first order; a stable sort keeps that order
    # within each level (the default quicksort gives no such guarantee).
    df = pd.DataFrame(data, columns=columns)
    return df.sort_values(by="level", kind="mergesort")



# Example usage:
# nlp = spacy.load("en_core_web_sm")
# text = "This is an example sentence."
# doc = nlp(text)

# Convert Spacy dependency tree to a Tree object
# tree_root = spacy_doc_to_tree(doc)

# Example usage with the Tree structure
# tree_df = create_dataframe_from_tree(tree_root)
# print(tree_df)


In [165]:
nlp = spacy.load("en_core_web_sm")
text = "A grandpa is not a mum but a"

# Parse the sentence with a trailing ' MASK' placeholder word appended
doc = nlp(text + ' MASK')

# Dependency parse -> TreeNode tree -> one-row-per-word DataFrame
tree_root = spacy_doc_to_tree(doc)
tree_df = create_dataframe_from_tree(tree_root)

# Drop the row(s) whose word is the placeholder itself; rows that merely
# reference it as a parent or sibling are kept unchanged
tree_df = tree_df.loc[tree_df['word'] != 'MASK']
tree_df


Unnamed: 0,word,word_position,level,level_weight,parent,sibling_positions
0,is,2,0,1.0,,[]
1,grandpa,1,1,0.5,is,"[1, 3, 5, 8]"
2,not,3,1,0.5,is,"[1, 3, 5, 8]"
3,mum,5,1,0.5,is,"[1, 3, 5, 8]"
5,A,0,2,0.333333,grandpa,[0]
6,a,4,2,0.333333,mum,"[4, 6]"
7,but,6,2,0.333333,mum,"[4, 6]"
8,a,7,2,0.333333,MASK,[7]


In [148]:
# Initialize tokenizer: load the pretrained "gpt2" tokenizer through AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left-hand side of the sequence
tokenizer.padding_side = "left"

In [166]:
sentence = 'A grandpa is not a mum but a'

# Lazily grab the shortest chunk that starts at a word boundary, contains at
# least one non-space character and ends at the next word boundary (or at the
# end of the string). Chunks may carry leading/trailing whitespace, which is
# stripped below. Requires `re` to be imported in the imports cell.
WORD_PATTERN = re.compile(r'\b.*?\S.*?(?:\b|$)')

words = [chunk.strip() for chunk in WORD_PATTERN.findall(sentence)]

# One row per word; word_position is the 0-based index of the word in the sentence
df_words = pd.DataFrame({'word': words, 'word_position': range(len(words))})
df_words

Unnamed: 0,word,word_position
0,A,0
1,grandpa,1
2,is,2
3,not,3
4,a,4
5,mum,5
6,but,6
7,a,7


In [167]:
# Tokenize the very same `sentence` that was split into `words`, rather than a
# second copy of the literal, so the word list and the token list cannot drift
# apart (the token-to-word alignment relies on them describing the same text).
token_ids = tokenizer(sentence)['input_ids']
num_tokens = len(token_ids)
positions = range(num_tokens)

# One row per token: its vocabulary id and its 0-based position in the sequence
df_tokens = pd.DataFrame({'token_id': token_ids, 'token_position': positions})

# Decode each id on its own to get the text of that individual token
df_tokens['token'] = [tokenizer.decode([token_id]) for token_id in df_tokens['token_id']]
df_tokens

Unnamed: 0,token_id,token_position,token
0,32,0,A
1,4490,1,grand
2,8957,2,pa
3,318,3,is
4,407,4,not
5,257,5,a
6,25682,6,mum
7,475,7,but
8,257,8,a


In [168]:
# Lookup tables filled while aligning tokens with words:
#   token_to_word      decoded token text (spaces removed) -> word
#   token_id_to_word   token id                            -> word
#   pos_token_to_word  token position                      -> word position
# The first two are keyed by token, so a token that occurs more than once keeps
# only the word it was matched with last.
token_to_word, token_id_to_word, pos_token_to_word = {}, {}, {}
k = 0  # cursor into token_ids; it keeps advancing across words

for i, word in enumerate(words):
    word_len = 0
    # Consume tokens until their combined length (spaces removed) covers the word
    while word_len < len(word):
        decoded_word = tokenizer.decode([token_ids[k]]).replace(' ','')
        token_to_word[decoded_word] = word
        token_id_to_word[token_ids[k]] = word
        pos_token_to_word[k] = i
        word_len += len(decoded_word)
        k += 1

print(token_to_word)
print(pos_token_to_word)

{'A': 'A', 'grand': 'grandpa', 'pa': 'grandpa', 'is': 'is', 'not': 'not', 'a': 'a', 'mum': 'mum', 'but': 'but'}
{0: 0, 1: 1, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7}


In [170]:
# Give every token the index of the word it belongs to, looked up by token position
df_tokens['word_position'] = df_tokens['token_position'].map(pos_token_to_word)

# Pull in the word text by joining tokens and words on that shared word index
merged_df = df_tokens.merge(df_words, how='inner', on='word_position')
merged_df

Unnamed: 0,token_id,token_position,token,word_position,word
0,32,0,A,0,A
1,4490,1,grand,1,grandpa
2,8957,2,pa,1,grandpa
3,318,3,is,2,is
4,407,4,not,3,not
5,257,5,a,4,a
6,25682,6,mum,5,mum
7,475,7,but,6,but
8,257,8,a,7,a


In [171]:
# Attach the tree-derived columns of each word to all of its token rows
merged_df.merge(tree_df, how='inner', on=['word', 'word_position'])

Unnamed: 0,token_id,token_position,token,word_position,word,level,level_weight,parent,sibling_positions
0,32,0,A,0,A,2,0.333333,grandpa,[0]
1,4490,1,grand,1,grandpa,1,0.5,is,"[1, 3, 5, 8]"
2,8957,2,pa,1,grandpa,1,0.5,is,"[1, 3, 5, 8]"
3,318,3,is,2,is,0,1.0,,[]
4,407,4,not,3,not,1,0.5,is,"[1, 3, 5, 8]"
5,257,5,a,4,a,2,0.333333,mum,"[4, 6]"
6,25682,6,mum,5,mum,1,0.5,is,"[1, 3, 5, 8]"
7,475,7,but,6,but,2,0.333333,mum,"[4, 6]"
8,257,8,a,7,a,2,0.333333,MASK,[7]
