In [35]:
import spacy
from collections import deque
import pandas as pd
import transformers
from transformers import AutoTokenizer

In [36]:

class TreeNode:
    def __init__(self, data):
        self.data = data
        self.parent = None
        self.children = []

    def add_child(self, child):
        child.parent = self
        self.children.append(child)

def spacy_doc_to_tree(doc):
    # Create a TreeNode for each token in the doc
    nodes = [TreeNode(token) for token in doc]

    # Create the tree structure by connecting parent-child relationships
    for token, node in zip(doc, nodes):
        if token.head.i == token.i:  # Skip root node (head is itself)
            continue
        parent_node = nodes[token.head.i]
        parent_node.add_child(node)

    # Find and return the root node
    root = next(node for node in nodes if node.parent is None)
    return root



def create_dataframe_from_tree(root):
    if not root:
        return pd.DataFrame()

    # Data list to hold information about each node
    data = []
    queue = deque([(root, 0, None)])

    while queue:
        node, level, parent = queue.popleft()

        # If the node has a parent, find the positions of its siblings
        if parent:
            sibling_positions = [child.data.i for child in parent.children]
        else:
            sibling_positions = []

        node_data = {
            "word": node.data.text,
            "position": node.data.i,
            "level": level,
            "level_weight": 1 / (level + 1),
            "parent": parent.data.text if parent else None,
            "sibling_positions": sibling_positions
        }
        data.append(node_data)

        for child in node.children:
            queue.append((child, level + 1, node))

    # Create DataFrame from the collected data
    df = pd.DataFrame(data)
    df = df.sort_values(by='level')
    return df



# Example usage:
# nlp = spacy.load("en_core_web_sm")
# text = "This is an example sentence."
# doc = nlp(text)

# Convert Spacy dependency tree to a Tree object
# tree_root = spacy_doc_to_tree(doc)

# Example usage with the Tree structure
# tree_df = create_dataframe_from_tree(tree_root)
# print(tree_df)


In [37]:
nlp = spacy.load("en_core_web_sm")
text = "A grandpa is not living in Himalayas."
doc = nlp(text+' MASK')

# Convert Spacy dependency tree to a Tree object
tree_root = spacy_doc_to_tree(doc)

# Example usage with the Tree structure
tree_df = create_dataframe_from_tree(tree_root)
tree_df = tree_df[tree_df['token'] != 'MASK']
tree_df


KeyError: 'token'

In [38]:
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

In [39]:
encoding = tokenizer('A grandpa is not living in Himalayas.')
num_tokens = len(encoding['input_ids'])
positions = range(num_tokens)
df_tokens = pd.DataFrame({'token_id': encoding['input_ids'], 'position': positions})
df_tokens['token'] = [tokenizer.decode([token_id]) for token_id in df_tokens['token_id']]
df_tokens

Unnamed: 0,token_id,position,token
0,32,0,A
1,4490,1,grand
2,8957,2,pa
3,318,3,is
4,407,4,not
5,2877,5,living
6,287,6,in
7,42438,7,Himal
8,323,8,ay
9,292,9,as


In [47]:
tree_df = tree_df[['word', 'level', 'level_weight', 'parent']]
tree_df

Unnamed: 0,word,level,level_weight,parent
0,living,0,1.0,
1,grandpa,1,0.5,living
2,is,1,0.5,living
3,not,1,0.5,living
4,in,1,0.5,living
5,.,1,0.5,living
6,A,2,0.333333,grandpa
7,Himalayas,2,0.333333,in


In [75]:
import re

# Initialize tokenizer (you can replace 'bert-base-uncased' with your desired model)
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Sentence combining all words
sentence = 'A grandpa is not living in Himalayas.'

def get_word_subtoken_dict(sentence, tokenizer):
    # Tokenize the sentence to get subtokens
    token_ids = tokenizer.encode(sentence, add_special_tokens=False)

    # Get words from the sentence
    words = [t.strip() for t in re.findall(r'\b.*?\S.*?(?:\b|$)', sentence)]

    # Initialize dictionary to store word-subtoken mappings
    word_subtoken_dict = {}
    k = 0

    for word in words:
        word_len = 0
        list_subtokens = []
        while word_len < len(word):
            list_subtokens.append(token_ids[k])
            decoded_word = tokenizer.decode([token_ids[k]]).replace(' ','')
            word_len += len(decoded_word)
            k += 1
        word_subtoken_dict[word] = list_subtokens

    return word_subtoken_dict

# Track position in the encoded_sentence


words: ['A', 'grandpa', 'is', 'not', 'living', 'in', 'Himalayas', '.']
token_ids: [32, 4490, 8957, 318, 407, 2877, 287, 42438, 323, 292, 13]
word_subtoken_dict: {'A': [32], 'grandpa': [4490, 8957], 'is': [318], 'not': [407], 'living': [2877], 'in': [287], 'Himalayas': [42438, 323, 292], '.': [13]}


In [48]:
word_subtoken_dict['living']

[2877]

In [79]:

def build_token_dependency_tree(tree_df, word_subtoken_dict, tokenizer, df_tokens):
    new_rows = []
    for index, row in tree_df.iterrows():
        word = row['word']
        token_ids = word_subtoken_dict[word]  # Use your tokenizer here
        for token_id in token_ids:
            new_row = row.copy()
            new_row['token_id'] = token_id
            new_row['token'] = tokenizer.decode([token_id])
            if row['parent'] is not None:
                new_row['parent_ids'] = word_subtoken_dict[row['parent']]
            new_rows.append(new_row)
    new_df = pd.DataFrame(new_rows)
    new_df = pd.merge(new_df, df_tokens, on=['token_id', 'token'], how='left')
    return new_df


Unnamed: 0,word,level,level_weight,parent,token_id,token_x,parent_ids,position,token_y
0,living,0,1.0,,2877,living,,5,living
1,grandpa,1,0.5,living,4490,grand,[2877],1,grand
2,grandpa,1,0.5,living,8957,pa,[2877],2,pa
3,is,1,0.5,living,318,is,[2877],3,is
4,not,1,0.5,living,407,not,[2877],4,not
5,in,1,0.5,living,287,in,[2877],6,in
6,.,1,0.5,living,13,.,[2877],10,.
7,A,2,0.333333,grandpa,32,A,"[4490, 8957]",0,A
8,Himalayas,2,0.333333,in,42438,Himal,[287],7,Himal
9,Himalayas,2,0.333333,in,323,ay,[287],8,ay
