This code was written as part of the Pro-Text project, focusing on the relation between syntactic dependencies and pauses made when typing texts. 

In [257]:
data_file = open("test.conll", "r")
data = []

# Read file and split into sentences as lists of lines
current_sentence = []
for line in data_file:
	if line == "\n":
		if current_sentence:  # Ensure the sentence is not empty
			data.append(current_sentence)
		current_sentence = []
	else:
		current_sentence.append(line.strip())  # Remove newline characters

if current_sentence:  # Add the last sentence if file doesn't end with a newline
	data.append(current_sentence)

data_file.close()  # Close the file after reading

last_text_version = None

# Insert last_text_version where needed
for i in range(len(data)):
	if not data[i][0].startswith("# text_version"):
		if last_text_version is not None:
			data[i].insert(0, last_text_version)
	else:
		last_text_version = data[i][0]


for i in range(len(data)):
	expected_number = 1
	for j in range(2, len(data[i])):  # Assuming the first two lines are comments
		components = data[i][j].split('\t')
		# print(components)
		if len(components) > 1:  # Ensure the line is not a comment
			components[0] = str(expected_number)
			data[i][j] = '\t'.join(components)
			expected_number += 1


# # Print the updated data for verification
for sentence in data:
	for line in sentence:
		print(line)
#     print()  # Print a newline between sentences for clarity

# text_version=54
# sentence_id=1
1	la	la	DET	DET	n=s|g=f	3	det	_	charID=3__char=l__charStatus=True__pause=7270|charID=4__char=a__charStatus=True__pause=16
2	violence	violence	NC	NC	n=s|g=f	4	suj	_	charID=6__char=v__charStatus=True__pause=141|charID=7__char=i__charStatus=True__pause=156|charID=8__char=o__charStatus=True__pause=141|charID=9__char=l__charStatus=True__pause=125|charID=10__char=e__charStatus=True__pause=343|charID=11__char=n__charStatus=True__pause=109|charID=12__char=c__charStatus=True__pause=343|charID=13__char=e__charStatus=True__pause=62
3	est	être	V	V	n=s|t=P|p=3	0	root	_	charID=16__char=e__charStatus=True__pause=3370|charID=17__char=s__charStatus=True__pause=202|charID=18__char=t__charStatus=True__pause=93
4	une	une	DET	DET	n=s|g=f	8	det	_	charID=40__char=u__charStatus=True__pause=796|charID=41__char=n__charStatus=True__pause=171|charID=42__char=e__charStatus=True__pause=109
5	chose	chose	NC	NC	n=s|g=f	4	obj	_	charID=44__char=c__charStatus=True__pause=140|charID=45__

In [4]:
def clean_conll(file):
	data_file = open(file, "r")
	data = []

	# Read file and split into sentences as lists of lines
	current_sentence = []
	for line in data_file:
		if line == "\n":
			if current_sentence:  # Ensure the sentence is not empty
				data.append(current_sentence)
			current_sentence = []
		else:
			current_sentence.append(line.strip())  # Remove newline characters

	if current_sentence:  # Add the last sentence if file doesn't end with a newline
		data.append(current_sentence)

	data_file.close()  # Close the file after reading

	last_text_version = None

	# Insert last_text_version where needed
	for i in range(len(data)):
		if not data[i][0].startswith("# text_version"):
			if last_text_version is not None:
				data[i].insert(0, last_text_version)
		else:
			last_text_version = data[i][0]


	for i in range(len(data)):
		expected_number = 1
		for j in range(2, len(data[i])):  # Assuming the first two lines are comments
			components = data[i][j].split('\t')
			# print(components)
			if len(components) > 1:  # Ensure the line is not a comment
				components[0] = str(expected_number)
				data[i][j] = '\t'.join(components)
				expected_number += 1

	return data

In [84]:
import stanza
from stanza.models.common.doc import Document

nlp = stanza.Pipeline('fr', processors='depparse', depparse_pretagged=True)

2024-07-12 16:50:05 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-07-12 16:50:05 INFO: Downloaded file to /Users/madalina/stanza_resources/resources.json
2024-07-12 16:50:05 INFO: Loading these models for language: fr (French):
| Processor | Package         |
-------------------------------
| depparse  | combined_charlm |

2024-07-12 16:50:05 INFO: Using device: cpu
2024-07-12 16:50:05 INFO: Loading: depparse
2024-07-12 16:50:05 INFO: Done loading processors!


In [5]:
def create_new_conll(data, output_file):
	miscs = []
	with open (output_file, "w", encoding="utf-8") as f:
		for i in range(len(data)):
			conll_sent = data[i].split("\n")
			text_version = conll_sent [0] #literally: # text_version=0
			f.write(text_version + "\n")
			sentence_id = conll_sent[1] #literally: # sentence_id=1
			f.write(sentence_id + "\n")
			sentence_complete = conll_sent[2:]
			sentence = ""
			for token in sentence_complete:
				elements = token.split("\t")
				miscs.append(elements[9] if len(elements) > 1 else "")
				if len(elements) > 1:
					sentence += elements[1] + " "
			f.write(f"# text = {sentence}\n")
			doc = nlp(sentence)
			print(doc)
			for i in range(len(doc.sentences[0].words)):
				token = doc.sentences[0].words[i]
				xpos = token.xpos if not "None" else "_"
				deps = token.deps if not "None" else "_"
				f.write(f"{token.id}\t{token.text}\t{token.lemma}\t{token.pos}\t{xpos}\t{token.feats}\t{token.head}\t{token.deprel}\t{deps}\t{miscs[i]}\n")
			f.write("\n")
	print(f"File {output_file} created")

In [6]:
def extract_data(file):
	data_file = open(file, "r", encoding="utf-8")
	sentence = ""
	data = []
	for line in data_file:
		if line[0] == "\n":
			data.append(sentence)
			sentence = ""
		else:
			sentence = sentence + line
	return data

In [5]:
import stanza


nlp = stanza.Pipeline('fr', processors='tokenize, pos, lemma, depparse')


2024-07-17 13:55:01 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-07-17 13:55:01 INFO: Downloaded file to /Users/madalina/stanza_resources/resources.json
2024-07-17 13:55:02 INFO: Loading these models for language: fr (French):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

2024-07-17 13:55:02 INFO: Using device: cpu
2024-07-17 13:55:02 INFO: Loading: tokenize
2024-07-17 13:55:02 INFO: Loading: mwt
2024-07-17 13:55:02 INFO: Loading: pos
2024-07-17 13:55:02 INFO: Loading: lemma
2024-07-17 13:55:02 INFO: Loading: depparse
2024-07-17 13:55:02 INFO: Done loading processors!


In [205]:
def get_token_text_from_misc(misc):
	misc = misc.split("|")
	word = ""
	for char_details in misc:
		char_details = char_details.split("__")
		char = char_details[1].split("=")[1]
		word += char
	return word

	

In [29]:
original_list= [['1', 'Alors', 'alors', 'ADV', 'ADV', '_', '92', 'mod', '_', 'charID=408__char=A__charStatus=True__pause=6942|charID=409__char=l__charStatus=True__pause=780|charID=410__char=o__charStatus=True__pause=140|charID=411__char=r__charStatus=True__pause=359|charID=412__char=s__charStatus=True__pause=312'], ['2', 'a', 'avoir', 'V', 'V', 'n=s|t=P|p=3', '0', 'root', '_', 'charID=414__char=a__charStatus=True__pause=312'], ['3', 'la', 'la', 'DET', 'DET', 'n=s|g=f', '94', 'det', '_', 'charID=416__char=l__charStatus=True__pause=156|charID=417__char=a__charStatus=True__pause=265'], ['4', 'fin', 'fin', 'NC', 'NC', 'n=s', '92', 'obj', '_', 'charID=419__char=f__charStatus=True__pause=312|charID=420__char=i__charStatus=True__pause=219|charID=421__char=n__charStatus=True__pause=187'], ['5', 'du', 'de', 'P+D', 'P+D', 'n=s|g=m', '94', 'dep', '_', 'charID=423__char=d__charStatus=True__pause=281|charID=424__char=u__charStatus=True__pause=328'], ['6', 'cours', 'cour', 'NC', 'NC', 'n=p|g=f', '95', 'prep', '_', 'charID=426__char=c__charStatus=True__pause=171|charID=427__char=o__charStatus=True__pause=171|charID=428__char=u__charStatus=True__pause=249|charID=429__char=r__charStatus=True__pause=296|charID=430__char=s__charStatus=True__pause=827'], ['7', "j'", 'je', 'CLS', 'CLS', 'n=s|p=1', '98', 'suj', '_', "charID=432__char=j__charStatus=True__pause=7379|charID=433__char='__charStatus=True__pause=359"], ['8', 'ai', 'avoir', 'V', 'V', 'n=s|t=P|p=1', '92', 'mod', '_', 'charID=434__char=a__charStatus=True__pause=296|charID=435__char=i__charStatus=True__pause=328'], ['9', 'etait', '_', 'ADV', 'ADV', '_', '98', 'mod', '_', 'charID=437__char=e__charStatus=True__pause=375|charID=438__char=t__charStatus=True__pause=358|charID=439__char=a__charStatus=True__pause=312|charID=440__char=i__charStatus=True__pause=202|charID=441__char=t__charStatus=True__pause=218'], ['10', 'voir', '_', 'I', 'I', '_', '0', '_', '_', 'charID=443__char=v__charStatus=True__pause=234|charID=444__char=o__charStatus=True__pause=172|charID=445__char=i__charStatus=True__pause=141|charID=446__char=r__charStatus=True__pause=265']]
stanza_list = [[1, 'Alors', 'alors', 'ADV', '_', None, 11, 'advmod', '_', 'charID=408__char=A__charStatus=True__pause=6942|charID=409__char=l__charStatus=True__pause=780|charID=410__char=o__charStatus=True__pause=140|charID=411__char=r__charStatus=True__pause=359|charID=412__char=s__charStatus=True__pause=312'], [2, 'a', 'à', 'ADP', '_', 'Typo=Yes', 4, 'case', '_', 'charID=414__char=a__charStatus=True__pause=312'], [3, 'la', 'le', 'DET', '_', 'Definite=Def|Gender=Fem|Number=Sing|PronType=Art', 4, 'det', '_', 'charID=416__char=l__charStatus=True__pause=156|charID=417__char=a__charStatus=True__pause=265'], [4, 'fin', 'fin', 'NOUN', '_', 'Gender=Fem|Number=Sing', 11, 'obl:mod', '_', 'charID=419__char=f__charStatus=True__pause=312|charID=420__char=i__charStatus=True__pause=219|charID=421__char=n__charStatus=True__pause=187'], [5, 'de', 'de', 'ADP', '_', None, 7, 'case', '_', 'charID=419__char=f__charStatus=True__pause=312|charID=420__char=i__charStatus=True__pause=219|charID=421__char=n__charStatus=True__pause=187'], [6, 'le', 'le', 'DET', '_', 'Definite=Def|Gender=Masc|Number=Sing|PronType=Art', 7, 'det', '_', 'charID=419__char=f__charStatus=True__pause=312|charID=420__char=i__charStatus=True__pause=219|charID=421__char=n__charStatus=True__pause=187'], [7, 'cours', 'cours', 'NOUN', '_', 'Gender=Masc|Number=Sing', 4, 'nmod', '_', 'charID=419__char=f__charStatus=True__pause=312|charID=420__char=i__charStatus=True__pause=219|charID=421__char=n__charStatus=True__pause=187'], [8, "j'", 'moi', 'PRON', '_', 'Emph=No|Number=Sing|Person=1|PronType=Prs', 11, 'nsubj', '_', 'charID=419__char=f__charStatus=True__pause=312|charID=420__char=i__charStatus=True__pause=219|charID=421__char=n__charStatus=True__pause=187'], [9, 'ai', 'avoir', 'AUX', '_', 'Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin', 11, 'aux:tense', '_', 'charID=419__char=f__charStatus=True__pause=312|charID=420__char=i__charStatus=True__pause=219|charID=421__char=n__charStatus=True__pause=187'], [10, 'etait', 'être', 'AUX', '_', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', 11, 'aux:pass', '_', 'charID=419__char=f__charStatus=True__pause=312|charID=420__char=i__charStatus=True__pause=219|charID=421__char=n__charStatus=True__pause=187'], [11, 'voir', 'voir', 'VERB', '_', 'VerbForm=Inf', 0, 'root', '_', 'charID=419__char=f__charStatus=True__pause=312|charID=420__char=i__charStatus=True__pause=219|charID=421__char=n__charStatus=True__pause=187']]


In [35]:
new_list = []
head = ""
min_len = min(len(original_list), len(stanza_list))
if min_len == len(original_list):
	print("Entered if min_len == len(original_list)")
	for i in range(min_len):
		print(f"i: {i}")
		orig_misc = original_list[i][9]
		stanza_misc = stanza_list[i][9]
		if orig_misc == stanza_misc:
			print("Entered if orig_misc == stanza_misc")
			print(f"int(stanza_list[i][6]) = {int(stanza_list[i][6])}")
			new_list.append(stanza_list[i])
			if int(stanza_list[i][6]) == 0:
				head = stanza_list[i]
		else:
			print("Entered else orig_misc == stanza_misc")
			new_element = [stanza_list[i][0], original_list[i][1], original_list[i][2], stanza_list[i][3], stanza_list[i][4], stanza_list[i][5], stanza_list[i][6], stanza_list[i][7], stanza_list[i][8], orig_misc]
			print(f"int(new_element[6]) = {int(new_element[6])}")
			if int(new_element[6]) == 0:
				head = new_element
			new_list.append(new_element)
else:
	print("Entered else min_len == len(original_list)")
	for i in range(min_len):
		print(f"i: {i}")
		orig_misc = original_list[i][9]
		stanza_misc = stanza_list[i][9]
		if orig_misc == stanza_misc:
			print("Entered if orig_misc == stanza_misc")
			new_list.append(stanza_list[i])
			print(f"int(stanza_list[i][6]) = {int(stanza_list[i][6])}")
			if int(stanza_list[i][6]) == 0:
				head = stanza_list[i]
		else:
			print("Entered else orig_misc == stanza_misc")
			new_element_1 = [int(original_list[i][0]), original_list[i][1], original_list[i][2], stanza_list[i][3], stanza_list[i][4], stanza_list[i][5], stanza_list[i][6], stanza_list[i][7], stanza_list[i][8], orig_misc]
			new_element_2 = [int(original_list[i+1][0]), original_list[i+1][1], original_list[i+1][2], stanza_list[i][3], stanza_list[i][4], stanza_list[i][5], stanza_list[i][6], stanza_list[i][7], original_list[i+1][8], original_list[i+1][9]]
			print(f"int(new_element_1[6]) = {int(new_element_1[6])}")
			print(f"int(new_element_2[6]) = {int(new_element_2[6])}")
			if int(new_element_1[6]) == 0:
				head = new_element_1
			elif int(new_element_2[6]) == 0:
				head = new_element_2
			new_list.append(new_element_1)
			new_list.append(new_element_2)

if head == "":
	for token in new_list:
		if int(token[6]) == 0:
			head = token

print(f"head: {head}")
# Check and adjust the head for each token
num_tokens = len(new_list)
for token in new_list:
	if int(token[6]) > num_tokens:
		token[6] = int(head[0])

# for element in new_list:
# 	print(element)

# print(f"Length of original list: {len(original_list)}")
# print(f"Length of stanza list: {len(stanza_list)}")
# print(f"Length of new list: {len(new_list)}")
# print(new_list)

Entered if min_len == len(original_list)
i: 0
Entered if orig_misc == stanza_misc
int(stanza_list[i][6]) = 11
i: 1
Entered if orig_misc == stanza_misc
int(stanza_list[i][6]) = 4
i: 2
Entered if orig_misc == stanza_misc
int(stanza_list[i][6]) = 4
i: 3
Entered if orig_misc == stanza_misc
int(stanza_list[i][6]) = 11
i: 4
Entered else orig_misc == stanza_misc
int(new_element[6]) = 7
i: 5
Entered else orig_misc == stanza_misc
int(new_element[6]) = 7
i: 6
Entered else orig_misc == stanza_misc
int(new_element[6]) = 4
i: 7
Entered else orig_misc == stanza_misc
int(new_element[6]) = 11
i: 8
Entered else orig_misc == stanza_misc
int(new_element[6]) = 11
i: 9
Entered else orig_misc == stanza_misc
int(new_element[6]) = 11
head: 


IndexError: string index out of range

In [27]:
indices_heads = [(1, 3), (2, 3), (3, 0), (4, 5), (5, 3), (6, 7), (7, 5), (8, 10), (9, 10), (10, 5), (11, 3), (1, 2), (2, 0), (3, 2), (4, 9), (5, 9), (6, 9), (7, 9), (8, 9), (9, 2), (10, 12), (11, 12), (12, 2), (13, 16), (14, 16), (15, 16), (16, 12), (17, 19), (18, 19), (19, 16), (20, 2), (1, 4), (2, 4), (3, 4), (4, 0), (5, 7), (6, 7)]

sentences = []  # List to hold all sentences
current_sentence = []  # List to hold tuples of the current sentence

for pair in indices_heads:
	if pair[0] == 1 and current_sentence:  # If it's the start of a new sentence and not the first tuple
		sentences.append(current_sentence)  # Append the current sentence to sentences
		current_sentence = []  # Start a new current sentence
	current_sentence.append(pair)  # Add the current tuple to the current sentence

sentences.append(current_sentence)  # Append the last sentence after the loop finishes

# Now, sentences contains the separated sentences
for i, sentence in enumerate(sentences, start=1):
	print(f"Sentence {i}: {sentence}")

Sentence 1: [(1, 3), (2, 3), (3, 0), (4, 5), (5, 3), (6, 7), (7, 5), (8, 10), (9, 10), (10, 5), (11, 3)]
Sentence 2: [(1, 2), (2, 0), (3, 2), (4, 9), (5, 9), (6, 9), (7, 9), (8, 9), (9, 2), (10, 12), (11, 12), (12, 2), (13, 16), (14, 16), (15, 16), (16, 12), (17, 19), (18, 19), (19, 16), (20, 2)]
Sentence 3: [(1, 4), (2, 4), (3, 4), (4, 0), (5, 7), (6, 7)]


In [38]:
def correct_heads(indices_heads):

    sentences = []  # List to hold all sentences
    current_sentence = []  # List to hold tuples of the current sentence

    for pair in indices_heads:
        if pair[0] == 1 and current_sentence:  # If it's the start of a new sentence and not the first tuple
            sentences.append(current_sentence)  # Append the current sentence to sentences
            current_sentence = []  # Start a new current sentence
        current_sentence.append(pair)  # Add the current tuple to the current sentence

    sentences.append(current_sentence)  # Append the last sentence after the loop finishes

    for i, sentence in enumerate(sentences):
        for j, pair in enumerate(sentence):
            if pair[1] > sentence[-1][0]:  # If the head is outside the current sentence
                # Find the root of the current sentence (pair with second element == 0)
                root_pair = next((p for p in sentence if p[1] == 0), None)
                if root_pair:
                    # Replace the current pair with a new tuple
                    sentences[i][j] = (pair[0], root_pair[0])

    concat_list = sum(sentences, [])
    return concat_list

In [39]:
def merge_elements(original_list, stanza_list):
	new_list = []
	final_list = []
	print(f"original list: {original_list}")
	print(f"stanza list: {stanza_list}")
	min_len = min(len(original_list), len(stanza_list))
	if min_len == len(original_list):
		for i in range(min_len):
			orig_misc = original_list[i][9]
			stanza_misc = stanza_list[i][9]
			if orig_misc == stanza_misc:
				new_list.append(stanza_list[i])
			else:
				new_element = [stanza_list[i][0], original_list[i][1], original_list[i][2], stanza_list[i][3], stanza_list[i][4], stanza_list[i][5], stanza_list[i][6], stanza_list[i][7], stanza_list[i][8], orig_misc]
				new_list.append(new_element)
	else:
		for i in range(min_len):
			orig_misc = original_list[i][9]
			stanza_misc = stanza_list[i][9]
			if orig_misc == stanza_misc:
				new_list.append(stanza_list[i])
			else:
				new_element_1 = [int(original_list[i][0]), original_list[i][1], original_list[i][2], stanza_list[i][3], stanza_list[i][4], stanza_list[i][5], stanza_list[i][6], stanza_list[i][7], stanza_list[i][8], orig_misc]
				new_element_2 = [int(original_list[i+1][0]), original_list[i+1][1], original_list[i+1][2], stanza_list[i][3], stanza_list[i][4], stanza_list[i][5], stanza_list[i][6], stanza_list[i][7], original_list[i+1][8], original_list[i+1][9]]
				new_list.append(new_element_1)
				new_list.append(new_element_2)


	# Check and adjust the head for each token
	indices_heads = []
	for token in new_list:
		indices_heads.append((int(token[0]), int(token[6])))
	correct_indices_heads = correct_heads(indices_heads)
	for i in range(len(correct_indices_heads)):
		current_element = [int(correct_indices_heads[i][0]), new_list[i][1], new_list[i][2], new_list[i][3], new_list[i][4], new_list[i][5], correct_indices_heads[i][1], new_list[i][7], new_list[i][8], new_list[i][9]]
		final_list.append(current_element)
	
	return final_list



In [41]:
data = clean_conll("test.conll")
# data = clean_conll("/Users/madalina/Documents/M1TAL/stage_GC/Pro-TEXT_annotated_corpus_v0.3/conll/P18C6N1_with_pause.conll")


# miscs = []
previous_text_version = 0
previous_sent_id = 0
with open ("output.conll", "w", encoding="utf-8") as f:
	misc =""
	for i in range(len(data)):
		elements = [str(element) for element in data[i]]
		text_version = int(elements[0].split("=")[1]) #literally: # text_version=0
		sent_id = 1

		if text_version == previous_text_version and sent_id <= previous_sent_id:
			sent_id = previous_sent_id + 1

		sentence_complete = elements[2:]

		sentence = ""
		original_tokens = []
		list_elements = []
		for token in sentence_complete:

			conll_columns = token.split("\t")
			list_elements.append(conll_columns)
			item_dict = {'first_elements': tuple(conll_columns[:10]), 'misc': conll_columns[9]} if len(conll_columns) > 1 else {'first_elements': tuple(conll_columns), 'misc': ""}
			original_tokens.append(item_dict)
			
			if len(conll_columns) > 1:
				sentence += conll_columns[1] + " "


		doc = nlp(sentence)
		stanza_nb_tokens = 0
		for j in range(len(doc.sentences)):
			stanza_nb_tokens += len(doc.sentences[j].words)
		list_stanza_elements = []
		if stanza_nb_tokens != len(list_elements):
			for j in range(len(doc.sentences)):
				for i in range(len(doc.sentences[j].words)):
					token = doc.sentences[j].words[i]
					
					xpos = token.xpos if not "None" else "_"
					deps = token.deps if not "None" else "_"
					if j == 0:
						id_to_check_against = token.id
					else:
						id_to_check_against = token.id + len(doc.sentences[j-1].words)
					for element in original_tokens:
						if element['first_elements'][1] == token.text and int(element['first_elements'][0]) == id_to_check_against:
							misc = element['misc']
							break
					list_stanza_elements.append([token.id, token.text, token.lemma, token.pos, xpos, token.feats, token.head, token.deprel, deps, misc])
			new_list = merge_elements(list_elements, list_stanza_elements)
			iterations = 0
			for element in new_list:
				if element[0] == 1:
					mini_sentence = " ".join([token.text for token in doc.sentences[iterations].words])
					if iterations > 0:
						f.write("\n")
					f.write("# text_version=")
					f.write(str(text_version))
					previous_text_version = text_version
					f.write("\n")

					f.write("# sentence_id=")
					f.write(str(sent_id))
					previous_sent_id = sent_id
					sent_id += 1
					f.write("\n")

					f.write(f"# text = {mini_sentence}")
					f.write("\n")

					f.write(f"{element[0]}\t{element[1]}\t{element[2]}\t{element[3]}\t{element[4]}\t{element[5]}\t{element[6]}\t{element[7]}\t{element[8]}\t{element[9]}\n")
					iterations += 1
				else:
					f.write(f"{element[0]}\t{element[1]}\t{element[2]}\t{element[3]}\t{element[4]}\t{element[5]}\t{element[6]}\t{element[7]}\t{element[8]}\t{element[9]}\n")
		else:
			nb_words_in_sentence = []
			for j in range(len(doc.sentences)):
				nb_words_in_sentence.append(len(doc.sentences[j].words))
				mini_sentence = ""
				mini_sentence = " ".join([token.text for token in doc.sentences[j].words])
				# print(mini_sentence)
				if j > 0:
					print("J > 0!!!!!!!!!!!!!!!!")
					f.write("# text_version=")
					f.write(str(text_version))
					previous_text_version = text_version
					f.write("\n")

					f.write("# sentence_id=")
					f.write(str(sent_id))
					previous_sent_id = sent_id
					sent_id += 1
					f.write("\n")

					f.write(f"# text = {mini_sentence}")
					f.write("\n")
				else:
					f.write("# text_version=")
					f.write(str(text_version))
					previous_text_version = text_version
					f.write("\n")

					f.write("# sentence_id=")
					f.write(str(sent_id))
					previous_sent_id = sent_id
					sent_id += 1
					f.write("\n")

					f.write(f"# text = {mini_sentence}")
					f.write("\n")
				for i in range(len(doc.sentences[j].words)):
					token = doc.sentences[j].words[i]
					# print(f"token: {token}")

					xpos = token.xpos if not "None" else "_"
					deps = token.deps if not "None" else "_"
					if j == 0:
						id_to_check_against = token.id
					else:
						id_to_check_against = token.id + sum(nb_words_in_sentence[:j])                    

					for element in original_tokens:

						if element['first_elements'][1] == token.text and int(element['first_elements'][0]) == id_to_check_against:
							misc = element['misc']
							break   
					f.write(f"{token.id}\t{token.text}\t{token.lemma}\t{token.pos}\t{xpos}\t{token.feats}\t{token.head}\t{token.deprel}\t{deps}\t{misc}\n")
				f.write("\n")

original list: [['1', 'Ensuite', 'ensuite', 'ADV', 'ADV', '_', '27', 'mod', '_', 'charID=118__char=E__charStatus=True__pause=1716|charID=103__char=n__charStatus=True__pause=1077|charID=104__char=s__charStatus=True__pause=2309|charID=105__char=u__charStatus=True__pause=2511|charID=106__char=i__charStatus=True__pause=842|charID=107__char=t__charStatus=True__pause=562|charID=108__char=e__charStatus=True__pause=422'], ['2', 'il', 'il', 'CLS', 'CLS', 'n=s|g=m|p=3', '27', 'suj', '_', 'charID=119__char=i__charStatus=True__pause=14009|charID=120__char=l__charStatus=True__pause=437'], ['3', 'faudrait', 'falloir', 'V', 'V', 'n=s|t=C|p=3', '0', 'root', '_', 'charID=122__char=f__charStatus=True__pause=4212|charID=123__char=a__charStatus=True__pause=656|charID=124__char=u__charStatus=True__pause=1248|charID=125__char=d__charStatus=True__pause=2901|charID=126__char=r__charStatus=True__pause=483|charID=127__char=a__charStatus=True__pause=718|charID=128__char=i__charStatus=True__pause=998|charID=129__

In [40]:
def find_first_difference_index(l_orig, l_stanza):
	# Find the minimum length to avoid index out of range errors
	min_len = min(len(l_orig), len(l_stanza))
	
	# Compare the lists element by element
	for i in range(min_len):
		if l_orig[i] != l_stanza[i]:
			return i
	
	# If no difference was found within the range of the shortest list
	# Check if the lengths are different
	if len(l_orig) != len(l_stanza):
		return min_len
	
	# If no difference is found and lengths are the same
	return -1

# Example usage
l_orig = ["C'", 'etait', 'en', 'décembre', 'je', 'connaisser', 'ce', 'garçon', 'depuis', 'la', 'marenelle', 'il', 'etait', 'très', 'gentil', '.', 'Quand', 'nous', 'sommes', 'rentrée', 'en', 'sixiéme', 'il', 'trainaient', 'avec', 'des', 'quatriéme', 'des', 'garçond']
l_stanza = ["C'", 'etait', 'en', 'décembre', 'je', 'connaisser', 'ce', 'garçon', 'depuis', 'la', 'marenelle', 'il', 'etait', 'très', 'gentil', '.', 'Quand', 'nous', 'sommes', 'rentrée', 'en', 'sixiéme', 'il', 'trainaient', 'avec', 'des', 'quatriéme', 'de', 'les', 'garçond']

index = find_first_difference_index(l_orig, l_stanza)
if index != -1:
	print(f"First difference is at index: {index}")
	print(f"Original: {l_orig[index]}")
else:
	print("No difference found")


First difference is at index: 27
Original: des


In [103]:
import stanza
from stanza.models.common.doc import Document

nlp = stanza.Pipeline(lang='en', processors='depparse', depparse_pretagged=True)
pretagged_doc = Document([[{'id': 1, 'text': "C'", 'lemma': 'ce', 'upos': 'CLS', 'xpos': 'DET', 'feats': 'n=s|g=m'}, {'id': 2, 'text': 'etait', 'lemma': 'être', 'upos': 'V', 'xpos': 'NPP', 'feats': '_'}, {'id': 3, 'text': 'en', 'lemma': 'en', 'upos': 'P', 'xpos': 'P', 'feats': '_'}, {'id': 4, 'text': 'décembre', 'lemma': 'décembre', 'upos': 'NC', 'xpos': 'NC', 'feats': 'n=s|g=m'}, {'id': 5, 'text': 'je', 'lemma': 'je', 'upos': 'CLS', 'xpos': 'CLS', 'feats': 'n=s|p=1'}, {'id': 6, 'text': 'connaisser', 'lemma': 'connaître', 'upos': 'V', 'xpos': 'VINF', 'feats': '_'}, {'id': 7, 'text': 'ce', 'lemma': 'ce', 'upos': 'DET', 'xpos': 'DET', 'feats': 'n=s|g=m'}, {'id': 8, 'text': 'garçon', 'lemma': 'garçon', 'upos': 'NC', 'xpos': 'NC', 'feats': 'n=s|g=m'}, {'id': 9, 'text': 'depuis', 'lemma': 'depuis', 'upos': 'P', 'xpos': 'P', 'feats': '_'}, {'id': 10, 'text': 'la', 'lemma': 'le', 'upos': 'DET', 'xpos': 'DET', 'feats': 'n=s|g=f'}, {'id': 11, 'text': 'marenelle', 'lemma': 'maternelle', 'upos': 'NC', 'xpos': 'NC', 'feats': '_'}, {'id': 12, 'text': 'il', 'lemma': 'il', 'upos': 'CLS', 'xpos': 'CLS', 'feats': 'n=s|g=m|p=3'}, {'id': 13, 'text': 'etait', 'lemma': 'être', 'upos': 'V', 'xpos': 'V', 'feats': '_'}, {'id': 14, 'text': 'très', 'lemma': 'très', 'upos': 'ADV', 'xpos': 'ADV', 'feats': '_'}, {'id': 15, 'text': 'gentil', 'lemma': 'gentil', 'upos': 'ADJ', 'xpos': 'ADJ', 'feats': 'n=s|g=m'}, {'id': 16, 'text': '.', 'lemma': '.', 'upos': 'PONCT', 'xpos': 'PONCT', 'feats': '_'}, {'id': 17, 'text': 'Quand', 'lemma': 'quand', 'upos': 'CS', 'xpos': 'CS', 'feats': '_'}, {'id': 18, 'text': 'nous', 'lemma': 'nous', 'upos': 'CLS', 'xpos': 'CLS', 'feats': 'n=p|p=1'}, {'id': 19, 'text': 'sommes', 'lemma': 'être', 'upos': 'V', 'xpos': 'V', 'feats': 'n=p|t=P|p=1'}, {'id': 20, 'text': 'rentrée', 'lemma': 'rentrer', 'upos': 'VPP', 'xpos': 'VPP', 'feats': 'n=s|g=f|t=K'}, {'id': 21, 'text': 'en', 'lemma': 'en', 'upos': 'P', 'xpos': 'P', 'feats': '_'}, {'id': 22, 'text': 'sixiéme', 'lemma': 'sixième', 'upos': 'NC', 'xpos': 'ADV', 'feats': '_'}, {'id': 23, 'text': 'il', 'lemma': 'il', 'upos': 'CLS', 'xpos': 'CLS', 'feats': 'n=s|g=m|p=3'}, {'id': 24, 'text': 'trainaient', 'lemma': 'traîner', 'upos': 'V', 'xpos': 'V', 'feats': '_'}, {'id': 25, 'text': 'avec', 'lemma': 'avec', 'upos': 'P', 'xpos': 'P', 'feats': '_'}, {'id': 26, 'text': 'des', 'lemma': 'un', 'upos': 'DET', 'xpos': 'DET', 'feats': 'n=p'}, {'id': 27, 'text': 'quatriéme', 'lemma': 'quatrième', 'upos': 'NC', 'xpos': 'NC', 'feats': '_'}, {'id': 28, 'text': 'des', 'lemma': 'un', 'upos': 'DET', 'xpos': 'P+D', 'feats': 'n=p'}, {'id': 29, 'text': 'garçond', 'lemma': 'garçon', 'upos': 'NC', 'xpos': 'NC', 'feats': '_'}]
])
doc = nlp(pretagged_doc)
print(doc)

2024-07-12 17:06:52 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-07-12 17:06:52 INFO: Downloaded file to /Users/madalina/stanza_resources/resources.json
2024-07-12 17:06:52 INFO: Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| depparse  | combined_charlm |

2024-07-12 17:06:52 INFO: Using device: cpu
2024-07-12 17:06:52 INFO: Loading: depparse
2024-07-12 17:06:53 INFO: Done loading processors!


[
  [
    {
      "id": 1,
      "text": "C'",
      "lemma": "ce",
      "upos": "CLS",
      "xpos": "DET",
      "feats": "n=s|g=m",
      "head": 2,
      "deprel": "dep"
    },
    {
      "id": 2,
      "text": "etait",
      "lemma": "être",
      "upos": "V",
      "xpos": "NPP",
      "feats": "_",
      "head": 0,
      "deprel": "root"
    },
    {
      "id": 3,
      "text": "en",
      "lemma": "en",
      "upos": "P",
      "xpos": "P",
      "feats": "_",
      "head": 2,
      "deprel": "flat"
    },
    {
      "id": 4,
      "text": "décembre",
      "lemma": "décembre",
      "upos": "NC",
      "xpos": "NC",
      "feats": "n=s|g=m",
      "head": 2,
      "deprel": "flat"
    },
    {
      "id": 5,
      "text": "je",
      "lemma": "je",
      "upos": "CLS",
      "xpos": "CLS",
      "feats": "n=s|p=1",
      "head": 6,
      "deprel": "punct"
    },
    {
      "id": 6,
      "text": "connaisser",
      "lemma": "connaître",
      "upos": "V",
      "xpos": "V

In [None]:
# create_new_conll(data, "output.conll")

# for i in range(len(data)):
#     # print(data[0].split("\n"))
#     # print(data[0].split("\n")[2].split("\t")[1])
#     conll_sent = data[i].split("\n")
#     print(conll_sent)
#     sentence_complete = conll_sent[2:]
#     # print(sentence_complete)
#     sentence = ""
#     miscs = []
#     for token in sentence_complete:
#         elements = token.split("\t")
#         # print(elements)
#         miscs.append(elements[9] if len(elements) > 1 else "")
#         if len(elements) > 1:
#             sentence += elements[1] + " "
#     print(sentence)

#     doc = nlp(sentence)
#     print(doc.sentences)
#     # print(miscs)
#     # print(doc.sentences[0].words[0])
#     # for i in range(len(doc.sentences[0].words)):
#     #     token = doc.sentences[0].words[i]
#         # print("============")
#         # print(token.text)
#         # print(miscs[i])

In [36]:
import stanza

nlp = stanza.Pipeline('fr', processors='tokenize, pos, lemma, depparse')

def extract_sentences(data):
	sentences = []
	miscs = []
	with open ("clean_file.conll", "w", encoding="utf-8") as f:
		for i in range(len(data)):
			conll_sent = data[i].split("\n")
			text_version = conll_sent [0] #literally: # text_version=0
			f.write(text_version + "\n")
			sentence_id = conll_sent[1] #literally: # sentence_id=1
			f.write(sentence_id + "\n")
			sentence_complete = conll_sent[2:]
			sentence = ""
			for token in sentence_complete:
				elements = token.split("\t")
				miscs.append(elements[9] if len(elements) > 1 else "")
				if len(elements) > 1:
					sentence += elements[1] + " "
			f.write(f"# text = {sentence}\n")
			doc = nlp(sentence)
			for i in range(len(doc.sentences[0].words)):
				token = doc.sentences[0].words[i]
				xpos = token.xpos if not "None" else "_"
				deps = token.deps if not "None" else "_"
				f.write(f"{token.id}\t{token.text}\t{token.lemma}\t{token.pos}\t{xpos}\t{token.feats}\t{token.head}\t{token.deprel}\t{deps}\t{miscs[i]}\n")
			f.write("\n")

extract_sentences(data)



2024-07-10 11:57:28 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-07-10 11:57:28 INFO: Downloaded file to /Users/madalina/stanza_resources/resources.json
2024-07-10 11:57:29 INFO: Loading these models for language: fr (French):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

2024-07-10 11:57:29 INFO: Using device: cpu
2024-07-10 11:57:29 INFO: Loading: tokenize
2024-07-10 11:57:29 INFO: Loading: mwt
2024-07-10 11:57:29 INFO: Loading: pos
2024-07-10 11:57:29 INFO: Loading: lemma
2024-07-10 11:57:29 INFO: Loading: depparse
2024-07-10 11:57:29 INFO: Done loading processors!


In [3]:
from conllu import parse_incr, parse

def create_sentences_file(data, output):
	with open(output, "w") as f:
		for i in range(len(data)):
			sent = ""
			sentence = data[i]
			sentence = parse(sentence)
			tokens = sentence[0]
			for token in tokens:
				sent = sent + str(token) + " "
			f.write(sent.strip() + "\n")
	print(f"File {output} created")

create_sentences_file(data, "sentences.txt")

File sentences.txt created


In [8]:
import stanza

nlp = stanza.Pipeline('fr', processors='tokenize, pos, lemma, depparse')
with open ("sentences.txt", "r") as f:
	for line in f:
		print("====================================")
		doc = nlp(line)
		print(*[f'id: {word.id}\tword: {word.text}\thead id: {word.head}\thead: {sent.words[word.head-1].text if word.head > 0 else "root"}\tdeprel: {word.deprel}' for sent in doc.sentences for word in sent.words], sep='\n')



2024-07-10 10:51:39 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-07-10 10:51:39 INFO: Downloaded file to /Users/madalina/stanza_resources/resources.json
2024-07-10 10:51:40 INFO: Loading these models for language: fr (French):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

2024-07-10 10:51:40 INFO: Using device: cpu
2024-07-10 10:51:40 INFO: Loading: tokenize
2024-07-10 10:51:40 INFO: Loading: mwt
2024-07-10 10:51:40 INFO: Loading: pos
2024-07-10 10:51:40 INFO: Loading: lemma
2024-07-10 10:51:40 INFO: Loading: depparse
2024-07-10 10:51:40 INFO: Done loading processors!


id: 1	word: Moi	head id: 4	head: aime	deprel: dislocated
id: 2	word: je	head id: 4	head: aime	deprel: nsubj
id: 3	word: n'	head id: 4	head: aime	deprel: advmod
id: 4	word: aime	head id: 0	head: root	deprel: root
id: 5	word: pas	head id: 4	head: aime	deprel: advmod
id: 6	word: ca	head id: 4	head: aime	deprel: obj
id: 7	word: la	head id: 8	head: violence	deprel: det
id: 8	word: violence	head id: 4	head: aime	deprel: obj
id: 9	word: parce	head id: 14	head: a	deprel: mark
id: 10	word: '	head id: 9	head: parce	deprel: punct
id: 11	word: que	head id: 14	head: a	deprel: mark
id: 12	word: il	head id: 14	head: a	deprel: expl:subj
id: 13	word: y	head id: 14	head: a	deprel: expl:comp
id: 14	word: a	head id: 4	head: aime	deprel: advcl
id: 15	word: une	head id: 16	head: attebn	deprel: det
id: 16	word: attebn	head id: 14	head: a	deprel: obj
id: 1	word: Moi	head id: 4	head: aime	deprel: dislocated
id: 2	word: je	head id: 4	head: aime	deprel: nsubj
id: 3	word: n'	head id: 4	head: aime	deprel: advmod
i

In [7]:
from stanza.utils.conll import CoNLL

dicts = [[{'id': 1, 'text': 'Test', 'upos': 'NOUN', 'xpos': 'NN', 'feats': 'Number=Sing', 'misc': 'start_char=0|end_char=4'}, {'id': 2, 'text': 'sentence', 'upos': 'NOUN', 'xpos': 'NN', 'feats': 'Number=Sing', 'misc': 'start_char=5|end_char=13'}, {'id': 3, 'text': '.', 'upos': 'PUNCT', 'xpos': '.', 'misc': 'start_char=13|end_char=14'}]] # dicts is List[List[Dict]], representing each token / word in each sentence in the document
conll = CoNLL.convert_dict(dicts) # conll is List[List[List]], representing each token / word in each sentence in the document
print(conll)

[[['1', 'Test', '_', 'NOUN', 'NN', 'Number=Sing', '0', '_', '_', 'start_char=0|end_char=4'], ['2', 'sentence', '_', 'NOUN', 'NN', 'Number=Sing', '1', '_', '_', 'start_char=5|end_char=13'], ['3', '.', '_', 'PUNCT', '.', '_', '2', '_', '_', 'start_char=13|end_char=14']]]


In [9]:
import stanza

# Download the model (if not already done)
stanza.download('en')

# Initialize the pipeline
nlp = stanza.Pipeline('en')

# Annotate a sentence
doc = nlp("Your sentence here.")  # Replace with your sentence

# Convert to CoNLL format
conll = doc.to_conll()

# Write to a file
with open("output.conll", "w") as f:
	f.write(conll)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-07-10 10:56:27 INFO: Downloaded file to /Users/madalina/stanza_resources/resources.json
2024-07-10 10:56:27 INFO: Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/default.zip:   0%|          | 0…

2024-07-10 10:56:44 INFO: Downloaded file to /Users/madalina/stanza_resources/en/default.zip
2024-07-10 10:56:45 INFO: Finished downloading models and saved to /Users/madalina/stanza_resources
2024-07-10 10:56:45 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-07-10 10:56:45 INFO: Downloaded file to /Users/madalina/stanza_resources/resources.json
2024-07-10 10:56:46 INFO: Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

2024-07-10 10:56:46 INFO: Using device: cpu
2024-07-10 10:56:46 INFO: Loading: tokenize
2024-07-10 10:56:46 INFO: Loading: mwt
2024-07-10 10:56:46 INFO: Loading: pos
2024-07-10 10:56:46 INFO: Loading: lemma
2024-07-10 10:56:46 INFO: Loading: constituency
2024-07-10 10:56:46 INFO: Loading: depparse
2024-07-10 10:56:47 INFO: Loading: sentiment
2024-07-10 10:56:47 INFO: Loading: ne

AttributeError: 'Document' object has no attribute 'to_conll'

In [40]:
s = '3	au	à + le	ADP	_	None	5	case	_	charID=879__char=a__charStatus=True__pause=671|charID=880__char=u__charStatus=True__pause=109'
s = s.split("\t")
print(len(s))

10
