In [1]:
# https://towardsdatascience.com/calculating-string-similarity-in-python-276e18a7d33a
# https://pypi.org/project/fuzzywuzzy/
# https://www.adamsmith.haus/python/answers/how-to-find-a-similarity-metric-between-two-strings-in-python


from transformers import T5TokenizerFast, T5ForConditionalGeneration 
from transformers import Trainer

import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2" 


from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import numpy as np
import torch
import torchvision
import Levenshtein
from fuzzywuzzy import fuzz
import time

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# device = torch.device("cuda:2") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
def get_levenshtein_dis(str_1, str_2):
    return Levenshtein.distance(str_1, str_2)

In [3]:
def get_fuzzy_ration(str_1, str_2):
    return fuzz.ratio(str_1, str_2)

In [4]:
max_source_length = 1024
max_target_length = 128

In [5]:
tokenizer = T5TokenizerFast.from_pretrained("t5-base")
model_path = "./NEL_model_shuffled_add_spaces_in_input/checkpoint-20000"
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)
# model = T5ForConditionalGeneration.from_pretrained(model_path)

## testing with the simple test data

In [6]:
simple_test_data = pd.read_csv('./2-NEL_Data/2-csv_format_2/simple_test_shuffled.csv')
# simple_test_data = simple_test_data.sample(frac=1, random_state=1)
simple_test_data

Unnamed: 0,qid,question,entity,wikidata_reply,qid_in_reply,input_len
0,Q5487302,** Which genre of album is harder.....faster? **,** **,** **,False,64
1,Q16330302,** what city was alex golfis born in **,** alex golfis **,"** [[ Q16330302 , Alex Golfis , Greek actor (1...",True,121
2,Q16225521,** what film is by the writer phil hay? **,** phil hay **,"** [[ Q16225521 , Phil Hay , screenwriter ], [...",True,419
3,Q7358590,** Where did roger marquis die **,** roger marquis **,"** [[ Q7358590 , Roger Marquis , American base...",True,209
4,Q154335,** what was the cause of death of yves klein **,** yves klein **,"** [[ Q8062325 , Yves Klein Blue , Australian ...",True,553
...,...,...,...,...,...,...
9956,Q1447249,** who was the creator of the fictional charac...,** doctor faustus **,"** [[ Q386431 , Doctor Faustus , novel written...",True,549
9957,Q34863,** what's a college sporting event that took p...,** oklahoma city **,"** [[ Q15256873 , OKC Energy FC , football clu...",True,636
9958,Q582715,** what celestial object is 2974 holden **,** 2974 holden **,"** [[ Q582715 , 2974 Holden , asteroid ]] **",True,107
9959,Q582147,** what is the film genre for snow falling on ...,** snow falling on cedars **,"** [[ Q582147 , Snow Falling on Cedars , 1999 ...",True,229


In [7]:
input_text = list(simple_test_data['question'] + ',' + simple_test_data['entity'] + ',' + simple_test_data['wikidata_reply'])
input_text[0]

'** Which genre of album is harder.....faster? **,**    **,**  **'

In [8]:
target_text = list(simple_test_data['qid'])
target_text[0]

'Q5487302'

In [9]:
counter_empty = 0
for s in simple_test_data['entity']:
    if s == '****':
        counter_empty += 1
counter_empty

0

In [10]:
X_test_tokenized = tokenizer(['nel: ' + sequence for sequence in input_text], 
                              padding=True, 
                              truncation=True, 
                              max_length=max_source_length)

y_test_tokenized = tokenizer(target_text, 
                              padding=True, 
                              truncation=True, 
                              max_length=max_target_length)

print(len(simple_test_data))

9961


In [11]:
tokens = []
for text in input_text:
    tokens.append(tokenizer('nel: ' + text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids)

In [12]:
tokens[0]

tensor([[    3,    29,    15,    40,    10, 14011,  4073,  5349,    13,  2306,
            19,  7501,  9374, 11584,    49,    58, 14011,     6, 19844, 14011,
             6, 19844, 14011,     1]])

In [13]:
# results = []
# # model = model.to(device)

# for token in tokens:
#     results.append(model.generate(token.to(device))) 

In [18]:
start = time.time()
counter = 0
simple_predictions = []
for token in tokens:
    if counter%100 == 0:
        print(counter)
    counter += 1
    prediction = model.generate(token.to(device))
    simple_predictions.append(prediction)
    
print(time.time() - start)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
1271.919031381607


In [19]:
# tokenizer.decode(simple_predictions[0][0], skip_special_tokens=True)

In [20]:
final_ouput_2 = []
for result in simple_predictions:
    final_ouput_2.append(tokenizer.decode(result[0], skip_special_tokens=True))

In [21]:
final_ouput_2[0]

'Q71616'

In [22]:
# counter = 0
# for i in range(len(final_ouput_2)):
#     if final_ouput_2[i] != predicitons[i]:
#         print(target_text[i] + "    " + final_ouput_2[i] + "    " + predicitons[i])
#         counter += 1
# counter

In [23]:
# counter = 0
# for i in range(len(final_ouput_2)):
#     if predicitons[i] != target_text[i]:
#         counter += 1
# 1- counter/len(predicitons)

In [25]:
counter = 0
for i in range(len(final_ouput_2)):
    if final_ouput_2[i] == target_text[i]:
        counter += 1
counter*100/len(final_ouput_2)

84.98142756751331

In [36]:
counter = 0
for i in range(len(final_ouput_2)):
    if final_ouput_2[i] == target_text[i]:
        counter += 1
counter*100/len(final_ouput_2)

84.98142756751331

In [27]:
counter = 0
for i in range(len(final_ouput_2)):
    if final_ouput_2[i] != target_text[i] and simple_test_data['entity'][i] != '****':
        counter += 1
#         print(final_ouput_2[i] + '    ' + target_text[i])
1- counter/len(final_ouput_2)

0.849814275675133

## disamb_test_data

In [28]:
disamb_test_data = pd.read_csv('./2-NEL_Data/2-csv_format_2/wikidata_disambig_test_shuffled.csv')
disamb_test_data

Unnamed: 0,qid,question,entity,wikidata_reply,qid_in_reply,input_len
0,Q3918,** The University of Nanking (金陵大学) was a priv...,** University **,"** [[ Q13371 , Harvard University , private un...",True,743
1,Q16666,"** private university in Nanjing, China which...",** Nanjing **,"** [[ Q19851992 , Nanjing , direct-administere...",True,707
2,Q1860,** first school officially named University (...,** English **,"** [[ Q1062280 , Channel 5 , British TV statio...",True,688
3,Q81982,** to the German-speaking nations. Silbermann...,** harpsichord **,"** [[ Q29385649 , Harpsichord , chordophone-zi...",True,682
4,Q3758482,** Bart van Oort and Gary Cooper. Opinions. O...,** Gary Cooper **,"** [[ Q3758482 , Gary Cooper , British musicia...",True,649
...,...,...,...,...,...,...
9995,Q81292,** = 100 sq ft). ≈0.00002295684 acres. 92 903...,** acre **,"** [[ Q126084 , Acre , city in Israel ], [ Q15...",True,702
9996,Q190,** that all beings are expressions of and par...,** God **,"** [[ Q604258 , Göd , town in Hungary ], [ Q17...",True,711
9997,Q193291,** idealistic and panentheistic philosophies....,** Spirit **,"** [[ Q256616 , Spirit , 2007 album by Leona L...",True,712
9998,Q1845,** as Religious Science or Science of Mind. U...,** Bible **,"** [[ Q1845 , Bible , collection of sacred boo...",True,620


In [29]:
disamb_input_text = list(disamb_test_data['question'] + ',' + disamb_test_data['entity'] + ',' + disamb_test_data['wikidata_reply'])
disamb_input_text[0]

'** The University of Nanking (金陵大学) was a private university in Nanjing, China  **,**  University  **,** [[ Q13371 , Harvard University , private university in Cambridge, Massachusetts ], [ Q3918 , university , academic institution for further education ], [ Q21028957 , Hochschule , German term for "higher schooling", i.e. institutions of tertiary education (such as Universities), which may or may not include vocational or professional schools ], [ Q2463874 , University , Hillsborough County, Florida, USA ], [ Q7894478 , University , Orange County, Florida, USA ], [ Q34433 , University of Oxford , collegiate research university in Oxford, England ], [ Q3612349 , University , census-designated place in Mississippi, United States ]] **'

In [30]:
disamb_target_text = list(disamb_test_data['qid'])
disamb_target_text[0]

'Q3918'

In [31]:
counter_empty = 0
for s in disamb_test_data['entity']:
    if s == '****':
        counter_empty += 1
counter_empty

0

In [32]:
disamb_tokens = []
for text in disamb_input_text:
    disamb_tokens.append(tokenizer('nel: ' + text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids)

In [33]:
start = time.time()
counter = 0
disamb_predictions = []
for token in disamb_tokens:
    if counter%100 == 0:
        print(counter)
    counter += 1
    prediction = model.generate(token.to(device))
    disamb_predictions.append(prediction)
    
print(time.time() - start)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
1248.0933773517609


In [34]:
disamb_final_ouput_2 = []
for result in disamb_predictions:
    disamb_final_ouput_2.append(tokenizer.decode(result[0], skip_special_tokens=True))

In [35]:
disamb_counter = 0
for i in range(len(disamb_final_ouput_2)):
    if disamb_final_ouput_2[i] == disamb_target_text[i]:
        disamb_counter += 1
disamb_counter*100/len(disamb_final_ouput_2)

93.01

In [53]:
disamb_counter = 0
for i in range(len(disamb_final_ouput_2)):
    if disamb_final_ouput_2[i] == disamb_target_text[i]:
        disamb_counter += 1
disamb_counter*100/len(disamb_final_ouput_2)

92.02

In [54]:
disamb_counter = 0
for i in range(len(disamb_final_ouput_2)):
    if disamb_final_ouput_2[i] != disamb_target_text[i] and disamb_test_data['entity'][i] != '****':
        disamb_counter += 1
#         print(final_ouput_2[i] + '    ' + target_text[i])
1- disamb_counter/len(disamb_final_ouput_2)

0.9202

###########################

In [None]:
import re
entities = []
for wiki_reply in wikidata_data:
    entities.append(re.findall(r'\[+(Q.*?),', wiki_reply))
len(entities)

In [None]:
final_copy = final_ouput_2.copy()
for i in range(len(final_copy)):
    min_value = 10000
    value_to_add = ''
    for entity in entities[i]:
        if get_levenshtein_dis(entity, final_ouput_2[i]) < min_value:
            min_value =  get_levenshtein_dis(entity, final_ouput_2[i])
            value_to_add = entity
    final_copy[i] = value_to_add

In [None]:
counter = 0
for i in range(len(final_copy)):
    if final_copy[i] != target_text[i]:
        counter += 1
1- counter/len(final_copy)

In [None]:
final_copy_3 = final_ouput_2.copy()
for i in range(len(final_copy)):
    diff = 10000
    value_to_add = ''
    for entity in entities[i]:
        try:
            if abs(int(entity[1:]) -  int(final_ouput_2[i][1:])) < diff:
                diff =  abs(int(entity[1:]) -  int(final_ouput_2[i][1:]))
                value_to_add = entity
        except:
            pass
    final_copy_3[i] = value_to_add

In [None]:
counter = 0
for i in range(len(final_copy_3)):
    if final_copy_3[i] != target_text[i]:
        counter += 1
1- counter/len(final_copy_3)

In [31]:
final_copy_2 = final_ouput_2.copy()
for i in range(len(final_copy_2)):
    max_value = -10000
    value_to_add = ''
    for entity in entities[i]:
        if get_levenshtein_dis(entity, final_ouput_2[i]) > max_value:
            max_value =  get_fuzzy_ration(entity, final_ouput_2[i])
            value_to_add = entity
    final_copy_2[i] = value_to_add
    print(max_value)

100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
36
100
100
100
100
100
35
100
24
100
100
100
100
57
100
100
100
100
100
50
100
100
100
36
100
100
100
100
100
100
100
100
100
36
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
57
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
53
100
100
100
100
100
100
100
100
100
100
43
100
100
100
40
100
100
100
100
100
50
50
100
100
100
46
100
100
100
100
100
100
100
100
100
100
29
100
100
100
100
100
47
100
100
40
100
100
100
100
62
100
100
36
100
50
100
50
25
100
100
59
33
100
100
100
100
100
100
100
100
100
100
40
100
100
25
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
40
100
100
100
100
100
100
100
100
100
71
100
35
100
100
100
100
27
100
100
88
100
100
100
100
100
100
100
100
100
100
100
100
100
100

100
100
100
100
100
100
100
100
100
57
100
46
100
100
27
100
100
100
100
100
100
100
40
100
88
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
40
100
29
47
100
38
100
62
100
100
100
100
93
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
35
100
100
100
100
100
100
100
100
88
100
100
100
100
100
100
100
100
100
43
100
100
100
40
100
100
29
100
100
100
100
100
100
100
100
100
47
100
100
100
100
100
100
100
27
43
100
100
100
46
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
31
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
50
100
100
100
100
100
100
100
100
31
100
100
100
100
100
100
100
100
100
100
100
100
100
100
25
100
100
43
40
100
88
47
100
100
100
100
100
100
100
100
100
100
100
100
100
100
27
100
27
100
100
29
35
100
100
100
100
46
100
22
24
57
25
100
100
100
59
100
100
100
59
100
100
100
47
100
100
100
38
100
100
100
100
100
100
27
53
40
100
33
100
47
35
100
100
100
100
100
100
100
100
100
100

67
100
100
100
100
35
100
100
100
100
100
100
18
100
43
100
100
62
100
47
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
62
100
33
100
100
100
100
100
100
100
100
100
100
100
50
100
100
100
100
100
100
100
100
100
100
100
100
100
62
100
100
43
100
100
40
100
100
100
100
100
100
100
100
100
100
100
100
35
100
100
100
100
100
47
40
100
100
100
100
25
57
100
100
100
100
38
100
100
100
42
100
86
88
100
100
100
100
100
100
100
31
100
100
47
100
100
100
43
100
100
100
57
100
100
100
100
100
100
100
100
100
100
100
100
100
100
88
25
100
100
100
100
100
50
100
100
100
100
100
100
100
38
62
100
100
100
100
38
40
100
100
100
100
100
27
100
100
100
100
100
100
100
100
100
100
100
100
100
100
25
100
38
100
100
40
47
100
100
100
100
100
100
100
100
43
100
100
47
100
100
25
100
100
100
100
27
100
100
100
100
100
100
88
100
100
100
100
100
100
100
25
100
100
100
100
100
40
100
100
100
100
100
100
100
100
100
100
100
100
100
53
100
100
100
100
100
53
100
100
100
100
100
33

33
100
100
100
100
35
100
100
44
100
100
100
100
100
100
56
100
100
100
31
100
100
40
100
100
100
100
82
100
100
100
100
100
100
100
31
100
100
100
100
100
100
100
38
100
100
100
100
100
44
100
100
40
100
40
100
100
100
100
100
100
100
100
100
100
100
44
100
100
100
100
100
25
62
100
100
100
100
100
100
100
100
100
100
100
100
100
40
100
38
100
100
100
100
100
100
59
40
100
100
100
59
100
100
100
100
100
100
100
29
13
100
33
100
100
100
100
100
100
100
100
100
38
100
100
100
100
100
100
100
100
31
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
24
100
35
100
100
100
40
100
100
100
100
100
100
100
100
100
100
47
100
40
100
40
100
100
100
100
100
27
100
100
100
100
15
100
35
38
100
100
33
29
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
38
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
43
40
33
27
100
100
40
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
15
53
100
100
100
100
100
100
100
100
100
100
100
1

100
100
100
100
100
100
100
100
38
100
100
100
100
13
100
100
100
100
100
100
44
100
100
100
62
100
50
100
100
35
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
18
100
100
43
100
100
100
100
38
100
100
100
57
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
40
100
53
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
35
100
100
100
100
25
100
53
100
100
100
100
100
100
100
88
100
100
100
100
100
100
38
100
100
44
100
100
100
100
40
100
100
100
100
94
100
100
100
100
100
100
100
100
100
100
53
50
100
27
100
100
100
100
100
100
100
27
100
100
100
100
100
100
100
100
100
100
35
100
33
100
100
100
100
100
100
100
100
100
62
100
47
24
100
100
100
100
100
100
33
100
100
100
100
38
100
100
100
100
100
100
100
100
100
100
100
46
50
100
100
100
100
100
35
100
100
100
100
100
100
100
93
100
100
100
100
100
100
100
100
100
100
100
24
100
100
100
100
38
100
100
100
100
100
1

In [32]:
counter = 0
for i in range(len(final_copy_2)):
    if final_copy_2[i] != target_text[i]:
        counter += 1
1- counter/len(final_copy_2)

0.8339390268524127

## testing with the unshuffled data

In [27]:
test_data = pd.read_csv('./2-NEL_Data/2-csv_format_2/test_data.csv')
test_data = test_data.sample(frac=1, random_state=1)
test_data

Unnamed: 0,qid,question,entity,wikidata_reply
8687,Q183862,**what album has metalcore music?**,**metalcore**,"**[[Q183862, metalcore, fusion genre of heavy ..."
7972,Q1641839,**Name an experimental rock album.**,**experimental rock**,"**[[Q1641839, experimental rock, type of music..."
1628,Q17285413,**Where is joy sengupta from?**,**joy sengupta**,"**[[Q17285413, Joy Sengupta, Indian actor and ..."
8699,Q513674,**What is the sex of matthew breeze?**,**matthew breeze**,"**[[Q513674, Matthew Breeze, Australian soccer..."
5648,Q7333580,**what country is tuxbury pond in**,**tuxbury pond**,"**[[Q7333580, Tuxbury Pond, lake in Rockingham..."
...,...,...,...,...
2895,Q200092,**What is the name of a horror movie on netflix**,**horror movie**,"**[[Q200092, horror film, film genre], [Q59051..."
7813,Q7038198,**what kind of film is ninaithen vandhai?**,**ninaithen vandhai**,"**[[Q7038198, Ninaithen Vandhai, 1998 film by ..."
905,Q534599,**Where did damon knight die?**,**damon knight**,"**[[Q534599, Damon Knight, American science fi..."
5192,Q21077,**what artist is signed to warner music group?**,**warner music group**,"**[[Q21077, Warner Music Group, American multi..."


In [28]:
input_text = list(test_data['question'] + ',' + test_data['entity'] + ',' + test_data['wikidata_reply'])
input_text[0]

'**what album has metalcore music?**,**metalcore**,**[[Q183862, metalcore, fusion genre of heavy metal and hardcore punk], [Q108940567, Metalcore Superstars, album by One Morning Left], [Q4490718, melodic metalcore, subgenre of metalcore], [Q30587784, progressive metalcore, subgenre of metalcore], [Q3501147, gabber metal, fusion of gabber and metal], [Q1965804, Metalcore-bändide loend, Wikimedia list article]]**'

In [29]:
target_text = list(test_data['qid'])
target_text[0]

'Q183862'

In [30]:
X_test_tokenized = tokenizer(['nel: ' + sequence for sequence in input_text], 
                              padding=True, 
                              truncation=True, 
                              max_length=max_source_length)

y_test_tokenized = tokenizer(target_text, 
                              padding=True, 
                              truncation=True, 
                              max_length=max_target_length)

print(len(test_data))

9906


In [31]:
from transformers import Seq2SeqTrainingArguments

test_args = Seq2SeqTrainingArguments(
    "test_trainer",
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    eval_accumulation_steps = 50,  # VIP
    predict_with_generate=True
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [32]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model, 
    args=test_args
#     train_dataset= train_dataset, 
#     eval_dataset= eval_dataset
)

In [33]:
test_dataset = Dataset(X_test_tokenized, y_test_tokenized) 

In [34]:
tokens = []
for text in input_text:
    tokens.append(tokenizer('nel: ' + text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids)

In [35]:
results = []
model = model.to(device)

for token in tokens:
    results.append(model.generate(token.to(device))) 

In [36]:
final_ouput_2 = []
for result in results:
    final_ouput_2.append(tokenizer.decode(result[0], skip_special_tokens=True))

In [37]:
counter = 0
for i in range(len(final_ouput_2)):
    if final_ouput_2[i] != target_text[i]:
        counter += 1
1- counter/len(final_ouput_2)

0.8921865536038764

In [38]:
wikidata_data = list(test_data['wikidata_reply'])

In [39]:
import re
entities = []
for wiki_reply in wikidata_data:
    entities.append(re.findall(r'\[+(Q.*?),', wiki_reply))
len(entities)

9906

In [40]:
final_copy = final_ouput_2.copy()
for i in range(len(final_copy)):
    min_value = 10000
    value_to_add = ''
    for entity in entities[i]:
        if get_levenshtein_dis(entity, final_ouput_2[i]) < min_value:
            min_value =  get_levenshtein_dis(entity, final_ouput_2[i])
            value_to_add = entity
    final_copy[i] = value_to_add

In [41]:
counter = 0
for i in range(len(final_copy)):
    if final_copy[i] != target_text[i]:
        counter += 1
1- counter/len(final_copy)

0.8924894003634161