### Suppression des didascalies

In [1]:
import os
import json
import spacy
from forced_alignment import ForcedAlignment

### JSON file

In [3]:
# Path of the Prodigy export: one JSON document per line (JSONL).
json_file = "/vol/work1/bergoend/didascalies.jsonl"

# The handle gets its own name so that `json_file` keeps pointing at the path
# (it used to be rebound to the, by then closed, file object).
# The encoding is explicit so decoding does not depend on the machine locale.
with open(json_file, 'r', encoding='utf-8') as jsonl_handle:
    # Blank lines are not annotations and json.loads() would reject them later.
    json_list = [line for line in jsonl_handle if line.strip()]

print("Nombre de phrases annotées :", len(json_list))

Nombre de phrases annotées : 1510


In [4]:
# Loader instance; the cells below call it as forced_alignment(path_to_aligned_file)
# and read the sentences of the result through its `.sents` attribute.
forced_alignment = ForcedAlignment()

### Création d'un dictionnaire de corrections

- Chargement de l'alignement forcé correspondant à l'épisode en cours
- Recherche de la phrase annotée à l'aide du contexte gauche
- Recherche des mots à supprimer (chaque mot à supprimer a un identifiant qui est sa position dans la phrase affichée dans Prodigy)
- Stockage : {(épisode, index de la phrase annotée dans l'alignement forcé) : (phrase affichée dans Prodigy, correction à apporter)}

In [7]:
# Build the corrections to apply to the new transcript:
#   {(episode, index of the annotated sentence in the forced alignment):
#        (annotated sentence, corrected sentence text)}
corrections = {}
# Number of DELETE spans met in matched annotations, applied or not.
n_corrections = 0
# Sentences of each forced alignment, loaded once per episode rather than
# once per annotation (many annotations share the same episode).
sentences_by_episode = {}

for line in json_list:

    # read json
    record = json.loads(line)

    # only rejected annotations are processed here
    if record['answer'] != 'reject':
        continue

    try:
        # displayed tokens in Prodigy (left and right context + sentence)
        tokens = record['tokens']
        # words to delete
        delete = record['spans']
        meta = record['meta']
        # processed sentence in Prodigy
        initial_sent = record['sentence']
        # context
        left = record['left']
        # not used below, but records lacking it are skipped, as before
        right = record['right']
    except KeyError:
        # incomplete record: nothing can be matched, skip it
        continue

    # token list of tokens displayed in Prodigy
    token_list = [token['text'] for token in tokens]
    print("\nEpisode :", meta['episode'])
    print("Tokens affichés dans Prodigy :", token_list)

    # load forced alignment (only the first time the episode is met)
    episode_name = meta['episode']
    if episode_name not in sentences_by_episode:
        series = episode_name.split('.')[0]
        aligned = f"/vol/work1/bergoend/pyannote-db-plumcot/Plumcot/data/{series}/forced-alignment/{episode_name}.aligned"
        transcript = forced_alignment(aligned)
        sentences_by_episode[episode_name] = list(transcript.sents)
    sentences = sentences_by_episode[episode_name]

    # Walk over consecutive (left context, candidate) pairs. Pairing with zip
    # stops before the last sentence, so a left context found at the very end
    # of the episode no longer raises IndexError.
    for index, (sentence, center) in enumerate(zip(sentences, sentences[1:]), start=1):

        # find left context, then the annotated sentence right after it
        if str(sentence) != left or str(center) != initial_sent:
            continue

        print("Phrase sélectionnable dans Prodigy :", str(center))

        # Deletions are accumulated on new_sent so that several DELETE spans
        # in one sentence are all applied (previously each span restarted from
        # the original sentence and only the last deletion was kept).
        new_sent = initial_sent
        for dic in delete:
            if dic['label'] != "DELETE":
                continue
            n_corrections += 1
            # find the words group to delete in initial sentence
            # (token_end is treated as inclusive, hence the +1)
            to_delete = " ".join(token_list[dic['token_start']:dic['token_end'] + 1])
            print("Tokens à supprimer :", to_delete)

            # delete didascalie
            if to_delete in new_sent:
                new_sent = new_sent.replace(to_delete, ' ')
                # A sentence emptied by the deletions is stored as a single
                # space, the marker the writing cell checks to drop it.
                if not new_sent.strip():
                    new_sent = ' '
                print("Nouvelle phrase après correction :", new_sent)
                corrections[(meta['episode'], index)] = (center, new_sent)




Episode : 24.Season01.Episode02
Tokens affichés dans Prodigy : ['Get', 'back', '!', 'RICK', ':', 'Do', "n't", 'be', 'a', 'moron', '!']
Phrase sélectionnable dans Prodigy : RICK :
Tokens à supprimer : RICK :
Nouvelle phrase après correction :  

Episode : 24.Season01.Episode02
Tokens affichés dans Prodigy : ['et', "'s", 'get', 'out', 'of', 'here', '.', 'JACK', ':', 'Take', 'the', 'wall', '.']
Phrase sélectionnable dans Prodigy : JACK
Tokens à supprimer : JACK
Nouvelle phrase après correction :  

Episode : 24.Season01.Episode02
Tokens affichés dans Prodigy : ['Hello', '?', 'KIM', ':', 'Mom', '?']
Phrase sélectionnable dans Prodigy : KIM :
Tokens à supprimer : KIM :
Nouvelle phrase après correction :  

Episode : 24.Season01.Episode02
Tokens affichés dans Prodigy : ['She', 'went', 'for', 'a', 'walk', '.', 'KIM', ':', 'We', "'re", 'okay']
Phrase sélectionnable dans Prodigy : KIM :
Tokens à supprimer : KIM :
Nouvelle phrase après correction :  

Episode : 24.Season01.Episode02
Tokens affi

In [8]:
# Summary of the collection step: the corrections themselves, then how many
# DELETE spans were met versus how many sentences ended up with a correction.
print(corrections)
for label, count in (
    ("Nombre d'annotations contenant des suppressions :", n_corrections),
    ("Nombre de corrections prises en compte :", len(corrections)),
):
    print(label, count)

{('24.Season01.Episode02', 206): (RICK :, ' '), ('24.Season01.Episode02', 343): (JACK, ' '), ('24.Season01.Episode02', 534): (KIM :, ' '), ('24.Season01.Episode02', 553): (KIM :, ' '), ('24.Season01.Episode02', 35): (MAN :, ' '), ('24.Season01.Episode05', 763): ((bird flies off Just be cool ., '  Just be cool .'), ('24.Season01.Episode08', 513): (TERI :, ' '), ('24.Season01.Episode08', 604): (KIM :, ' '), ('24.Season01.Episode24', 547): (REPORTER :, ' '), ('BattlestarGalactica.Season01.Episode03', 621): (Roslin :, ' '), ('BattlestarGalactica.Season01.Episode10', 11): (snakes-, ' '), ('BattlestarGalactica.Season01.Episode10', 27): (Giggling, ' '), ('BattlestarGalactica.Season01.Episode10', 61): (Apollo :, ' '), ('BattlestarGalactica.Season01.Episode10', 323): (Crashdown :, ' '), ('BattlestarGalactica.Season01.Episode10', 329): (Gaeta :, ' '), ('BattlestarGalactica.Season01.Episode11', 2): (6 gasps.], ' '), ('BattlestarGalactica.Season01.Episode12', 369): (Adama :, ' '), ('BattlestarGala

### Episodes to process

In [11]:
# Keys of `corrections` are (episode, sentence index) pairs: keep each episode once.
episode_list = {episode_name for episode_name, _ in corrections}
print("Episodes contenant des éléments à supprimer :\n\n", episode_list)

Episodes contenant des éléments à supprimer :

 {'BattlestarGalactica.Season01.Episode11', '24.Season01.Episode24', '24.Season01.Episode08', 'BattlestarGalactica.Season01.Episode05', 'BattlestarGalactica.Season01.Episode03', 'BattlestarGalactica.Season01.Episode10', 'BattlestarGalactica.Season01.Episode12', '24.Season01.Episode05', '24.Season01.Episode02'}


### Écriture des nouveaux fichiers

In [12]:
# Root of the Plumcot data; also read by the verification cell further down.
path = "/vol/work1/bergoend/pyannote-db-plumcot/Plumcot/data/"

# Write one "correct_<episode>.txt" file per episode that has corrections:
# one line per sentence, "<speaker> <text>", with corrected text substituted.
for episode in episode_list:

    # load forced alignment
    aligned = path + f"{episode.split('.')[0]}/forced-alignment/{episode}.aligned"
    transcript = forced_alignment(aligned)
    sentences = list(transcript.sents)
    sentences_str = [str(sentence) for sentence in sentences]

    # Apply the corrections of the current episode by direct indexing instead
    # of scanning every sentence for every correction.
    for (corrected_episode, idx), (_, corrected_text) in corrections.items():
        # out-of-range indices are ignored, as the former nested scan did
        if corrected_episode == episode and 0 <= idx < len(sentences_str):
            # A correction that leaves only whitespace becomes the single-space
            # marker, so the check below drops it whatever its exact length.
            sentences_str[idx] = corrected_text if corrected_text.strip() else ' '

    # write new file (explicit encoding: do not depend on the machine locale)
    name = f"correct_{episode}.txt"
    with open(name, 'w', encoding='utf-8') as f:
        for sentence, str_s in zip(sentences, sentences_str):
            # do not write corrections if they are white spaces
            if str_s != ' ':
                f.write(sentence._.speaker + ' ' + str_s + '\n')
        
    
    

### Vérifier si on ne perd pas d'information

In [10]:
# Dump the uncorrected sentences of each processed episode to
# "verif_<episode>.txt" ("<speaker> <text>" per line) so they can be compared
# with the corrected files.
# NOTE(review): relies on `path` and `episode_list` defined by earlier cells.
for episode in episode_list:

    aligned = path + f"{episode.split('.')[0]}/forced-alignment/{episode}.aligned"

    transcript = forced_alignment(aligned)

    name = f"verif_{episode}.txt"
    # explicit encoding: do not depend on the machine locale
    with open(name, 'w', encoding='utf-8') as f:
        for sentence in transcript.sents:
            f.write(sentence._.speaker + ' ' + str(sentence) + '\n')
        
