 # Création de la dataframe pour les actes de dialogues dans Snorkel
 

In [1]:
import sys
sys.path.insert(1,"scripts/")
from alignments import *
from tony import *
from gold import *
from silences import *
from speaker import *
from tony import *
from gold import *
from punctuation import *
from pitchenergy import *
from punctuation_samir import *

## Contents
### 1. Needed Files
### 2. List of functions
### 3. Data creation for one meeting 
### 4. Dataframe creation for all meetings 

-----------

## 1. Needed Files
<br>

#### 1. Alignments files
<br>
For the real use, we will only need the output of the ASR system, which must give the words said and their position in time. For the study, we had to use the output of the software Jtrans, which takes the audio and the .trs file to create the alignments.
<br>


#### 2. Speaker changes detection files
<br>
Output files from "pyannote_SCD_OVL.ipynb" which takes a wav file as input and gives a file with the limits of speaker changes under the following format hh:mm:ss. We must convert the segments in seconds.
<br>

#### 3. Segments of silences files

Output files from Py WebRTCVad, which look like this: <br>
<br>
*File : Linagora_A1_0_05_27--end.wav<br>
0.09 0.30 0.18 NS<br>
0.48 0.63 0.12 NS<br>
...*<br>
<br>
The columns are :
Beginning/End/Duration-3s/NS
<br>
#### 4. Output file of ToNy
<br>
Output files coming from ToNy. To use ToNy we need files that can be created with the function data_prep_tony. <br>
The output file of ToNy looks like this:<br>
<br>
1	eh	_	_	_	_	_	_	_	BeginSeg=Yes<br>
2	ben	_	_	_	_	_	_	_	_<br>
3	du	_	_	_	_	_	_	_	_<br>
4	coup	_	_	_	_	_	_	_	_<br>
5	ouais	_	_	_	_	_	_	_	_<br>


#### 5. Gold (for Dialogue Acts) files
<br>
For the meetings with gold labels, we need text files with separator | for each dialogue act.
<br>

##  2. List of functions
<br>

### Alignments


**alignments_word_extraction(filename_alignments)**
> return **word, beg_word, end_word** 
> respectively the list of words, the start time of each word in seconds, and the end time of each word in seconds 
<br>

**alignments_real_turn_extraction(filename_alignments)**
> return n_turn, beg_turn, end_turn, rank_turn, text_turn, loc


<br>


### Speaker change detection

<br>

**read_change_spk(filename_change_spk_detection)**
> return **beg_chg_spk**, **end_chg_spk**


**turn_extraction(word, beg_word, end_word, beg_chg_spk ,end_chg_spk)**
> return **n_turn**, **beg_turn**, **end_turn**, **rank_turn**, **text_turn**

<br>

### Silences

<br>

**extract_silences_positions(filename_silences)**
> return **beg_sil**, **end_sil**, **dur_sil**, **mid_sil**


**silences_word_position(word, beg_word, end_word, beg_sil, end_sil, dur_sil, mid_sil)**
> return **sil_bef**, **sil_aft**

<br>

### ToNy 

<br>

**data_prep_tony(name, word, rank_turn, text_turn)**
> create file for ToNy


**convert_tony_results(filename_tony_result, word)**
> return **tony**

<br>


### Gold

<br>

**convert_gold(filename_gold)**
> return **gold**


<br>

### Punctuation

<br>

**convert_punctuation(filename_punct, word)**
> return **prob_nothing**, **prob_period**, **prob_comma**, **prob_nothing_bef**, **prob_period_bef**, **prob_comma_bef**


<br>

**punctuation_samir_extraction(file_punctuation, word)**
> return **punct**, **punct_bef**



## 3. Data creation for one meeting 

In [2]:
# Location of the input files for the single meeting processed in this section.
# Every path is derived from name_meeting (same directory layout as the
# all-meetings loop in section 4), so switching to another meeting only
# requires changing this one variable.

name_meeting="Linagora_R1"
file_alignments="data/alignments/"+name_meeting+"_align.txt"
file_spk_change="data/speakers/"+name_meeting+"_spk.txt"
file_silences="data/silences/"+name_meeting+"_sil.txt"
file_tony_result="data/tony/"+name_meeting+"_tony.txt.split.tok"
# Set file_gold to "" for a meeting that has no gold annotation.
file_gold="data/gold/"+name_meeting+"_gold.txt"
file_punct="data/punctuation/"+name_meeting+"_punct.txt"
file_punct_samir="data/punctuation_samir/"+name_meeting+"_samir.txt"
file_pitchenergy="data/audio/"+name_meeting+"_pitchenergy.txt"

# Placeholder value; not referenced by any cell visible in this notebook.
work_directory="path"


##### Alignements

In [3]:
# Word-level alignment: the words and their start/end times, read from the
# alignment file.
word, beg_word, end_word = alignments_word_extraction(file_alignments)
# Sanity check: the three lists are expected to have the same length.
print(*(len(seq) for seq in (word, beg_word, end_word)))

7943 7943 7943


In [4]:
# Reference ("real") turn information read from the same alignment file.
# real_rank_turn is compared against the detected rank_turn further down,
# and real_beg_turn / real_end_turn / real_loc are exported to real_turns.txt.
real_n_turn, real_beg_turn, real_end_turn, real_rank_turn, real_text_turn, real_loc=alignments_real_turn_extraction(file_alignments)

7943 7943 7943 7943 7943


##### Silences

In [5]:

# Read the silence segments, then map them onto the words to get the
# per-word sil_bef / sil_aft features.
beg_sil, end_sil, dur_sil, mid_sil = extract_silences_positions(file_silences)
sil_bef, sil_aft = silences_word_position(
    word, beg_word, end_word,
    beg_sil, end_sil, dur_sil, mid_sil,
)

# Sanity check: both lists are expected to have one entry per word.
print(*map(len, (sil_bef, sil_aft)))

7943 7943


##### Speaker change detection

In [6]:
# Speaker-change boundaries produced by the speaker change detection step.
beg_chg_spk, end_chg_spk = read_change_spk(file_spk_change)
print(*map(len, (beg_chg_spk, end_chg_spk)))

# Group the words into detected turns using those boundaries.
n_turn, beg_turn, end_turn, rank_turn, text_turn = turn_extraction(
    word, beg_word, end_word, beg_chg_spk, end_chg_spk
)

# Sanity check: the turn lists are printed next to the boundary counts.
print(*map(len, (n_turn, beg_turn, end_turn, rank_turn, text_turn,
                 beg_chg_spk, end_chg_spk)))

119 119
7943 7943 7943 7943 7943 119 119


##### ToNy Begin Of Segment

In [7]:

# Create the input file for ToNy from the detected turns, then load ToNy's
# segmentation output.
# NOTE(review): file_tony_result is presumably produced by running ToNy
# separately on the prepared file before the second call - confirm.
data_prep_tony(name_meeting, word, rank_turn, text_turn)
tony=convert_tony_results(file_tony_result, word)

print(len(tony))

7943


##### Gold BOS

In [8]:
# Load the gold labels when a gold file is configured. Otherwise fall back to
# all-zero labels (the same convention as the all-meetings loop in section 4)
# so that `gold` is always defined when the dataframe is built later.
if file_gold:
    gold=convert_gold(file_gold)
else:
    gold=[0]*len(word)
print(len(gold))

7943


##### Ponctuation

In [9]:
# Per-word punctuation probabilities (nothing / period / comma) and their
# `_bef` counterparts.
# NOTE(review): `_bef` presumably refers to the previous word - confirm.
prob_nothing, prob_period, prob_comma, prob_nothing_bef, prob_period_bef, prob_comma_bef = convert_punctuation(file_punct, word)
print(len(prob_nothing))


# Punctuation from the "samir" files.
# The function is documented as returning (punct, punct_bef) and is unpacked
# in that order in the all-meetings loop; the previous order here was swapped.
punct, punct_bef=punctuation_samir_extraction(file_punct_samir, word)

exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
7943
7943 7943


##### Audio Features

In [10]:
# Per-word pitch and energy features read from the pitch/energy file.
# NOTE(review): the `_bef` variants presumably describe the previous word - confirm.
pitch, pitch_bef, energy, energy_bef = audio_features_extraction(file_pitchenergy, word)

exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
7943 7943 7943 7943


In [12]:


# Evaluate the detected turn starts against the reference turn starts.
# A word with rank 0 is treated as the start of a turn; the reference
# (real_rank_turn) defines the positive class.
#   vp: true positives   (detected start, reference start)
#   fn: false negatives  (no detected start, reference start)
#   fp: false positives  (detected start, no reference start)
#   vn: true negatives   (neither)
vp = fn = fp = vn = 0

for detected_rank, reference_rank in zip(rank_turn, real_rank_turn):
    detected_start = detected_rank == 0
    reference_start = reference_rank == 0
    if detected_start and reference_start:
        vp += 1
    elif reference_start:
        fn += 1
    elif detected_start:
        fp += 1
    else:
        vn += 1

print(vp, fn, fp, vn)

# Guard every ratio: with no detected starts, no reference starts, or no true
# positive at all, the plain divisions would raise ZeroDivisionError.
p = vp / (vp + fp) if (vp + fp) else 0.0
r = vp / (vp + fn) if (vp + fn) else 0.0
fscore = 2 * p * r / (p + r) if (p + r) else 0.0

print("precision=", p, "rappel=", r, "fscore=", fscore)

42 137 67 7697
precision= 0.3853211009174312 rappel= 0.2346368715083799 fscore= 0.2916666666666667


In [19]:
import pandas as pd

# One row per word; every column comes from the per-word lists computed in the
# cells above. Column order is kept as is.
df_dev = pd.DataFrame(dict(
    word=word,
    beg_word=beg_word,
    end_word=end_word,
    n_turn=n_turn,
    beg_turn=beg_turn,
    end_turn=end_turn,
    rank_turn=rank_turn,
    text_turn=text_turn,
    sil_bef=sil_bef,
    sil_aft=sil_aft,
    tony=tony,
    gold=gold,
    prob_nothing=prob_nothing,
    prob_period=prob_period,
    prob_comma=prob_comma,
    prob_nothing_bef=prob_nothing_bef,
    prob_period_bef=prob_period_bef,
    prob_comma_bef=prob_comma_bef,
    real_rank_turn=real_rank_turn,
    pitch=pitch,
    energy=energy,
    pitch_bef=pitch_bef,
    energy_bef=energy_bef,
))
    

In [27]:
# Write a readable transcript of the meeting to compare the detected speaker
# changes (|D_SPK| markers) with the reference turns ("---Real turn---"
# separators). Punctuation is added from the predicted probabilities, using a
# 0.5 threshold.
liste_texte=[]
n_words=len(df_dev)

for i, x in enumerate(df_dev.itertuples()):
    # str() up front: a non-string cell (e.g. NaN) would otherwise break the
    # "+" concatenations and the final join.
    mot=str(x.word)
    starts_real_turn = x.real_rank_turn==0
    prob_period_bef=float(x.prob_period_bef)

    if starts_real_turn:
        # Close the previous turn with a period unless a period or a comma
        # was already predicted there (never before the very first word).
        if i>0 and prob_period_bef<0.5 and float(x.prob_comma_bef)<0.5:
            liste_texte.append(".")
        liste_texte.append("\n\n---Real turn---\n\n")

    if x.rank_turn==0:
        liste_texte.append("|D_SPK|")

    # A |D_BOS| marker driven by a `label` list was sketched here but is disabled.

    # Capitalize after a predicted period and at the start of a reference turn.
    if prob_period_bef>0.5 or starts_real_turn:
        mot=mot.capitalize()

    if float(x.prob_period)>0.5:
        mot=mot+"."

    if float(x.prob_comma)>0.5:
        mot=mot+","

    liste_texte.append(mot)

    # Always end the transcript with a period.
    if i==n_words-1:
        liste_texte.append(".")

# The file is opened only once the text is complete, and the context manager
# closes it even if the write fails.
with open("spk_change_comparison_R1.txt", "w") as file:
    file.write(" ".join(liste_texte))
# Completion marker, kept so the recorded cell output stays valid.
print("bonjou")

bonjou


In [15]:
# Export one line per reference turn: "<start> <end> <real_loc>".
# Only the first word of each turn (rank 0) produces a line.
# NOTE(review): `turn(value, 3)` is not defined in this notebook, so it must
# come from one of the star imports. The two-argument call looks like it was
# meant to be `round(value, 3)` - confirm before relying on this cell.
with open("real_turns.txt", "w") as file:
    for rank, beg, end, loc in zip(real_rank_turn, real_beg_turn, real_end_turn, real_loc):
        if rank==0:
            file.write(str(turn(beg,3))+" "+str(turn(end,3))+" "+loc+"\n")
        
    

## 4. Dataframe creation for all meetings 

In [2]:
import pandas as pd

# Build one per-word dataframe for each meeting, then concatenate them into
# df_all and save the result as CSV.
liste_dataframes=[]
name_meeting_list =['Linagora_P1', 'Linagora_C1', 'Linagora_R1', 'Linagora_A1', 'Linagora_P6', 'Linagora_C3', 'Linagora_P5', 'Linagora_R4', 'Linagora_R3', 'Linagora_C2', 'Linagora_P4']
# Meetings that have a gold file; the others get all-zero gold labels.
# Defined once here instead of being rebuilt on every iteration.
gold_files=['Linagora_P1', 'Linagora_C1', 'Linagora_R1', 'Linagora_A1']

for meeting in name_meeting_list:
    # input files for this meeting
    print(meeting)
    file_alignments="data/alignments/"+meeting+"_align.txt"
    file_spk_change="data/speakers/"+meeting+"_spk.txt"
    file_silences="data/silences/"+meeting+"_sil.txt"
    file_tony_result="data/tony/"+meeting+"_tony.txt.split.tok"
    file_punct="data/punctuation/"+meeting+"_punct.txt"
    file_pitchenergy="data/audio/"+meeting+"_pitchenergy.txt"
    file_punct_samir="data/punctuation_samir/"+meeting+"_samir.txt"

    # feature extraction; the call order is kept unchanged so that whatever
    # the helpers print appears in the same order as before
    word,beg_word,end_word = alignments_word_extraction(file_alignments)

    real_n_turn, real_beg_turn, real_end_turn, real_rank_turn, real_text_turn, real_loc=alignments_real_turn_extraction(file_alignments)

    beg_sil,end_sil,dur_sil,mid_sil=extract_silences_positions(file_silences)
    sil_bef, sil_aft=silences_word_position(word, beg_word, end_word, beg_sil, end_sil, dur_sil, mid_sil)
    beg_chg_spk, end_chg_spk = read_change_spk(file_spk_change)
    n_turn, beg_turn, end_turn, rank_turn, text_turn = turn_extraction(word, beg_word, end_word, beg_chg_spk, end_chg_spk)
    tony=convert_tony_results(file_tony_result, word)
    prob_nothing, prob_period, prob_comma, prob_nothing_bef, prob_period_bef, prob_comma_bef = convert_punctuation(file_punct, word)
    pitch, pitch_bef, energy, energy_bef = audio_features_extraction(file_pitchenergy, word)
    punct, punct_bef= punctuation_samir_extraction(file_punct_samir, word)

    if meeting in gold_files:
        file_gold="data/gold/"+meeting+"_gold.txt"
        gold=convert_gold(file_gold)
    else:
        gold=[0]*len(word)

    df = pd.DataFrame({'word': word, 'beg_word': beg_word, 'end_word': end_word,'n_turn': n_turn,
                       'beg_turn': beg_turn, 'end_turn': end_turn, 'rank_turn' : rank_turn,
                       'text_turn' : text_turn, 'sil_bef' : sil_bef, 'sil_aft' : sil_aft, 'tony' : tony,
                       'gold': gold, 'prob_nothing' : prob_nothing, 'prob_period': prob_period,
                       'prob_comma' : prob_comma, 'prob_nothing_bef' : prob_nothing_bef,
                       'prob_period_bef': prob_period_bef, 'prob_comma_bef' : prob_comma_bef,
                       'real_rank_turn': real_rank_turn, 'pitch': pitch, 'energy': energy,
                       'pitch_bef': pitch_bef, 'energy_bef': energy_bef, "punct": punct, "punct_bef":punct_bef})

    # remember which meeting each row comes from
    df["file"]=meeting
    liste_dataframes.append(df)

# NOTE(review): each meeting keeps its own 0..n-1 index, so index values
# repeat in df_all and in the CSV's first column. Pass ignore_index=True to
# pd.concat if a unique row index is wanted - left unchanged here so the
# saved file keeps its current layout.
df_all=pd.concat(liste_dataframes)

df_all.to_csv("df_all_10022021.csv")
     
    

Linagora_P1
7235 7235 7235 7235 7235
exception
exception
exception_a
exception_a
7235 7235 7235 7235
7235 7235
Linagora_C1
1598 1598 1598 1598 1598
1598 1598 1598 1598
1598 1598
Linagora_R1
7943 7943 7943 7943 7943
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
7943 7943 7943 7943
7943 7943
Linagora_A1
1809 1809 1809 1809 1809
1809 1809 1809 1809
ERROR: 1795 à 1804 bon
ERROR2
1809 1809
Linagora_P6
9441 9441 9441 9441 9441
exception
exception
exception
exception
exception
exception
exception_a
exception_a
exception_a
exception_a
exception_a
exception_a
9441 9441 9441 9441
9441 9441
Linagora_C3
1732 1732 1732 1732 1732
1732 1732 1732 1732
1732 1732
Linagora_P5
5720 5720 5720 5720 5720
exception
exception
exception
exception
excepti

In [3]:
# Preview the first rows of the combined dataframe.
df_all.head()
#len(df_all)

Unnamed: 0,word,beg_word,end_word,n_round,beg_round,end_round,rank_round,text_round,sil_bef,sil_aft,...,prob_point_bef,prob_comma_bef,real_rank_round,pitch,energy,pitch_bef,energy_bef,punct,punct_bef,file
0,ok,2.0,2.37,0,0.0,10.123,0,ok donc juste comme on enregistre euh donc là ...,1.26,0.0,...,0.0,0.0,0,0,0,0,0,0,0,Linagora_P1
1,donc,2.73,3.01,0,0.0,10.123,1,ok donc juste comme on enregistre euh donc là ...,0.0,0.24,...,0.0,0.0,1,0,0,0,0,0,0,Linagora_P1
2,juste,3.38,3.67,0,0.0,10.123,2,ok donc juste comme on enregistre euh donc là ...,0.24,0.0,...,0.0,0.0,2,U,D,0,0,0,0,Linagora_P1
3,comme,3.92,4.22,0,0.0,10.123,3,ok donc juste comme on enregistre euh donc là ...,0.0,0.0,...,0.018153,0.286862,3,D,D,U,D,0,0,Linagora_P1
4,on,4.23,4.67,0,0.0,10.123,4,ok donc juste comme on enregistre euh donc là ...,0.0,0.12,...,0.000417,0.001607,4,D,U,D,D,0,0,Linagora_P1


In [5]:
#df_all.to_csv("df_all.csv")

In [6]:
# Create the text files for samir: one file per meeting, containing all the
# words of the meeting separated by single spaces.
# NOTE(review): these are presumably the input of the punctuation_samir step;
# they are written to "samir/", which must already exist.
name_meeting_list =['Linagora_P1', 'Linagora_C1', 'Linagora_R1', 'Linagora_A1', 'Linagora_P6', 'Linagora_C3', 'Linagora_P5', 'Linagora_R4', 'Linagora_R3', 'Linagora_C2', 'Linagora_P4']

# liste_dataframes was built in the same order as name_meeting_list.
for meeting, df in zip(name_meeting_list, liste_dataframes):
    liste_mot=[x.word for x in df.itertuples()]
    # The context manager closes the file even if the write fails.
    with open("samir/"+meeting+"_samir.txt", "w") as new_file:
        new_file.write(" ".join(liste_mot))
        