In [1]:
# Import necessary libraries

import csv
import re
import pandas as pd

In [2]:
# Define a function to transform the transcription file (.vtt) to .csv

def vtt_to_csv(vtt_file, csv_file):
    with open(vtt_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    
    data = []
    current_timestamp = None
    current_text = []
    
    for line in lines:
        line = line.strip()
        
        # Check if the line contains a timestamp
        timestamp_match = re.match(r'(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})', line)
        
        if timestamp_match:
            # If there is current text being processed, append it to data
            if current_timestamp and current_text:
                data.append([current_timestamp[0], current_timestamp[1], ' '.join(current_text)])
            
            # Reset for the new timestamp and text block
            current_timestamp = (timestamp_match.group(1), timestamp_match.group(2))
            current_text = []
        
        elif line and not line.startswith('NOTE') and not line.startswith('WEBVTT'):
            # Add non-empty, non-comment lines to the text
            current_text.append(line)
    
    # Add the last block to the data
    if current_timestamp and current_text:
        data.append([current_timestamp[0], current_timestamp[1], ' '.join(current_text)])

    # Write data to a new .csv file
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # Write header
        writer.writerow(['Start Time', 'End Time', 'Text'])
        # Write data
        writer.writerows(data)
    
    print(f"Conversion complete! Saved as {csv_file}")

In [3]:
vtt_to_csv(r"/Users/jacksontran/Desktop/Focus Group/Meeting Transcription/AMA Focus Group 1.vtt",r"/Users/jacksontran/Desktop/Focus Group/AMA Focus Group 1.csv")

Conversion complete! Saved as /Users/jacksontran/Desktop/Focus Group/AMA Focus Group 1.csv


In [4]:
# Convert the .csv file to a DataFrame

df = pd.read_csv(r"/Users/jacksontran/Desktop/Focus Group/AMA Focus Group 1.csv")

In [5]:
df

Unnamed: 0,Start Time,End Time,Text
0,00:02:59.572,00:03:02.412,<v Hoai Thuc Nhi Le>I was thinking about today...
1,00:04:10.992,00:04:13.472,<v Hoai Thuc Nhi Le>How do I play movie?</v> 2...
2,00:04:15.452,00:04:16.852,<v Hoai Thuc Nhi Le>Sets on my phone.</v> 24d1...
3,00:04:55.852,00:04:56.412,<v Hoai Thuc Nhi Le>Can you hear me?</v> 24d1b...
4,00:04:58.052,00:04:58.212,<v Hoai Thuc Nhi Le>OK.</v> 24d1b622-c884-45c5...
...,...,...,...
272,00:26:46.412,00:26:49.495,"<v Jennifer Loza-Marin>But it's OK, I just wan..."
273,00:26:49.495,00:26:50.092,<v Jennifer Loza-Marin>is informed.</v> 24d1b6...
274,00:26:51.712,00:26:52.232,<v Morgan Shapiro>Thank you.</v> 24d1b622-c884...
275,00:26:51.912,00:26:52.312,<v Lara Zambrano Bastidas>Thank you.</v> 24d1b...


In [6]:
type(df)

pandas.core.frame.DataFrame

In [7]:
df['Text'][100:110]

100    <v Hoai Thuc Nhi Le>I agree. OK, how about Lar...
101    <v Lara Zambrano Bastidas>Hi, I'm Laura.</v> 2...
102    <v Lara Zambrano Bastidas>I'm a senior marketi...
103    <v Lara Zambrano Bastidas>is notion.</v> 24d1b...
104    <v Lara Zambrano Bastidas>That's where I take ...
105    <v Lara Zambrano Bastidas>schedule stuff, thin...
106    <v Hoai Thuc Nhi Le>OK.</v> 24d1b622-c884-45c5...
107    <v Hoai Thuc Nhi Le>Oh great.</v> 24d1b622-c88...
108    <v Hoai Thuc Nhi Le>So now that we.</v> 24d1b6...
109    <v Hoai Thuc Nhi Le>A bit more familiar with e...
Name: Text, dtype: object

In [8]:
# Remove unnessesary characters

df['Text'] = df['Text'].str.split('</v>').str[0] \
            .str.split('<v').str[1]

In [9]:
df['Name'] = df['Text'].str.split('>').str[0]
df['Name']

0             Hoai Thuc Nhi Le
1             Hoai Thuc Nhi Le
2             Hoai Thuc Nhi Le
3             Hoai Thuc Nhi Le
4             Hoai Thuc Nhi Le
                ...           
272        Jennifer Loza-Marin
273        Jennifer Loza-Marin
274             Morgan Shapiro
275     Lara Zambrano Bastidas
276           Hoai Thuc Nhi Le
Name: Name, Length: 277, dtype: object

In [10]:
df['Text'] = df['Text'].str.split('>').str[1]

In [11]:
df = df.reindex(columns = ['Start Time','End Time','Name','Text'])
df

Unnamed: 0,Start Time,End Time,Name,Text
0,00:02:59.572,00:03:02.412,Hoai Thuc Nhi Le,I was thinking about today.
1,00:04:10.992,00:04:13.472,Hoai Thuc Nhi Le,How do I play movie?
2,00:04:15.452,00:04:16.852,Hoai Thuc Nhi Le,Sets on my phone.
3,00:04:55.852,00:04:56.412,Hoai Thuc Nhi Le,Can you hear me?
4,00:04:58.052,00:04:58.212,Hoai Thuc Nhi Le,OK.
...,...,...,...,...
272,00:26:46.412,00:26:49.495,Jennifer Loza-Marin,"But it's OK, I just wanted to let you guys kno..."
273,00:26:49.495,00:26:50.092,Jennifer Loza-Marin,is informed.
274,00:26:51.712,00:26:52.232,Morgan Shapiro,Thank you.
275,00:26:51.912,00:26:52.312,Lara Zambrano Bastidas,Thank you.


In [12]:
df[260:]

Unnamed: 0,Start Time,End Time,Name,Text
260,00:26:10.292,00:26:13.973,Hoai Thuc Nhi Le,Visa Gift card later in this week through your...
261,00:26:13.973,00:26:17.357,Hoai Thuc Nhi Le,"have any additional start, follow up questions..."
262,00:26:17.357,00:26:17.892,Hoai Thuc Nhi Le,sessions.
263,00:26:18.332,00:26:22.292,Hoai Thuc Nhi Le,Feel free to reach out to us via e-mail.
264,00:26:23.892,00:26:24.772,Hoai Thuc Nhi Le,Me or Jennifer?
265,00:26:25.052,00:26:29.907,Hoai Thuc Nhi Le,"Jennifer already send you the e-mail, so feel ..."
266,00:26:29.907,00:26:34.372,Hoai Thuc Nhi Le,her and thanks again. I hope you have a great ...
267,00:26:34.822,00:26:36.942,Morgan Shapiro,Thankful. I hope you have a good day as well.
268,00:26:37.952,00:26:38.552,Lara Zambrano Bastidas,Thank you.
269,00:26:38.232,00:26:40.392,Jennifer Loza-Marin,I wanted to make a correction.


In [13]:
df2 = df[:10]

In [14]:
df2

Unnamed: 0,Start Time,End Time,Name,Text
0,00:02:59.572,00:03:02.412,Hoai Thuc Nhi Le,I was thinking about today.
1,00:04:10.992,00:04:13.472,Hoai Thuc Nhi Le,How do I play movie?
2,00:04:15.452,00:04:16.852,Hoai Thuc Nhi Le,Sets on my phone.
3,00:04:55.852,00:04:56.412,Hoai Thuc Nhi Le,Can you hear me?
4,00:04:58.052,00:04:58.212,Hoai Thuc Nhi Le,OK.
5,00:04:59.832,00:05:00.192,Morgan Shapiro,Hello.
6,00:05:01.342,00:05:02.822,Hoai Thuc Nhi Le,"Hi, can you hear me?"
7,00:05:04.202,00:05:04.642,Morgan Shapiro,Yes.
8,00:05:05.592,00:05:11.410,Hoai Thuc Nhi Le,"OK, my name is Helen and I will be the facilit..."
9,00:05:07.142,00:05:07.622,Morgan Shapiro,You just.


In [21]:
# Define a function to merge consecutive speaker rows to improve readiness

def merge_speaker(df):
    merged_data = []
    prev_speaker = None
    temp_text = ''
    temp_start_time = ''
    temp_end_time = ''
    
    for idx, row in df.iterrows():
        current_speaker = row['Name']
        current_start_time = row['Start Time']
        current_end_time = row['End Time']
        
        # If it's the same speaker as the previous row, concatenate the text
        if current_speaker == prev_speaker:
            temp_text += ' ' + row['Text']
            temp_end_time = current_end_time
        else:
            # If not the same speaker, append the previous speaker's full text to the result
            if prev_speaker is not None:
                merged_data.append([temp_start_time, temp_end_time, prev_speaker, temp_text])
            # Start a new conversation for the current speaker
            prev_speaker = current_speaker
            temp_text = row['Text']
            temp_start_time = row['Start Time']
            temp_end_time = row['End Time']
    
    # Append the last conversation
    merged_data.append([temp_start_time, temp_end_time, prev_speaker, temp_text,])
    
    # Create a new DataFrame with the merged conversations
    merged_df = pd.DataFrame(merged_data, columns=['Start Time', 'End Time', 'Name', 'Text'])
    return merged_df

In [25]:
df = merge_speaker(df)

In [35]:
df[df['Start Time'] < '00:15:00.000']

Unnamed: 0,Start Time,End Time,Name,Text
0,00:02:59.572,00:04:58.212,Hoai Thuc Nhi Le,I was thinking about today. How do I play movi...
1,00:04:59.832,00:05:00.192,Morgan Shapiro,Hello.
2,00:05:01.342,00:05:02.822,Hoai Thuc Nhi Le,"Hi, can you hear me?"
3,00:05:04.202,00:05:04.642,Morgan Shapiro,Yes.
4,00:05:05.592,00:05:11.410,Hoai Thuc Nhi Le,"OK, my name is Helen and I will be the facilit..."
5,00:05:07.142,00:05:07.622,Morgan Shapiro,You just.
6,00:05:11.410,00:05:15.352,Hoai Thuc Nhi Le,we're gonna wait. Two more participants coming.
7,00:05:17.022,00:05:19.062,Morgan Shapiro,I'm sorry. Can you repeat that?
8,00:05:20.782,00:05:22.782,Hoai Thuc Nhi Le,"I mean, my name's Helen."
9,00:05:22.212,00:05:24.932,Morgan Shapiro,"You know, OK, yes."


In [41]:
df['Text'][33]

'And then on Tuesday I work PDF for some reason.'