In [None]:
import pandas as pd
import os
import ffmpeg
import numpy as np
import re

### Соотнесение списка глосс с предложениями

In [None]:
# Load the annotation table; the following cells add columns to this frame.
df = pd.read_csv(filepath_or_buffer='./data/train.csv')

In [None]:
# Mirror the "(sec)" time columns under short names, cast to float.
for _dst_col, _src_col in (('start', 'start (sec)'), ('end', 'end (sec)')):
    df[_dst_col] = df[_src_col].astype(float)

In [None]:
def parse_gloss_cell(cell):
    """Parse a multi-line gloss cell into a list of gloss records.

    Each line is expected to look like ``start|end|text``. Lines that do not
    split into exactly three ``|``-separated fields, or whose first two fields
    are not valid floats, are skipped.

    Args:
        cell: Cell value holding the gloss lines; a missing value
            (as detected by ``pd.isna``) yields an empty list.

    Returns:
        A list of dicts with keys ``'start'`` (float), ``'end'`` (float) and
        ``'text'`` (str, stripped of surrounding whitespace), in line order.
    """
    if pd.isna(cell):
        return []

    parsed = []
    for raw_line in cell.strip().splitlines():
        fields = raw_line.strip().split('|')
        if len(fields) != 3:
            continue
        begin_raw, finish_raw, label_raw = fields
        try:
            begin, finish = float(begin_raw), float(finish_raw)
        except ValueError:
            # Non-numeric time stamp: ignore this line.
            continue
        parsed.append({'start': begin, 'end': finish, 'text': label_raw.strip()})
    return parsed

# Attach to every sentence row the gloss labels that match its time window.
# NOTE(review): result_df is only kept in memory here, while a later cell loads
# './data/final_train.csv' — confirm where that file is produced.
result_rows = []
for video, group in df.groupby('video_name'):
    group = group.sort_values('start').copy()

    # The gloss columns are read from the first row of the video only.
    doc_row = group.iloc[0]
    r_glosses = parse_gloss_cell(doc_row['R-gloss (на документ)'])
    l_glosses = parse_gloss_cell(doc_row['L-gloss (на документ)'])

    # Collapse entries with identical (start, end, text), then order by start time.
    unique_glosses = {
        (entry['start'], entry['end'], entry['text']): entry
        for entry in r_glosses + l_glosses
    }
    glosses_list = sorted(unique_glosses.values(), key=lambda entry: entry['start'])

    sentences = group.to_dict('records')
    for sent in sentences:
        sent['glosses'] = []

    last_idx = len(sentences) - 1
    for g in glosses_list:
        label = g['text']
        for i, sent in enumerate(sentences):
            s_start, s_end = sent['start'], sent['end']

            # Gloss starts inside the sentence window: always recorded.
            if s_start <= g['start'] <= s_end:
                sent['glosses'].append(label)

            # Gloss strictly spans the whole sentence: record it once on this
            # sentence and on its previous / next neighbours.
            if g['start'] < s_start and g['end'] > s_end:
                targets = [sent]
                if i > 0:
                    targets.append(sentences[i - 1])
                if i < last_idx:
                    targets.append(sentences[i + 1])
                for target in targets:
                    if label not in target['glosses']:
                        target['glosses'].append(label)

    # Flatten each label list into one space-separated string.
    for sent in sentences:
        sent['glosses'] = " ".join(sent['glosses'])

    result_rows += sentences

result_df = pd.DataFrame(result_rows)

### Обработка видео

In [None]:
# Download one source video and re-encode it to MP4 with libx264.
# Set `name` to the video identifier before running this cell.
name = ""

input_url = f"https://rsl.nstu.ru/video/{name}.webm"
output_file = f"{name}.mp4"

# With the empty placeholder the URL is ".../video/.webm", so the ffmpeg call
# could only fail; skip it so the notebook still runs top to bottom.
if name:
    ffmpeg.input(input_url).output(output_file, vcodec='libx264').run()
else:
    print("Video download skipped: set `name` to a video identifier first.")
# NOTE(review): the file is written to the working directory, while the
# segmentation cell reads its inputs from ./data/videos — confirm how the
# downloaded files are expected to get there.

In [None]:
# Segment extraction, step 1: output folder and per-video (start, end) spans.
output_folder = "./data/video_segments"
os.makedirs(output_folder, exist_ok=True)

# NOTE(review): no cell visible in this notebook writes './data/final_train.csv';
# confirm it exists before running.
result_df = pd.read_csv('./data/final_train.csv')

# Videos to be split into segments.
videos = [
    "RSLN-n4-s2","RSLM-n1-s44-d","RSLM-n3-s55-d","RSLN-n5-s2","RSLM-n2-s57-d",
    "RSLM-n1-s57-d","RSLN-n6-s2","RSLN-n8-s2","RSLM-n2-s55-d",
    "RSLN-n3-s2","RSLN-n1-s2","RSLM-m4-s59-d","RSLM-n1-s56-d","RSLM-m4-s57-d",
    "RSLN-n9-s2","RSLM-m4-s54-d","RSLM-n3-s56-d","RSLM-m4-s44-d","RSLM-m5-s58-d",
    "RSLM-n1-s62","RSLM-n2-s56-d","RSLM-m1-s54-d","RSLM-m6-s55-d",
    "RSLM-m8-s58-d","RSLM-m2-s44-d","RSLN-n2-s2","RSLM-m6-s56-d","RSLM-m1-s59-d",
    "RSLM-m3-s54-d","RSLM-n1-s55-d","RSLM-m5-s56-d","RSLM-m3-s57-d","RSLM-b26-s60",
    "RSLM-m8-s55-d","RSLM-m2-s54-d","RSLM-m7-s55-d","RSLN-n1-s3","RSLM-m1-s57-d",
    "RSLM-m6-s58-d","RSLM-m2-s57-d","RSLM-m6-s43-d","RSLM-m3-s59-d",
    "RSLM-n1-s60","RSLM-b26-s61","RSLM-m7-s58-d","RSLM-m7-s56-d",
    "RSLN-b1-s64","RSLM-m2-s59-d","RSLN-b1-s4","RSLM-n1-s61","RSLM-b25-s60",
    "RSLM-n3-s61","RSLM-b25-s62","RSLM-n2-s61","RSLN-n1-s1"]

# video name -> list of (start, end) pairs, in table row order
segment_dict = {}
for video_id in videos:
    video_rows = result_df[result_df['video_name'] == video_id]
    segment_dict[video_id] = list(
        zip(video_rows['start'].to_numpy(), video_rows['end'].to_numpy())
    )

# Step 2: cut every (start, end) span out of its source video without
# re-encoding (stream copy) and store it as "<video>-seg<i>.mp4".
for base_name, segments in segment_dict.items():
    video_file = base_name + ".mp4"
    input_path = os.path.join("./data/videos", video_file)
    for i, (start, end) in enumerate(segments, start=1):
        output_file = os.path.join(output_folder, f"{base_name}-seg{i}.mp4")
        print(f"Processing segment {i} for {video_file}: {output_file}")

        try:
            (
                ffmpeg
                .input(input_path, ss=start, to=end)
                .output(output_file, c='copy')
                # stderr must be captured, otherwise ffmpeg.Error.stderr is
                # None and the handler below cannot report it.
                .run(overwrite_output=True, capture_stderr=True)
            )
        except ffmpeg.Error as e:
            print(f"An error occurred while processing {video_file} segment {i}:")
            # Guard against a missing stderr so the original error is not
            # masked by an AttributeError on None.
            print(e.stderr.decode(errors="replace") if e.stderr else "(no stderr captured)")
            raise

### Speaker и разделение на train/test 

In [None]:
def extract_speaker(video_name):
    """Return the speaker tag (``s`` followed by digits) found in a video name.

    A tag enclosed by dashes on both sides (``-s44-``) is preferred. Otherwise
    the name is split on ``-`` and the first piece that is ``s`` plus digits is
    returned, which covers a tag at the very start or end of the name.

    Args:
        video_name: Video identifier string, e.g. ``"RSLM-n1-s44-d"``.

    Returns:
        The tag such as ``"s44"``, or ``None`` when no piece matches.
    """
    dashed = re.search(r'-(s\d+)-', video_name)
    if dashed is not None:
        return dashed.group(1)

    return next(
        (
            token
            for token in video_name.split('-')
            if token.startswith('s') and token[1:].isdigit()
        ),
        None,
    )

In [None]:
# Derive a speaker tag for every row from its video name, then collect the
# distinct tags in order of first appearance.
speaker_column = df['video_name'].map(extract_speaker)
df['speaker'] = speaker_column
unique_speakers = speaker_column.unique()

In [None]:
# Speaker-level split: 90% of the distinct speakers (rounded down) are drawn
# without replacement with a fixed seed; their rows are flagged as training data.
num_train = int(len(unique_speakers) * 0.9)
np.random.seed(42)
train_speakers = np.random.choice(unique_speakers, size=num_train, replace=False)

df['is_train'] = df['speaker'].isin(train_speakers)

In [None]:
# Persist the table with the added columns. index=False keeps the row index out
# of the file: this notebook reads the same path back in its first data cell,
# so writing the index would add another "Unnamed: 0" column on every run.
df.to_csv('./data/train.csv', index=False)