## Preprocessing

In [16]:
# Load the full classified dataset from the working directory.
combined_df = pd.read_csv(filepath_or_buffer="Classified_Data.csv")

In [15]:
# Sessions that contain at least one row labelled dicta_answer == 1
has_dicta_hit = combined_df['dicta_answer'].eq(1)
sessions_with_dicta = combined_df.loc[has_dicta_hit, 'session_id'].unique()

# Keep every row belonging to those sessions and renumber the rows from 0
keep_row = combined_df['session_id'].isin(sessions_with_dicta)
filtered_df = combined_df.loc[keep_row].reset_index(drop=True)

# Sanity check: row and session counts before and after filtering
print("Original number of rows:", combined_df.shape[0])
print("Original number of unique sessions:", combined_df['session_id'].nunique())
print("\nAfter filtering:")
print("Number of rows:", filtered_df.shape[0])
print("Number of unique sessions:", filtered_df['session_id'].nunique())

Original number of rows: 11218049
Original number of unique sessions: 54957

After filtering:
Number of rows: 974867
Number of unique sessions: 1536


In [25]:

# Rows of the filtered data that are themselves labelled dicta_answer == 1
dicta_1_df = filtered_df.loc[filtered_df['dicta_answer'].eq(1)]

# Report the positive-row count next to the size of the whole filtered frame
print("Rows where dicta_answer = 1:")
print("\nTotal rows:", dicta_1_df.shape[0])
print("\nData Length:")
print(filtered_df.shape[0])


Rows where dicta_answer = 1:

Total rows: 10974

Data Length:
974867


In [27]:
# Persist the session-filtered rows. With pandas' default index=True the row
# index is written as an unnamed first column of the CSV.
filtered_df.to_csv(path_or_buf="pred_data.csv")

In [31]:
# Preview the filtered frame; the notebook renders a truncated head/tail view.
filtered_df

Unnamed: 0,committee_name,session_id,chairperson,speaker_name,conversation,contain_offensive_words,dicta_answer
0,אל על,64670,אברהם הירשזון,יצחק הרצוג,. ולא על הצעת חבר-הכנסת בייגה שוחט?.,0,0.0
1,אל על,64670,אברהם הירשזון,יצחק הרצוג,". אתה אמרת, ההסכם בין משרד האוצר לבנק הפועלים..",0,0.0
2,אל על,64670,אברהם הירשזון,אברהם הירשזון,. לא..,0,0.0
3,אל על,64670,אברהם הירשזון,אברהם הירשזון,. אני קודם כל מצביע על עמדת הממשלה – בעד ונגד..,0,0.0
4,אל על,64670,אברהם הירשזון,חיים אורון,. אבל איך היא תתקיים?.,0,0.0
...,...,...,...,...,...,...,...
974862,ועדת החוקה חוק ומשפט,573449,ניסן סלומינסקי,פיליפ מרכוס,. תמיד - - - לפנות לבית המשפט לקבל סעד - - -.,0,0.0
974863,ועדת החוקה חוק ומשפט,573449,ניסן סלומינסקי,מוריה בקשי כהן,". אגב, זה בכל מקרה לא כאן, אלא בסעיף שידבר על ...",0,0.0
974864,ועדת החוקה חוק ומשפט,573449,ניסן סלומינסקי,מוריה בקשי כהן,. אבל אנחנו רוצים לחשוב על הנושא של הקטינים. ל...,0,0.0
974865,ועדת החוקה חוק ומשפט,573449,ניסן סלומינסקי,איילת ששון,. כתוב: וכן רשאי בית המשפט - - -.,0,0.0


In [32]:
import pandas as pd
import numpy as np
from tqdm import tqdm


def _stack_windows(session_data, end_positions, window_size):
    """Return the windows ending at each of ``end_positions``, stacked in order.

    Every window consists of the ``window_size + 1`` consecutive rows of
    ``session_data`` (positional indexing) that end at the given position, and
    receives a ``position`` column numbered ``0 .. window_size``.
    """
    offsets = np.arange(window_size + 1, dtype=np.int64)
    ends = np.asarray(end_positions, dtype=np.int64)
    # One row of positions per window, flattened window by window.
    row_positions = (ends[:, None] - window_size + offsets).ravel()
    windows = session_data.iloc[row_positions].copy()
    windows['position'] = np.tile(offsets, len(ends))
    return windows


def prepare_sequence_data(df, window_size=3, target_ratio=2.0, random_state=None):
    """
    Build fixed-length context windows around labelled rows, down-sampling negatives.

    A window is ``window_size + 1`` consecutive rows of one session; its label
    is the ``dicta_answer`` of its last row (``position == window_size``).
    All positive windows (last row == 1) are kept. Negative windows (last row
    == 0 and no missing ``dicta_answer`` anywhere in the window) are sampled
    without replacement, session by session.

    Each session's negative quota is ``int(target_negatives * session_rows /
    total_valid_rows)`` capped by the number of candidates in that session, so
    the achieved ratio can fall slightly below ``target_ratio``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``session_id`` and ``dicta_answer`` columns. Row order
        inside each session is taken as the sequence order.
    window_size : int, default 3
        Number of context rows preceding the labelled row.
    target_ratio : float, default 2.0
        Desired number of negative windows per positive window.
    random_state : int or None, default None
        Seed for the negative sampling. ``None`` keeps the previous behaviour
        of drawing from NumPy's global random state (``np.random.seed``).

    Returns
    -------
    pandas.DataFrame
        Positive windows followed by negative windows, ``window_size + 1``
        rows each, with the original columns plus ``position``. Empty (but
        with the expected columns) when no window could be built.

    Raises
    ------
    ValueError
        If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    # np.random (module) and RandomState share the same ``choice`` API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Sessions too short to hold even one full window are dropped up front.
    valid_sessions = df.groupby('session_id').filter(lambda x: len(x) >= window_size + 1)
    session_groups = valid_sessions.groupby('session_id')
    # Progress totals must count the sessions actually iterated, otherwise the
    # bar never reaches 100%.
    n_sessions = session_groups.ngroups
    n_valid_rows = len(valid_sessions)

    # First pass: keep every positive window.
    positive_frames = []
    total_positives = 0
    for _, session_df in tqdm(session_groups, total=n_sessions, desc="Collecting positive samples"):
        session_data = session_df.reset_index(drop=True)
        is_positive = session_data['dicta_answer'].eq(1).fillna(False).to_numpy(dtype=bool)
        ends = np.flatnonzero(is_positive)
        ends = ends[ends >= window_size]  # need a full window of context before the hit
        if len(ends):
            positive_frames.append(_stack_windows(session_data, ends, window_size))
            total_positives += len(ends)

    target_negatives = int(total_positives * target_ratio)

    # Second pass: sample negative windows, proportionally to session size.
    negative_frames = []
    for _, session_df in tqdm(session_groups, total=n_sessions, desc="Collecting negative samples"):
        session_data = session_df.reset_index(drop=True)
        labels = session_data['dicta_answer']
        is_negative = labels.eq(0).fillna(False).to_numpy(dtype=bool)

        # missing_before[k] = number of missing labels among the first k rows,
        # so a difference gives the missing count inside any window in O(1).
        missing_before = np.concatenate(([0], np.cumsum(labels.isna().to_numpy())))
        ends = np.arange(window_size, len(session_data))
        window_missing = missing_before[ends + 1] - missing_before[ends - window_size]
        candidates = ends[is_negative[ends] & (window_missing == 0)]
        if len(candidates) == 0:
            continue

        session_proportion = len(session_df) / n_valid_rows
        session_target = int(target_negatives * session_proportion)
        n_to_take = min(len(candidates), session_target)
        if n_to_take > 0:
            chosen = rng.choice(candidates, size=n_to_take, replace=False)
            negative_frames.append(_stack_windows(session_data, chosen, window_size))

    # Combine all sequences (positives first, then negatives).
    frames = positive_frames + negative_frames
    if frames:
        result = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat([]) raises; return an empty frame with the expected schema.
        columns = list(df.columns)
        if 'position' not in columns:
            columns.append('position')
        result = pd.DataFrame(columns=columns)

    total_sequences = len(result) // (window_size + 1)
    last_rows = result[result['position'] == window_size]
    final_positives = int((last_rows['dicta_answer'] == 1).sum())
    final_negatives = int((last_rows['dicta_answer'] == 0).sum())

    print(f"\nTotal sequences: {total_sequences}")
    print(f"Positive sequences: {final_positives}")
    print(f"Negative sequences: {final_negatives}")
    if final_positives:
        print(f"Ratio (positive:negative): 1:{final_negatives/final_positives:.2f}")
    else:
        # Avoid ZeroDivisionError when the input holds no positive window.
        print("Ratio (positive:negative): n/a (no positive sequences)")

    return result

# Usage: seed NumPy's global RNG first. The negative windows are sampled from
# NumPy's global random state, so without a seed every run of this cell would
# export a different set of negatives.
np.random.seed(42)
df_window3 = prepare_sequence_data(filtered_df, window_size=3)

Collecting positive samples: 100%|█████████▉| 1532/1536 [00:02<00:00, 572.52it/s]
Collecting negative samples: 100%|█████████▉| 1532/1536 [02:06<00:00, 12.09it/s]



Total sequences: 31960
Positive sequences: 10907
Negative sequences: 21053
Ratio (positive:negative): 1:1.93


In [33]:
# Persist the windowed dataset. With pandas' default index=True the row index
# is written as an unnamed first column of the CSV.
df_window3.to_csv(path_or_buf="pred_data_preprocessed.csv")

In [34]:
# Total number of rows across all windows
df_window3.shape[0]

127840