In [5]:
import pandas as pd
import numpy as np
#from scipy import stats

## create LIWC summary data

In [104]:
# Build one prediction-phase LIWC summary CSV per film: average the LIWC columns
# within each (video_segment, counterbalance) cell and attach them to the
# consensus table for that film.
film_ls=['theshoe','therock','theboyfriend','keithreynolds','cmiyc_long','busstop']
merge_keys=['video_segment','counterbalance']
for film in film_ls:
    folder=f'../../Analysis Data/Prediction'
    df=pd.read_csv(f'{folder}/{film}_consensus_mapped_to_neuro.csv')

    # Keep only test-phase rows whose prediction_number is 1.
    df_LIWC = pd.read_csv(f'../../Analysis Data/LIWC/{film}_prediction_cleaned_LIWC22.csv')
    keep_rows = (df_LIWC['phase_type']=='test') & (df_LIWC['prediction_number']==1)
    df_LIWC = df_LIWC[keep_rows]

    # Every column from position 16 onward is averaged.
    # NOTE(review): presumably these are the LIWC-22 output columns; the cut-off
    # is positional, so confirm it against the export layout.
    all_categories=df_LIWC.columns[16:]
    LIWC_data=df_LIWC.groupby(merge_keys)[all_categories].mean().reset_index()
    # Cast the grouping keys to int before they are used as merge keys.
    LIWC_data[merge_keys] = LIWC_data[merge_keys].astype(int)

    # Outer merge: rows present on only one side are kept.
    result=df.merge(LIWC_data, on=merge_keys, how='outer').rename(columns={'WC':'prediction_wc'})

    result.to_csv(f'../../Analysis Data/summary_files/LIWC/'+film+'_prediction_LIWC_summary.csv',index=False)

In [105]:
# Build one description-phase LIWC summary CSV per film: average the LIWC columns
# within each (description_stop, counterbalance) cell and attach them to the
# consensus table for that film.
film_ls=['theshoe','therock','theboyfriend','keithreynolds','cmiyc_long','busstop']
merge_keys=['description_stop','counterbalance']
for film in film_ls:
    folder=f'../../Analysis Data/Description'
    df=pd.read_csv(f'{folder}/{film}_consensus_mapped_to_neuro.csv')

    # Keep only test-phase rows.
    df_LIWC = pd.read_csv(f'../../Analysis Data/LIWC/{film}_description_cleaned_LIWC22.csv')
    is_test = df_LIWC['phase_type']=='test'
    df_LIWC = df_LIWC[is_test]

    # Every column from position 18 onward is averaged.
    # NOTE(review): presumably these are the LIWC-22 output columns; the cut-off
    # is positional, so confirm it against the export layout.
    all_categories=df_LIWC.columns[18:]
    LIWC_data=df_LIWC.groupby(merge_keys)[all_categories].mean().reset_index()
    # Cast the grouping keys to int before they are used as merge keys.
    LIWC_data[merge_keys] = LIWC_data[merge_keys].astype(int)

    # Outer merge: rows present on only one side are kept.
    result=df.merge(LIWC_data, on=merge_keys, how='outer').rename(columns={'WC':'description_wc'})

    result.to_csv(f'../../Analysis Data/summary_files/LIWC/'+film+'_description_LIWC_summary.csv',index=False)

## create valence and arousal summary

In [17]:
import numpy as np
import pandas as pd

# Per-film origin of the bin grid, keyed by the short film id (despite the
# `_ls` suffix this is a dict, not a list).
# process_data_for_bins looks the value up by film and passes it to
# split_into_bins, which subtracts it from each segment's 'start'/'end' before
# flooring to a bin; it is added back onto 'bin' once the bins are aggregated.
# NOTE(review): presumably the time, in seconds, at which each film begins in
# the source recording — confirm against the annotation file.
begintime_ls = {
    'cmiyc_long': 40,
    'theshoe': 999,
    'keithreynolds': 1134,
    'busstop': 948,
    'theboyfriend': 528,
    'therock': 40
}

def split_into_bins(row, begin_time, bin_size=10):
    """Split one annotated segment into the fixed-width bins it overlaps.

    The bin grid starts at ``begin_time`` and each bin is ``bin_size`` wide.
    A segment is cut at every bin boundary it crosses, so a segment longer
    than one bin contributes the correct overlap to every bin it touches
    (not only to its first and last bin).

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'start', 'end', 'Arousal_avg' and 'valence_merged'.
    begin_time : number
        Origin of the bin grid, in the same unit as 'start' and 'end'.
    bin_size : number, default 10
        Width of one bin, in the same unit as 'start' and 'end'.

    Returns
    -------
    list of dict
        One record per touched bin with the keys 'bin' (bin start relative to
        ``begin_time``), 'duration' (overlap of the segment with that bin),
        'arousal_duration' and 'arousal_weighted' (both 0 when 'Arousal_avg'
        is NaN, otherwise the overlap and arousal * overlap), and
        'valence_merged' (copied from the row).

    Raises
    ------
    ValueError
        If ``bin_size`` is not positive.
    """
    if bin_size <= 0:
        raise ValueError("bin_size must be positive")

    start_time = row['start']
    end_time = row['end']
    arousal = row['Arousal_avg']
    arousal_missing = np.isnan(arousal)

    def make_piece(bin_start, duration):
        # Segments without an arousal rating must not count towards the
        # duration-weighted arousal average, hence the zeros.
        return {
            'bin': bin_start,
            'duration': duration,
            'arousal_duration': 0 if arousal_missing else duration,
            'arousal_weighted': 0 if arousal_missing else arousal * duration,
            'valence_merged': row['valence_merged']
        }

    # A segment without usable timing cannot be placed on the grid; emit a
    # single NaN-bin record so that a group-by on 'bin' discards it.
    if pd.isna(start_time) or pd.isna(end_time):
        return [make_piece(np.nan, 0)]

    # Floor both ends onto the bin grid (relative to begin_time).
    start_bin = ((start_time - begin_time) // bin_size) * bin_size
    end_bin = ((end_time - begin_time) // bin_size) * bin_size

    if end_bin <= start_bin:
        # The whole segment sits inside one bin.
        return [make_piece(start_bin, end_time - start_time)]

    # Walk every bin from the first to the last one touched and record the
    # overlap of the segment with each.
    pieces = []
    current_bin = start_bin
    while current_bin <= end_bin:
        bin_open = begin_time + current_bin
        bin_close = bin_open + bin_size
        overlap = min(end_time, bin_close) - max(start_time, bin_open)
        pieces.append(make_piece(current_bin, overlap))
        current_bin += bin_size

    return pieces

def determine_valence(row):
    """Label a bin by its majority valence.

    Returns 'pos' when row['pos_percent'] is above 50, otherwise 'neg' when
    row['neg_percent'] is above 50, otherwise 'neutral'. NaN percentages fail
    both comparisons and therefore yield 'neutral'.
    """
    if row['pos_percent'] > 50:
        return 'pos'
    # Positive did not win a majority; check negative next.
    return 'neg' if row['neg_percent'] > 50 else 'neutral'

def process_data_for_bins(current, film_name):
    """Aggregate annotated segments of one film into per-bin arousal and valence.

    Parameters
    ----------
    current : pandas.DataFrame
        Segment rows; each row is passed to ``split_into_bins``.
    film_name : str
        Key into ``begintime_ls`` (note: inside this function the parameter
        shadows any module-level variable of the same name).

    Returns
    -------
    pandas.DataFrame
        One row per bin with 'bin' (shifted back by the film's begin time),
        the summed 'arousal_duration' / 'arousal_weighted', their ratio
        'arousal_avg', the summed duration per valence label, 'total', the
        three '*_percent' columns and the majority label 'valence_binned'.
    """
    offset = begintime_ls[film_name]
    labels = ['pos', 'neg', 'neutral']

    # Each row yields a list of per-bin records; explode the lists into rows
    # and expand every record dict into columns.
    per_row = current.apply(lambda seg: split_into_bins(seg, offset), axis=1)
    pieces = per_row.explode().apply(pd.Series)

    # Duration-weighted mean arousal per bin.
    arousal = pieces.groupby('bin').agg({
        'arousal_duration': 'sum',
        'arousal_weighted': 'sum'
    })
    arousal['arousal_avg'] = arousal['arousal_weighted'] / arousal['arousal_duration']

    # Total duration per (bin, valence label), one column per label.
    valence = pieces.groupby(['bin', 'valence_merged'])['duration'].sum().unstack(fill_value=0)

    # A label that never occurs for this film still needs a column.
    for label in labels:
        if label not in valence.columns:
            valence[label] = 0

    # Share of each label within the bin, in percent.
    valence['total'] = valence['pos'] + valence['neg'] + valence['neutral']
    for label in labels:
        valence[f'{label}_percent'] = valence[label] / valence['total'] * 100

    valence['valence_binned'] = valence.apply(determine_valence, axis=1)

    # Align both tables on the bin index, then turn the index into a column.
    combined = pd.concat([arousal, valence], axis=1).reset_index()

    # Bins were computed relative to the film's begin time; shift them back.
    combined['bin'] = combined['bin'] + offset

    return combined

In [18]:
# Maps the short film id used in file names to the value of the 'SEG-12 title'
# column that the valence/arousal cell filters the annotation table on.
film_name={'theshoe':'5. The Shoe','therock':'8. The Rock','theboyfriend':'4. The Boyfriend','keithreynolds':'6. Keith Reynolds','cmiyc_long':'2. Catch Me If You Can','busstop':'12. Bus Stop'}

In [83]:
# Build one valence/arousal summary CSV per film: bin the annotated segments,
# then attach each bin's arousal average and majority valence to the
# consensus rows whose 'filmfest_offset' equals the end of that bin.
film_ls=['theshoe','therock','theboyfriend','keithreynolds','cmiyc_long','busstop']
data=pd.read_csv('../../Analysis Data/behavior_measurements/valence_arousal_summary.csv')

def convert_to_seconds(value):
    """Convert an m.ss-encoded number (minutes before the point, seconds after it) to whole seconds."""
    if pd.isna(value):
        return np.nan
    whole_minutes, fraction = divmod(value, 1)
    # The two decimals are seconds, e.g. 1.30 means 1 min 30 s.
    return int(whole_minutes * 60 + round(fraction * 100))

# Numeric coding of the majority valence label; anything else maps to 0.
valence_codes={'pos': 1, 'neg': -1}

for film in film_ls:
    folder=f'../../Analysis Data/Prediction'
    df=pd.read_csv(f'{folder}/{film}_consensus_mapped_to_neuro.csv')

    # Annotation rows of this film, with start/end expressed in seconds.
    current = data[data['SEG-12 title']==film_name[film]].copy()
    current['start']=current['SEG-1000 Start Time (m.ss)'].apply(convert_to_seconds)
    current['end']=current['SEG-1000 End Time (m.ss)'].apply(convert_to_seconds)

    combined_data = process_data_for_bins(current,film)
    # 'bin' is the start of a bin; the merge below matches on its end (start + 10).
    combined_data['bin_end']=combined_data['bin']+10
    merge_data=combined_data[['bin_end','arousal_avg','valence_binned']]
    final=df.merge(merge_data,left_on='filmfest_offset',right_on='bin_end',how='left')
    # Consensus rows without a matching bin have a missing label and are
    # therefore also coded 0, the same as 'neutral'.
    final['valence'] = final['valence_binned'].apply(lambda label: valence_codes.get(label, 0))
    final.to_csv('../../Analysis Data/summary_files/valence_arousal/'+film+'_valence_arousal_summary.csv',index=False)