In [1]:
import pandas as pd
import pathlib
import os
import re
from preprocess_data.preprocess import postprocess

## Section the preprocessed data into scenarios (merge sentences) and combine data

In [2]:
aphasia_type = ['wernicke', 'conduction', 'anomic', 'not aphasic', 'control']
fnames = ['../ab_data/processed_data/processeddata_'+x+'.csv' for x in aphasia_type]

dfs = [pd.read_csv(x,encoding='utf8').dropna() for x in fnames]

In [3]:
# check if all files exist 
# wernicke=63, conduction=150, anomic=277, NA=89, control=346

for type, df in zip(aphasia_type, dfs):
    print(type + ": " + str(len(df['source_file'].unique())))

wernicke: 63
conduction: 150
anomic: 277
not aphasic: 89
control: 346


In [4]:
def merge_scenario(data, label):
    columns = ['source_file', 'scenario', 'preprocessed_text', 'label']
    new_data = [] 
    text = ""
    # n = -1
    
    for index, row in data.iterrows():
        if index == 0:
            prev_scenario = row["scenario"] 
            prev_sourcefile = row["source_file"]
        current_scenario = row["scenario"]
        current_sourcefile = row["source_file"]
        
        if (current_scenario == prev_scenario and 
                current_sourcefile == prev_sourcefile):
            text += " " + row["preprocessed_text"]
            # n += 1
            
        else:
            # n = 0
            text = str(re.sub(' +', ' ', text)).lstrip().rstrip().replace('?','.').replace('!', '.')
            # if len(re.findall('\.',text)) == 3:
            new_data.append([row["source_file"], prev_scenario, text, label])
            text = row["preprocessed_text"]
        
        prev_scenario = current_scenario
        prev_sourcefile = current_sourcefile
            
    return pd.DataFrame(new_data, columns=columns)

In [5]:
merge_dfs = []
for type, df in zip(aphasia_type, dfs):
    x = merge_scenario(df,type.upper())
    merge_dfs.append(x)
    # print(x)
    x.to_csv('../ab_data/processed_data/processeddata_'+type+'_para.csv', index=False)

In [6]:
for type in aphasia_type:
    fp = '../ab_data/processed_data/processeddata_'+type+'_para.csv'
    df = pd.read_csv(fp)
    new_processed = []
    
    for index, row in df.iterrows():
        
        strs = row['preprocessed_text'].split('.')
        new_strs = ''
        for x in strs:
            # more than two consecutive fp/up
            y = re.sub('(ufp[\W\s]+){3,}', 'UP3 ', x)
            y = re.sub('(fp[\W\s]+){3,}', 'FP3 ', x)
            # two consecutive fp/up
            y = re.sub('(ufp[\W\s]+){2}', 'UP2 ', y)   
            y = re.sub('(fp[\W\s]+){2}', 'FP2 ', y)
            # one fp/up
            y = re.sub('ufp', 'UP1 ', y)
            y = re.sub('fp', 'FP1 ', y)
            if y:
                new_strs += y + '. '
        new_strs = postprocess(new_strs)          
        new_processed.append(str(new_strs))
    
    df['new_preprocessed_text'] = new_processed
    df.to_csv(fp, index=False)

## Make datasets for classification/interpreting
includes downsampling

In [7]:
conduction_df = pd.read_csv('../ab_data/processed_data/processeddata_conduction_para.csv')[['label', 'new_preprocessed_text', 'scenario']]
anomic_df = pd.read_csv('../ab_data/processed_data/processeddata_anomic_para.csv')[['label', 'new_preprocessed_text', 'scenario']]
wernicke_df = pd.read_csv('../ab_data/processed_data/processeddata_wernicke_para.csv')[['label', 'new_preprocessed_text', 'scenario']]
not_aphasic_df = pd.read_csv('../ab_data/processed_data/processeddata_not aphasic_para.csv')[['label', 'new_preprocessed_text', 'scenario']]
control_df = pd.read_csv('../ab_data/processed_data/processeddata_control_para.csv')[['label', 'new_preprocessed_text', 'scenario']]

fp = "../ab_data/experiment_data/"
print(len(conduction_df))
print(len(anomic_df))
print(len(wernicke_df))
print(len(not_aphasic_df))
print(len(control_df))

# random scenarios are same across the dfs (random_state=72)

# conduction vs anomic
pd.concat([conduction_df, anomic_df]).sample(frac=1, random_state=42).to_csv(fp+"conduction_anomic.csv")

# conduction vs anomic vs control
pd.concat([conduction_df, anomic_df, control_df]).sample(frac=1, random_state=42).to_csv(fp+"conduction_anomic_control.csv")

# control vs anomic
pd.concat([control_df, anomic_df]).sample(frac=1, random_state=42).to_csv(fp+"control_anomic.csv")

# wernicke vs anomic (downsample anomic to 522 random scenarios)
pd.concat([wernicke_df, anomic_df.sample(n=522, random_state=72)]).sample(frac=1, random_state=42).to_csv(fp+"wernicke_anomic.csv")

# wernicke vs anomic vs control (downsample control and anomic to 522 random scenarios)
pd.concat([wernicke_df, anomic_df.sample(n=522, random_state=72), control_df.sample(n=522, random_state=72)]).sample(frac=1, random_state=42).to_csv(fp+"wernicke_anomic_control.csv")

# control vs conduction 
pd.concat([control_df, conduction_df]).sample(frac=1, random_state=42).to_csv(fp+"control_conduction.csv")

# control vs wernicke (downsample control to 522 random scenarios)
pd.concat([control_df.sample(n=522, random_state=72), wernicke_df]).sample(frac=1, random_state=42).to_csv(fp+"control_wernicke.csv")

# anomic vs conduction vs wernicke (downsample anomic and conduction to 522 random scenarios
pd.concat([anomic_df.sample(n=522, random_state=72), conduction_df.sample(n=522, random_state=72), wernicke_df]).sample(frac=1, random_state=42).to_csv(fp+"anomic_conduction_wernicke.csv")

936
1848
397
723
2167
