In [1]:
import re # for punctuation strip
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from textblob import TextBlob

In [2]:
# Correct the typos on the text summaries
df = pd.read_csv('/Users/irmakergin/Desktop/data_all/organized_data_experiment/organized_data.csv')

# textblob is not accurate with correcting abbrevations
# and  corrects some words inaccurately so I am exclyding them for now.
# I am dealing with the abbrevations later in the code before I calculate the semantic similarity

exclusion_list = ['ive', 'im', "i've", "i'm",
    "youre", "you're","theyre", "they're", "shes", "she's","hes", "he's",
    "we're", "don't", "dont", "doesn't", "doesnt", "didn't", "didnt", 
                  "sheve", "she've", "he've", "heve", "they've", "theyve", 
                  "weve", "we've", "shouldn't", "shouldnt", "dina","franny",
                  "frannie","isnt", "isn't","aren't", "arent", 'dvd', "wasnt", "wasn't",
                 "werent", "weren't", "limo", "huh", "har", "audition", "phoney","phony", "grilled", "robot",
                 "evertte", "mom", "caller", "memorize", "sheila", 'filming', "tv", "penelope", "bagel",
                 "nanny", "candace","giggle","chunky", "smouldering", "sexy", "levine", "bunny","barney",
                 "pants", "laundry", "kidding"]

In [3]:
# correct the abbrevations and turn them into seperate words 
spelling_changes = {
    "frannie": ["franny"],
    "fanny": ["franny"],
    "ive": ["I", "have"],
    "i've": ["I", "have"],
    "im": ["I", "am"],
    "i'm": ["I", "am"],
    "youre": ["you", "are"],
    "you're": ["you", "are"],
    "theyre": ["they", "are"],
    "they're": ["they", "are"],
    "shes": ["she", "is"],
    "she's": ["she", "is"],
    "hes": ["he", "is"],
    "he's": ["he", "is"],
    "it's": ["it", "is"],
    "we're": ["we", "are"],
    "don't": ["do", "not"],
    "dont": ["do", "not"],
    "doesn't": ["does", "not"],
    "doesnt": ["does", "not"],
    "didn't": ["did", "not"],
    "didnt": ["did", "not"],
    "sheve": ["she", "have"],
    "she've": ["she", "have"],
    "he've": ["he", "have"],
    "heve": ["he", "have"],
    "they've": ["they", "have"],
    "theyve": ["they", "have"],
    "weve": ["we", "have"],
    "we've": ["we", "have"],
    "shouldn't": ["should", "not"],
    "shouldnt": ["should", "not"],
    "isnt": ["is", "not"],
    "isn't": ["is", "not"],
    "aren't": ["are", "not"],
    "arent": ["are", "not"],
    "wasnt": ["was", "not"], 
    "wasn't": ["was", "not"],
    "weren't": ["were", "not"],
    "werent": ["were", "not"]
}

In [4]:
# Function to correct words, excluding words start with capital letter and ones defined above
# First correct for "spelling changes", then with textblob
def preprocess_and_correct_text(text):
    if pd.isna(text):
        return text
    
    # Apply spelling changes
    words = text.split()
    preprocessed_words = []
    for word in words:
        word_lower = word.lower()
        if word_lower in spelling_changes:
            # Check if the replacement is a list and extend, otherwise append
            if isinstance(spelling_changes[word_lower], list):
                preprocessed_words.extend(spelling_changes[word_lower])
            else:
                preprocessed_words.append(spelling_changes[word_lower])
        else:
            preprocessed_words.append(word)
    preprocessed_text = ' '.join(preprocessed_words)
    
    # Apply TextBlob corrections, skipping the exclusion list
    corrected_words = []
    for word in preprocessed_text.split():
        if word.lower() not in exclusion_list and not word[0].isupper():
            corrected_word = str(TextBlob(word).correct())
        else:
            corrected_word = word
        corrected_words.append(corrected_word)
    
    return ' '.join(corrected_words)

In [5]:
df['textbox.text_corrected'] = df['textbox.text'].apply(preprocess_and_correct_text)

# Change order of columns

columns = list(df.columns)

# Find the position of 'textbox.text'
textbox_text_index = columns.index('textbox.text')

# Place 'textbox.text_corrected' right after 'textbox.text'
columns.remove('textbox.text_corrected')
columns.insert(textbox_text_index + 1, 'textbox.text_corrected')
df = df[columns]


In [6]:
print(df.columns)
df.head

Index(['participant', 'index', 'wav_file', 'old_name', 'duration',
       'segment_text', 'question', 'C1', 'C2', 'C3', 'C4', 'C_correct',
       'A.numClicks', 'B.numClicks', 'C.numClicks', 'D.numClicks',
       'multiple_choice_accuracy', 'likert_response', 'summary',
       'textbox.text', 'textbox.text_corrected', 'speech_rate',
       'digit_span_score', 'digit_in_noise_score', 'slider_values',
       'slider_time', 'slider_values_rescaled', 'slider_values_number',
       'trial_movement_score_magnitude', 'movement_score_all',
       'likert_rescaled'],
      dtype='object')


<bound method NDFrame.head of      participant  index                      wav_file  \
0            p_1      1   Someday_23_condition-2x.wav   
1            p_1      2   Someday_65_condition-2x.wav   
2            p_1      3  Someday_123_condition-5x.wav   
3            p_1      4   Someday_82_condition-5x.wav   
4            p_1      5   Someday_43_condition-4x.wav   
...          ...    ...                           ...   
2745         p_9    121   Someday_20_condition-4x.wav   
2746         p_9    122    Someday_1_condition-2x.wav   
2747         p_9    123       Someday_2_condition.wav   
2748         p_9    124   Someday_41_condition-4x.wav   
2749         p_9    125   Someday_64_condition-2x.wav   

                                               old_name   duration  \
0     Someday__Someday__Maybe__Unabridged___File_014...  14.649969   
1     Someday__Someday__Maybe__Unabridged___File_032...  14.044969   
2     Someday__Someday__Maybe__Unabridged___File_054...   6.032000   
3    

In [7]:
# add empty column to enter manual corrections
# Find the index of 'textbox.text_corrected' column
corrected_text_index = df.columns.get_loc('textbox.text_corrected') + 1

# Insert the new empty column 'textbox.text_corrected_manual' right after 'textbox.text_corrected'
df.insert(corrected_text_index, 'textbox.text_corrected_manual', '')


In [8]:
print(df.columns)

Index(['participant', 'index', 'wav_file', 'old_name', 'duration',
       'segment_text', 'question', 'C1', 'C2', 'C3', 'C4', 'C_correct',
       'A.numClicks', 'B.numClicks', 'C.numClicks', 'D.numClicks',
       'multiple_choice_accuracy', 'likert_response', 'summary',
       'textbox.text', 'textbox.text_corrected',
       'textbox.text_corrected_manual', 'speech_rate', 'digit_span_score',
       'digit_in_noise_score', 'slider_values', 'slider_time',
       'slider_values_rescaled', 'slider_values_number',
       'trial_movement_score_magnitude', 'movement_score_all',
       'likert_rescaled'],
      dtype='object')


In [9]:
# Save
df.to_csv('/Users/irmakergin/Desktop/data_all/organized_data_experiment/organized_data_no_typo.csv', index=False)