# Step 2.10: Checking ASR Outputs for the Feature

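This code checks whether the feature is present in each ASR output. As implemented, the check returns 1 when the feature is present (these cases are to be analyzed manually), 0 when it is absent, and NaN when the ASR output is missing (not a string).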

## Required Packages

The following packages are necessary to run this code: os, [pandas](https://pypi.org/project/pandas/), [numpy](https://pypi.org/project/numpy/)

## Initial Setup

In [None]:
# Import required packages
import pandas as pd
import numpy as np
import os

In [None]:
# Locations of the CSVs produced in Step 2.9, one per feature
# (replace each "path" placeholder with the real file path)
aint_file_path = "path"
be_file_path = "path"
done_file_path = "path"

# Read each feature's gold standard dataframe from its CSV
aint_gs_df, be_gs_df, done_gs_df = [
    pd.read_csv(csv_path)
    for csv_path in (aint_file_path, be_file_path, done_file_path)
]

## Defining the Feature-Checking Function

This function takes the following arguments:
1. The feature
2. The cleaned ASR output as a string

In [None]:
def checkForFeature(feature, cleaned_ASR_output):
    """
    Report whether a feature occurs as a whole token in an ASR output.

    Contractions may appear in the transcription split into pieces
    (e.g. "ai n't" for "ain't"). For the contraction features listed
    below, the split spelling is rejoined before the text is tokenized
    on whitespace, so both spellings are detected. No case folding is
    applied: the text is matched exactly as given.

    Args:
        feature: The feature to look for, e.g. "ain't", "I'm not", "be".
        cleaned_ASR_output: The cleaned ASR transcription as a string.

    Returns:
        1 if the feature is one of the whitespace-separated tokens of
        the transcription, 0 if it is not, and NaN if the transcription
        is not a string (for example a missing value).
    """

    import numpy as np

    # A non-string transcription (such as NaN) cannot be searched
    if not isinstance(cleaned_ASR_output, str):
        return np.nan

    # feature -> (token to search for, spellings to rewrite as that token).
    # "I'm not" spans several tokens, so it is collapsed to the single
    # placeholder token "i'mnot"; both the tokenized ("i 'm not") and the
    # untokenized ("i'm not") spellings are collapsed so neither is missed.
    rejoin_rules = {
        "ain't": ("ain't", ("ai n't",)),
        "isn't": ("isn't", ("is n't",)),
        "aren't": ("aren't", ("are n't",)),
        "I'm not": ("i'mnot", ("i 'm not", "i'm not")),
        "didn't": ("didn't", ("did n't",)),
        "haven't": ("haven't", ("have n't",)),
        "hasn't": ("hasn't", ("has n't",)),
    }

    # Features without a rule (e.g. "be", "done") are searched for as-is
    search_token, split_spellings = rejoin_rules.get(feature, (feature, ()))

    for split_spelling in split_spellings:
        cleaned_ASR_output = cleaned_ASR_output.replace(split_spelling, search_token)

    # Whole-token match: "be" must not match inside "been"
    return 1 if search_token in cleaned_ASR_output.split() else 0

## Executing the Code

In [None]:
# Cleaned-transcription columns to check; each one gets a matching
# "<column>_containsFeature" column inserted directly to its right
column_names = [
    "amazon_transcription_cleaned",
    "deepspeech_transcription_cleaned",
    "google_transcription_cleaned",
    "IBMWatson_transcription_cleaned",
    "microsoft_transcription_cleaned",
]

### Feature: Ain't

Before running the code for the *ain't* variations, the variations will be split into separate dataframes to be processed. These will be concatenated again in the end.

In [None]:
# One dataframe per *ain't* variation. Each boolean-mask selection is
# explicitly copied so the subset is an independent dataframe: the subsets
# are modified afterwards, and modifying an uncopied selection triggers
# pandas' SettingWithCopyWarning.
aint_df = aint_gs_df[aint_gs_df["AintVariation"]=="ain't"].copy()
isnt_df = aint_gs_df[aint_gs_df["AintVariation"]=="isn't"].copy()
arent_df = aint_gs_df[aint_gs_df["AintVariation"]=="aren't"].copy()
imnot_df = aint_gs_df[aint_gs_df["AintVariation"]=="I'm not"].copy()
didnt_df = aint_gs_df[aint_gs_df["AintVariation"]=="didn't"].copy()
havent_df = aint_gs_df[aint_gs_df["AintVariation"]=="haven't"].copy()
hasnt_df = aint_gs_df[aint_gs_df["AintVariation"]=="hasn't"].copy()

In [None]:
# Defines the feature
feature = "ain't"

# For every cleaned-transcription column, run checkForFeature on each row and
# insert the results as "<column>_containsFeature" directly to its right.
# Driving this from column_names keeps the checked columns and the result
# columns in sync.
for column_name in column_names:

    col_index = aint_df.columns.get_loc(column_name)

    # Cast to float so the column has the same dtype whether or not it
    # contains NaN (NaN marks rows with no string transcription)
    feature_flags = aint_df[column_name].map(
        lambda transcription: checkForFeature(feature, transcription)
    ).astype(float)

    aint_df.insert(col_index+1, f"{column_name}_containsFeature", feature_flags)

In [None]:
# Defines the feature
feature = "isn't"

# For every cleaned-transcription column, run checkForFeature on each row and
# insert the results as "<column>_containsFeature" directly to its right.
# Driving this from column_names keeps the checked columns and the result
# columns in sync.
for column_name in column_names:

    col_index = isnt_df.columns.get_loc(column_name)

    # Cast to float so the column has the same dtype whether or not it
    # contains NaN (NaN marks rows with no string transcription)
    feature_flags = isnt_df[column_name].map(
        lambda transcription: checkForFeature(feature, transcription)
    ).astype(float)

    isnt_df.insert(col_index+1, f"{column_name}_containsFeature", feature_flags)

In [None]:
# Defines the feature
feature = "aren't"

# For every cleaned-transcription column, run checkForFeature on each row and
# insert the results as "<column>_containsFeature" directly to its right.
# Driving this from column_names keeps the checked columns and the result
# columns in sync.
for column_name in column_names:

    col_index = arent_df.columns.get_loc(column_name)

    # Cast to float so the column has the same dtype whether or not it
    # contains NaN (NaN marks rows with no string transcription)
    feature_flags = arent_df[column_name].map(
        lambda transcription: checkForFeature(feature, transcription)
    ).astype(float)

    arent_df.insert(col_index+1, f"{column_name}_containsFeature", feature_flags)

In [None]:
# Defines the feature
feature = "I'm not"

# For every cleaned-transcription column, run checkForFeature on each row and
# insert the results as "<column>_containsFeature" directly to its right.
# Driving this from column_names keeps the checked columns and the result
# columns in sync.
for column_name in column_names:

    col_index = imnot_df.columns.get_loc(column_name)

    # Cast to float so the column has the same dtype whether or not it
    # contains NaN (NaN marks rows with no string transcription)
    feature_flags = imnot_df[column_name].map(
        lambda transcription: checkForFeature(feature, transcription)
    ).astype(float)

    imnot_df.insert(col_index+1, f"{column_name}_containsFeature", feature_flags)

In [None]:
# Defines the feature
feature = "didn't"

# For every cleaned-transcription column, run checkForFeature on each row and
# insert the results as "<column>_containsFeature" directly to its right.
# Driving this from column_names keeps the checked columns and the result
# columns in sync.
for column_name in column_names:

    col_index = didnt_df.columns.get_loc(column_name)

    # Cast to float so the column has the same dtype whether or not it
    # contains NaN (NaN marks rows with no string transcription)
    feature_flags = didnt_df[column_name].map(
        lambda transcription: checkForFeature(feature, transcription)
    ).astype(float)

    didnt_df.insert(col_index+1, f"{column_name}_containsFeature", feature_flags)

In [None]:
# Defines the feature
feature = "haven't"

# For every cleaned-transcription column, run checkForFeature on each row and
# insert the results as "<column>_containsFeature" directly to its right.
# Driving this from column_names keeps the checked columns and the result
# columns in sync.
for column_name in column_names:

    col_index = havent_df.columns.get_loc(column_name)

    # Cast to float so the column has the same dtype whether or not it
    # contains NaN (NaN marks rows with no string transcription)
    feature_flags = havent_df[column_name].map(
        lambda transcription: checkForFeature(feature, transcription)
    ).astype(float)

    havent_df.insert(col_index+1, f"{column_name}_containsFeature", feature_flags)

In [None]:
# Defines the feature
feature = "hasn't"

# For every cleaned-transcription column, run checkForFeature on each row and
# insert the results as "<column>_containsFeature" directly to its right.
# Driving this from column_names keeps the checked columns and the result
# columns in sync.
for column_name in column_names:

    col_index = hasnt_df.columns.get_loc(column_name)

    # Cast to float so the column has the same dtype whether or not it
    # contains NaN (NaN marks rows with no string transcription)
    feature_flags = hasnt_df[column_name].map(
        lambda transcription: checkForFeature(feature, transcription)
    ).astype(float)

    hasnt_df.insert(col_index+1, f"{column_name}_containsFeature", feature_flags)

In [None]:
# Stack the per-variation dataframes back into a single dataframe.
# NOTE(review): only rows whose "AintVariation" is one of the seven values
# split out above are carried over — confirm no other values occur.
variation_frames = [
    aint_df,
    isnt_df,
    arent_df,
    imnot_df,
    didnt_df,
    havent_df,
    hasnt_df,
]
aint_gs_df = pd.concat(variation_frames)

### Feature: Be

In [None]:
# Defines the feature
feature = "be"

# For every cleaned-transcription column, run checkForFeature on each row and
# insert the results as "<column>_containsFeature" directly to its right.
# Driving this from column_names keeps the checked columns and the result
# columns in sync.
for column_name in column_names:

    col_index = be_gs_df.columns.get_loc(column_name)

    # Cast to float so the column has the same dtype whether or not it
    # contains NaN (NaN marks rows with no string transcription)
    feature_flags = be_gs_df[column_name].map(
        lambda transcription: checkForFeature(feature, transcription)
    ).astype(float)

    be_gs_df.insert(col_index+1, f"{column_name}_containsFeature", feature_flags)

### Feature: Done

In [None]:
# Defines the feature
feature = "done"

# For every cleaned-transcription column, run checkForFeature on each row and
# insert the results as "<column>_containsFeature" directly to its right.
# Driving this from column_names keeps the checked columns and the result
# columns in sync.
for column_name in column_names:

    col_index = done_gs_df.columns.get_loc(column_name)

    # Cast to float so the column has the same dtype whether or not it
    # contains NaN (NaN marks rows with no string transcription)
    feature_flags = done_gs_df[column_name].map(
        lambda transcription: checkForFeature(feature, transcription)
    ).astype(float)

    done_gs_df.insert(col_index+1, f"{column_name}_containsFeature", feature_flags)

## Sorting the Dataframes by File and Line

This will sort the dataframes first by filename and then by line number. Doing this at each step keeps the row order consistent across all steps.

### Feature: Ain't

In [None]:
# Order rows by the "File" column, then by "Line" within each file
aint_gs_df = aint_gs_df.sort_values(['File', 'Line'])

### Feature: Be

In [None]:
# Order rows by the "File" column, then by "Line" within each file
be_gs_df = be_gs_df.sort_values(['File', 'Line'])

### Feature: Done

In [None]:
# Order rows by the "File" column, then by "Line" within each file
done_gs_df = done_gs_df.sort_values(['File', 'Line'])

## Exporting Dataframes to CSV Files

This will export the dataframes to CSV files.

In [None]:
# Designate the output directory where the CSVs will be stored.
# Replace the "path" placeholder with a real directory, ending with a
# path separator (e.g. "/"), before running the export cells.
csv_output_path = "path"

### Feature: Ain't

In [None]:
# Build the file path with os.path.join so the CSV is written inside
# csv_output_path whether or not that path ends with a separator
aint_gs_df.to_csv(os.path.join(csv_output_path, "aint_variations_checkForFeature.csv"), index=False)

### Feature: Be

In [None]:
# Build the file path with os.path.join so the CSV is written inside
# csv_output_path whether or not that path ends with a separator
be_gs_df.to_csv(os.path.join(csv_output_path, "be_checkForFeature.csv"), index=False)

### Feature: Done

In [None]:
# Build the file path with os.path.join so the CSV is written inside
# csv_output_path whether or not that path ends with a separator
done_gs_df.to_csv(os.path.join(csv_output_path, "done_checkForFeature.csv"), index=False)