Data Cleaning

In [None]:
# Mount Google Drive at /content/drive; every dataset path used below lives
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Import necessary libraries
import os
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [None]:

# Folder containing the label .txt files. The cleaning step further down in
# this cell rewrites those files in place.
label_path = '/content/drive/MyDrive/TextClassification/Dataset/EmoEvaluation'

# Strip the "C-E" / "A-E" lines out of a single file, in place
def remove_unwanted_lines(file_path):
    """Rewrite a text file without the lines that begin with "C-E" or "A-E".

    The file is read in full and then overwritten with the surviving lines.
    All other lines are kept unchanged and in their original order. The
    prefix test is applied to the raw line, so a line with leading
    whitespace before "C-E" / "A-E" is kept.

    Args:
        file_path: Path of the text file to rewrite.
    """
    unwanted_prefixes = ("C-E", "A-E")

    # Read phase: keep only the lines that do not carry an unwanted prefix
    with open(file_path, 'r') as source:
        kept = [row for row in source if not row.startswith(unwanted_prefixes)]

    # Write phase: overwrite the same file with the surviving lines
    with open(file_path, 'w') as target:
        target.writelines(kept)

# Gather the .txt label files and clean each of them in place
label_files = [name for name in os.listdir(label_path) if name.endswith('.txt')]

if not label_files:
    print("No .txt files found in the labels directory.")
else:
    for file in label_files:
        file_path = os.path.join(label_path, file)

        # Overwrites the file without its "C-E" / "A-E" lines
        remove_unwanted_lines(file_path)
        print(f"Processed {file}")


Processed Ses05F_script01_2.txt
Processed Ses05F_impro02.txt
Processed Ses05F_impro05.txt
Processed Ses05M_impro03.txt
Processed Ses05M_script02_1.txt
Processed Ses05F_script03_2.txt
Processed Ses05M_script01_3.txt
Processed Ses05M_impro01.txt
Processed Ses05F_impro07.txt
Processed Ses05F_impro06.txt
Processed Ses05M_script02_2.txt
Processed Ses05M_impro05.txt
Processed Ses05F_impro08.txt
Processed Ses05M_script01_1b.txt
Processed Ses05M_impro04.txt
Processed Ses05F_script02_1.txt
Processed Ses05F_script02_2.txt
Processed Ses05M_impro06.txt
Processed Ses05M_script03_1.txt
Processed Ses05M_impro07.txt
Processed Ses05F_script01_3.txt
Processed Ses05F_script03_1.txt
Processed Ses05M_script01_1.txt
Processed Ses05F_impro04.txt
Processed Ses05M_script03_2.txt
Processed Ses05F_impro03.txt
Processed Ses05M_impro08.txt
Processed Ses05M_script01_2.txt
Processed Ses05M_impro02.txt
Processed Ses05F_script01_1.txt
Processed Ses05F_impro01.txt


In [None]:
# Dataset paths
# NOTE(review): this is the same folder already assigned to label_path in the
# cleaning cell above; it is re-declared here with an identical value.
label_path = "/content/drive/MyDrive/TextClassification/Dataset/EmoEvaluation"

In [None]:
# Folder that receives the label files once they are converted to CSV
converted_labels_path = "/content/drive/MyDrive/TextClassification/Dataset/converted_labels"

# Create the folder (and any missing parents); exist_ok=True means an
# already-existing folder is not an error
os.makedirs(converted_labels_path, exist_ok=True)

In [None]:
# Make sure the output folder exists. exist_ok=True turns this into a no-op
# when the folder is already there and removes the check-then-create race of
# a separate os.path.exists() test.
os.makedirs(converted_labels_path, exist_ok=True)

# Turn one tab-separated label file into a four-column CSV
def convert_txt_to_csv(file_path, output_file_path):
    """Convert a tab-separated label text file into a CSV file.

    Lines beginning with "C-E" or "A-E" are ignored. Every other line is
    stripped of surrounding whitespace and split on tab characters only;
    it becomes a row when the split yields exactly four fields, and is
    silently dropped otherwise. The CSV is always written with the fixed
    header below, even when no row qualifies.

    Args:
        file_path: Path of the input .txt file.
        output_file_path: Path of the CSV file to create or overwrite.
    """
    header = ["% [START_TIME - END_TIME]", "TURN_NAME", "EMOTION", "[V, A, D]"]
    skipped_prefixes = ("C-E", "A-E")

    rows = []
    with open(file_path, 'r') as source:
        for raw_line in source:
            # The prefix test runs on the raw line, before any stripping
            if raw_line.startswith(skipped_prefixes):
                continue

            fields = raw_line.strip().split('\t')
            if len(fields) != len(header):
                continue
            rows.append(fields)

    # One frame built from all rows, written without the index column
    pd.DataFrame(rows, columns=header).to_csv(output_file_path, index=False)

# List all .txt files in the labels folder
label_files = [f for f in os.listdir(label_path) if f.endswith('.txt')]

if label_files:
    for file in label_files:
        file_path = os.path.join(label_path, file)

        # Swap only the extension. str.replace('.txt', '.csv') would rewrite
        # every '.txt' occurrence in the name, not just the trailing one.
        csv_name = os.path.splitext(file)[0] + '.csv'
        output_file_path = os.path.join(converted_labels_path, csv_name)

        # Convert the .txt file to .csv
        convert_txt_to_csv(file_path, output_file_path)
        print(f"Converted {file} to CSV.")
else:
    print("No .txt files found in the labels directory.")

Converted Ses05F_script01_2.txt to CSV.
Converted Ses05F_impro02.txt to CSV.
Converted Ses05F_impro05.txt to CSV.
Converted Ses05M_impro03.txt to CSV.
Converted Ses05M_script02_1.txt to CSV.
Converted Ses05F_script03_2.txt to CSV.
Converted Ses05M_script01_3.txt to CSV.
Converted Ses05M_impro01.txt to CSV.
Converted Ses05F_impro07.txt to CSV.
Converted Ses05F_impro06.txt to CSV.
Converted Ses05M_script02_2.txt to CSV.
Converted Ses05M_impro05.txt to CSV.
Converted Ses05F_impro08.txt to CSV.
Converted Ses05M_script01_1b.txt to CSV.
Converted Ses05M_impro04.txt to CSV.
Converted Ses05F_script02_1.txt to CSV.
Converted Ses05F_script02_2.txt to CSV.
Converted Ses05M_impro06.txt to CSV.
Converted Ses05M_script03_1.txt to CSV.
Converted Ses05M_impro07.txt to CSV.
Converted Ses05F_script01_3.txt to CSV.
Converted Ses05F_script03_1.txt to CSV.
Converted Ses05M_script01_1.txt to CSV.
Converted Ses05F_impro04.txt to CSV.
Converted Ses05M_script03_2.txt to CSV.
Converted Ses05F_impro03.txt to CSV

In [None]:
# Folder holding the raw transcription .txt files
transcriptions_path = '/content/drive/MyDrive/TextClassification/Dataset/transcriptions'

# Folder that receives one .csv per transcription file
output_folder = '/content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions'

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Column order of the generated CSV files. This is the single source of truth
# for the DataFrame built below and matches the order of the regex groups.
column_names = ['TURN_NAME', '% [START_TIME - END_TIME]', 'TEXT']

# Matches "<turn name> [<timestamp>]: <text>". Compiled once because it is
# applied to every line of every file.
line_pattern = re.compile(r'([^\[]+)\s+\[([^\]]+)\]:\s*(.*)')

# Iterate through all files in the folder
for filename in os.listdir(transcriptions_path):
    file_path = os.path.join(transcriptions_path, filename)

    # Only process .txt files
    if not filename.endswith('.txt'):
        continue

    try:
        with open(file_path, 'r') as file:
            lines = file.readlines()

        # One [turn_name, timestamp, text] row per matching line; lines that
        # do not match the pattern are skipped.
        rows = []
        for line in lines:
            match = line_pattern.match(line.strip())
            if match:
                rows.append([part.strip() for part in match.groups()])

        df = pd.DataFrame(rows, columns=column_names)

        # Swap only the extension. str.replace('.txt', '.csv') would rewrite
        # every '.txt' occurrence in the name, not just the trailing one.
        csv_file_path = os.path.join(output_folder, os.path.splitext(filename)[0] + '.csv')

        # Save the DataFrame as a .csv file in the output folder
        df.to_csv(csv_file_path, index=False)

        print(f"File {filename} converted to CSV and saved as {csv_file_path}.")

    except Exception as e:
        # Best effort: report the failing file and carry on with the rest
        print(f"Error processing {filename}: {e}")

File Ses05F_impro07.txt converted to CSV and saved as /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions/Ses05F_impro07.csv.
File Ses05M_script03_1.txt converted to CSV and saved as /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions/Ses05M_script03_1.csv.
File Ses05F_impro03.txt converted to CSV and saved as /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions/Ses05F_impro03.csv.
File Ses05F_impro08.txt converted to CSV and saved as /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions/Ses05F_impro08.csv.
File Ses05M_script01_2.txt converted to CSV and saved as /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions/Ses05M_script01_2.csv.
File Ses05M_impro03.txt converted to CSV and saved as /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions/Ses05M_impro03.csv.
File Ses05F_impro02.txt converted to CSV and saved as /content/drive/MyDrive/TextClassificatio

In [None]:
import os
import pandas as pd

# Folder whose CSV files are rewritten in place without their timestamp column
folder_path = '/content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions'

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)

    # Anything that is not a .csv file is left untouched
    if not filename.endswith('.csv'):
        continue

    df = pd.read_csv(file_path)

    # A column is dropped when its name contains all of '%', '[' and ']',
    # which is how "% [START_TIME - END_TIME]" is recognised
    columns_to_drop = [col for col in df.columns if all(mark in col for mark in '%[]')]

    # Write the trimmed frame back over the original file
    df = df.drop(columns=columns_to_drop)
    df.to_csv(file_path, index=False)
    print(f"Updated {filename} - Removed columns: {columns_to_drop}")

Updated Ses05F_impro07.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05M_impro03.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05M_script02_1.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05F_impro02.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05M_script01_3.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05F_script01_1.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05F_impro03.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05M_script01_2.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05M_impro04.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05F_impro08.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05M_script03_1.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05F_script01_3.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05M_script01_1b.csv - Removed columns: ['% [START_TIME - EN

In [None]:
import pandas as pd
import os

# Define the paths to your folders
labels_folder = '/content/drive/MyDrive/TextClassification/Dataset/converted_labels'
transcriptions_folder = '/content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions'

# List all files in the folders (assuming the files have a .csv extension)
labels_files = [f for f in os.listdir(labels_folder) if f.endswith('.csv')]
transcriptions_files = [f for f in os.listdir(transcriptions_folder) if f.endswith('.csv')]

# Check column names for each file in the labels folder.
# The label CSVs are written by DataFrame.to_csv with its default comma
# separator, so they are read with the default separator as well. Reading them
# with sep="\t" collapsed the whole header into a single column name.
# The per-file path gets its own variable so the notebook-level `label_path`
# (the EmoEvaluation folder listed by the earlier cells) is not overwritten.
for label_file in labels_files:
    label_file_path = os.path.join(labels_folder, label_file)
    labels_df = pd.read_csv(label_file_path)
    print(f"Columns in label file {label_file}: {labels_df.columns.tolist()}")

# Check column names for each file in the transcriptions folder
for transcription_file in transcriptions_files:
    transcription_path = os.path.join(transcriptions_folder, transcription_file)
    transcriptions_df = pd.read_csv(transcription_path, sep=",")
    print(f"Columns in transcription file {transcription_file}: {transcriptions_df.columns.tolist()}")

Columns in label file Ses05F_script01_2.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05M_impro03.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05F_script03_2.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05F_impro07.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05M_impro01.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05M_script02_2.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05F_impro06.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05M_script01_3.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05F_impro02.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05M_script02_1.csv: ['% [START_TIME - END_TIM

In [None]:
import pandas as pd
import os

# Folders holding the per-file label and transcription CSVs
labels_folder = '/content/drive/MyDrive/TextClassification/Dataset/converted_labels'
transcriptions_folder = '/content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions'

# Folder that receives the merged files
output_folder = '/content/drive/MyDrive/TextClassification/Dataset/merged_data'
os.makedirs(output_folder, exist_ok=True)

# Merge every label CSV with the transcription CSV of the same name instead of
# a single hard-coded pair. A later cell reads
# merged_data/Ses05M_script02_2.csv, which a one-file merge never produces on
# a fresh run. Sorted so the processing order is deterministic.
for file_name in sorted(f for f in os.listdir(labels_folder) if f.endswith('.csv')):
    file1_path = os.path.join(labels_folder, file_name)
    file2_path = os.path.join(transcriptions_folder, file_name)

    # A label file without a transcription counterpart cannot be merged
    if not os.path.exists(file2_path):
        print(f"Skipping {file_name}: no matching transcription file found")
        continue

    # Read the CSV files into pandas DataFrames
    df1 = pd.read_csv(file1_path)
    df2 = pd.read_csv(file2_path)

    # Merge on "TURN_NAME"; 'outer' keeps rows present in only one of the files
    merged_df = pd.merge(df1, df2, on="TURN_NAME", how="outer")

    # Save the merged file under the same name as its label file
    output_path = os.path.join(output_folder, file_name)
    merged_df.to_csv(output_path, index=False)

    print(f"Merged file saved as {output_path}")


Merged file saved as /content/drive/MyDrive/TextClassification/Dataset/merged_data/Ses05M_script03_2.csv


In [None]:
import pandas as pd

# Merged file to inspect
file_path = '/content/drive/MyDrive/TextClassification/Dataset/merged_data/Ses05M_script03_2.csv'
df = pd.read_csv(file_path)

# Cell-level NaN mask shared by both checks below
nan_mask = df.isna()

# 1. Rows that have a missing value in at least one column
missing_values = df[nan_mask.any(axis=1)]
if missing_values.empty:
    print("No missing values in any column.")
else:
    print("Rows with missing values:")
    print(missing_values)

# 2. Rows in which every column is NaN
empty_rows = df[nan_mask.all(axis=1)]
if empty_rows.empty:
    print("\nNo empty rows found.")
else:
    print("\nEmpty rows found (rows where all values are NaN):")
    print(empty_rows)


No missing values in any column.

No empty rows found.


In [None]:
import pandas as pd

# Merged file to fix up
file_path = '/content/drive/MyDrive/TextClassification/Dataset/merged_data/Ses05M_script02_2.csv'
df = pd.read_csv(file_path)

# Remove the row for turn 'Ses05M_script02_2_FXX0', but only when its EMOTION
# value is missing; every other row is kept
is_target_turn = df['TURN_NAME'] == 'Ses05M_script02_2_FXX0'
has_no_emotion = df['EMOTION'].isna()
df = df[~(is_target_turn & has_no_emotion)]

# Overwrite the same CSV file with the filtered rows
df.to_csv(file_path, index=False)

print(f"Updated file saved as {file_path}")


Updated file saved as /content/drive/MyDrive/TextClassification/Dataset/merged_data/Ses05M_script02_2.csv


################################################################################

Start of Data pre-processing

In [None]:
# Mount Google Drive at /content/drive for the pre-processing section.
# NOTE(review): this repeats the mount done in the first cell of the cleaning
# section; presumably kept so this section can start from a fresh runtime (confirm).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
