Data Cleaning

In [None]:
#Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Import necessary libraries
import os
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [None]:

# Define the path to the folder containing your text files
label_path = '/content/drive/MyDrive/TextClassification/Dataset/EmoEvaluation'

# Function to remove unwanted lines from a file
def remove_unwanted_lines(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Filter lines that don't start with "C-E" or "A-E"
    cleaned_lines = [line for line in lines if not (line.startswith("C-E") or line.startswith("A-E"))]

    # Write the cleaned lines back to the file
    with open(file_path, 'w') as file:
        file.writelines(cleaned_lines)

# List all .txt files in the folder
label_files = [f for f in os.listdir(label_path) if f.endswith('.txt')]

if label_files:
    for file in label_files:
        file_path = os.path.join(label_path, file)

        # Remove unwanted lines from the file
        remove_unwanted_lines(file_path)
        print(f"Processed {file}")
else:
    print("No .txt files found in the labels directory.")


Processed Ses05F_script01_2.txt
Processed Ses05F_impro02.txt
Processed Ses05F_impro05.txt
Processed Ses05M_impro03.txt
Processed Ses05M_script02_1.txt
Processed Ses05F_script03_2.txt
Processed Ses05M_script01_3.txt
Processed Ses05M_impro01.txt
Processed Ses05F_impro07.txt
Processed Ses05F_impro06.txt
Processed Ses05M_script02_2.txt
Processed Ses05M_impro05.txt
Processed Ses05F_impro08.txt
Processed Ses05M_script01_1b.txt
Processed Ses05M_impro04.txt
Processed Ses05F_script02_1.txt
Processed Ses05F_script02_2.txt
Processed Ses05M_impro06.txt
Processed Ses05M_script03_1.txt
Processed Ses05M_impro07.txt
Processed Ses05F_script01_3.txt
Processed Ses05F_script03_1.txt
Processed Ses05M_script01_1.txt
Processed Ses05F_impro04.txt
Processed Ses05M_script03_2.txt
Processed Ses05F_impro03.txt
Processed Ses05M_impro08.txt
Processed Ses05M_script01_2.txt
Processed Ses05M_impro02.txt
Processed Ses05F_script01_1.txt
Processed Ses05F_impro01.txt


In [None]:
# Dataset paths
label_path = "/content/drive/MyDrive/TextClassification/Dataset/EmoEvaluation"

In [None]:
# Create output directories inside dataset path
converted_labels_path = "/content/drive/MyDrive/TextClassification/Dataset/converted_labels"

# Create the directories if they don't already exist
os.makedirs(converted_labels_path, exist_ok=True)

In [None]:
# Create output folder if it doesn't exist
if not os.path.exists(converted_labels_path):
    os.makedirs(converted_labels_path)

# Function to process each text file and convert it to CSV
def convert_txt_to_csv(file_path, output_file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Prepare a list to hold the rows of the DataFrame
    data = []

    # Process the lines to extract meaningful data
    for line in lines:
        # Skip unwanted lines starting with "C-E" or "A-E"
        if line.startswith("C-E") or line.startswith("A-E"):
            continue

        # Split by tabs and spaces to separate the columns
        split_line = line.strip().split('\t')

        if len(split_line) == 4:
            # If there are 4 items, we treat it as a valid row (e.g., [START_TIME - END_TIME], TURN_NAME, EMOTION, [V, A, D])
            data.append(split_line)

    # Create a DataFrame from the data
    df = pd.DataFrame(data, columns=["% [START_TIME - END_TIME]", "TURN_NAME", "EMOTION", "[V, A, D]"])

    # Save the DataFrame as a CSV file
    df.to_csv(output_file_path, index=False)

# List all .txt files in the folder
label_files = [f for f in os.listdir(label_path) if f.endswith('.txt')]

if label_files:
    for file in label_files:
        file_path = os.path.join(label_path, file)
        output_file_path = os.path.join(converted_labels_path, file.replace('.txt', '.csv'))

        # Convert the .txt file to .csv
        convert_txt_to_csv(file_path, output_file_path)
        print(f"Converted {file} to CSV.")
else:
    print("No .txt files found in the labels directory.")

Converted Ses05F_script01_2.txt to CSV.
Converted Ses05F_impro02.txt to CSV.
Converted Ses05F_impro05.txt to CSV.
Converted Ses05M_impro03.txt to CSV.
Converted Ses05M_script02_1.txt to CSV.
Converted Ses05F_script03_2.txt to CSV.
Converted Ses05M_script01_3.txt to CSV.
Converted Ses05M_impro01.txt to CSV.
Converted Ses05F_impro07.txt to CSV.
Converted Ses05F_impro06.txt to CSV.
Converted Ses05M_script02_2.txt to CSV.
Converted Ses05M_impro05.txt to CSV.
Converted Ses05F_impro08.txt to CSV.
Converted Ses05M_script01_1b.txt to CSV.
Converted Ses05M_impro04.txt to CSV.
Converted Ses05F_script02_1.txt to CSV.
Converted Ses05F_script02_2.txt to CSV.
Converted Ses05M_impro06.txt to CSV.
Converted Ses05M_script03_1.txt to CSV.
Converted Ses05M_impro07.txt to CSV.
Converted Ses05F_script01_3.txt to CSV.
Converted Ses05F_script03_1.txt to CSV.
Converted Ses05M_script01_1.txt to CSV.
Converted Ses05F_impro04.txt to CSV.
Converted Ses05M_script03_2.txt to CSV.
Converted Ses05F_impro03.txt to CSV

In [None]:
# Specify the folder path where your .txt files are located
transcriptions_path = '/content/drive/MyDrive/TextClassification/Dataset/transcriptions'

# Specify the path for the new folder where you want to save the .csv files
output_folder = '/content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions'

# Ensure the new folder exists
os.makedirs(output_folder, exist_ok=True)

# Column names to add
column_names = ['TURN_NAME', '% [START_TIME - END_TIME]', 'TEXT']

# Iterate through all files in the folder
for filename in os.listdir(transcriptions_path):
    file_path = os.path.join(transcriptions_path, filename)

    # Only process .txt files
    if filename.endswith('.txt'):
        try:
            # Open the file and process each line
            with open(file_path, 'r') as file:
                lines = file.readlines()

            # Prepare lists to hold data for the DataFrame
            turn_names = []
            timestamps = []
            texts = []

            # Process each line in the file
            for line in lines:
                # Use regex to extract speaker, timestamp, and text
                match = re.match(r'([^\[]+)\s+\[([^\]]+)\]:\s*(.*)', line.strip())
                if match:
                    turn_name = match.group(1).strip()
                    timestamp = match.group(2).strip()
                    text = match.group(3).strip()

                    turn_names.append(turn_name)
                    timestamps.append(timestamp)
                    texts.append(text)

            # Create a DataFrame
            df = pd.DataFrame({
                'TURN_NAME': turn_names,
                '% [START_TIME - END_TIME]': timestamps,
                'TEXT': texts
            })

            # Create new file path for saving as .csv in the new folder
            csv_file_path = os.path.join(output_folder, filename.replace('.txt', '.csv'))

            # Save the DataFrame as a .csv file in the new folder
            df.to_csv(csv_file_path, index=False)

            print(f"File {filename} converted to CSV and saved as {csv_file_path}.")

        except Exception as e:
            print(f"Error processing {filename}: {e}")

File Ses05F_impro07.txt converted to CSV and saved as /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions/Ses05F_impro07.csv.
File Ses05M_script03_1.txt converted to CSV and saved as /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions/Ses05M_script03_1.csv.
File Ses05F_impro03.txt converted to CSV and saved as /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions/Ses05F_impro03.csv.
File Ses05F_impro08.txt converted to CSV and saved as /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions/Ses05F_impro08.csv.
File Ses05M_script01_2.txt converted to CSV and saved as /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions/Ses05M_script01_2.csv.
File Ses05M_impro03.txt converted to CSV and saved as /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions/Ses05M_impro03.csv.
File Ses05F_impro02.txt converted to CSV and saved as /content/drive/MyDrive/TextClassificatio

In [None]:
import os
import pandas as pd

# Path to the folder containing the files
folder_path = '/content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions'

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)

    if filename.endswith('.csv'):  # Modify for other file formats like .xlsx if needed
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)

        # Find columns with the pattern % [START_TIME - END_TIME]
        columns_to_drop = [col for col in df.columns if '%' in col and '[' in col and ']' in col]

        # Drop the columns
        df.drop(columns=columns_to_drop, inplace=True)

        # Save the updated file back
        df.to_csv(file_path, index=False)
        print(f"Updated {filename} - Removed columns: {columns_to_drop}")

Updated Ses05F_impro07.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05M_impro03.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05M_script02_1.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05F_impro02.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05M_script01_3.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05F_script01_1.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05F_impro03.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05M_script01_2.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05M_impro04.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05F_impro08.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05M_script03_1.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05F_script01_3.csv - Removed columns: ['% [START_TIME - END_TIME]']
Updated Ses05M_script01_1b.csv - Removed columns: ['% [START_TIME - EN

In [None]:
import pandas as pd
import os

# Define the paths to your folders
labels_folder = '/content/drive/MyDrive/TextClassification/Dataset/converted_labels'
transcriptions_folder = '/content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions'

# List all files in the folders (assuming the files have a .csv extension)
labels_files = [f for f in os.listdir(labels_folder) if f.endswith('.csv')]
transcriptions_files = [f for f in os.listdir(transcriptions_folder) if f.endswith('.csv')]

# Check column names for each file in the labels folder
for label_file in labels_files:
    label_path = os.path.join(labels_folder, label_file)
    labels_df = pd.read_csv(label_path, sep="\t")
    print(f"Columns in label file {label_file}: {labels_df.columns.tolist()}")

# Check column names for each file in the transcriptions folder
for transcription_file in transcriptions_files:
    transcription_path = os.path.join(transcriptions_folder, transcription_file)
    transcriptions_df = pd.read_csv(transcription_path, sep=",")
    print(f"Columns in transcription file {transcription_file}: {transcriptions_df.columns.tolist()}")

Columns in label file Ses05F_script01_2.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05M_impro03.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05F_script03_2.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05F_impro07.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05M_impro01.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05M_script02_2.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05F_impro06.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05M_script01_3.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05F_impro02.csv: ['% [START_TIME - END_TIME],TURN_NAME,EMOTION,"[V, A, D]"']
Columns in label file Ses05M_script02_1.csv: ['% [START_TIME - END_TIM

In [None]:
import pandas as pd
import os

# Paths to your input files
file1_path = '/content/drive/MyDrive/TextClassification/Dataset/converted_labels/Ses05M_script03_2.csv'
file2_path = '/content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions/Ses05M_script03_2.csv'

# Read the CSV files into pandas DataFrames
df1 = pd.read_csv(file1_path)
df2 = pd.read_csv(file2_path)

# Merge the files based on the "TURN_NAME" column
merged_df = pd.merge(df1, df2, on="TURN_NAME", how="outer")  # Use 'outer' to keep all rows

# Create a new folder to save the merged file
output_folder = '/content/drive/MyDrive/TextClassification/Dataset/merged_data'
os.makedirs(output_folder, exist_ok=True)

# Get the file name from the original file path (keeping the name of the first file as an example)
file_name = os.path.basename(file1_path)

# Save the merged file in the new folder with the same name
output_path = os.path.join(output_folder, file_name)
merged_df.to_csv(output_path, index=False)

print(f"Merged file saved as {output_path}")


Merged file saved as /content/drive/MyDrive/TextClassification/Dataset/merged_data/Ses05M_script03_2.csv


In [None]:
import pandas as pd

# Path to your CSV file
file_path = '/content/drive/MyDrive/TextClassification/Dataset/merged_data/Ses05M_script03_2.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# 1. Check if there are any missing values in any column
missing_values = df[df.isna().any(axis=1)]  # Rows with missing values in any column
if not missing_values.empty:
    print("Rows with missing values:")
    print(missing_values)
else:
    print("No missing values in any column.")

# 2. Check for empty rows (rows where all columns are NaN)
empty_rows = df[df.isna().all(axis=1)]
if not empty_rows.empty:
    print("\nEmpty rows found (rows where all values are NaN):")
    print(empty_rows)
else:
    print("\nNo empty rows found.")


No missing values in any column.

No empty rows found.


In [None]:
import pandas as pd

# Path to the merged file
file_path = '/content/drive/MyDrive/TextClassification/Dataset/merged_data/Ses05M_script02_2.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Check if the specific row 'Ses05F_impro01_FXX0' has missing EMOTION and delete it if so
df = df[~((df['TURN_NAME'] == 'Ses05M_script02_2_FXX0') & df['EMOTION'].isna())]

# Save the modified DataFrame back to the same CSV file
df.to_csv(file_path, index=False)

print(f"Updated file saved as {file_path}")


Updated file saved as /content/drive/MyDrive/TextClassification/Dataset/merged_data/Ses05M_script02_2.csv


################################################################################

Start of Data pre-processing

In [None]:
import os
import pandas as pd
import re

# Define the contraction-expansion function
def expand_contractions(text, contractions):
    if not text:
        return text

    contraction_pattern = r"\b(" + "|".join(re.escape(contraction) for contraction in contractions.keys()) + r")\b"
    pattern = re.compile(contraction_pattern, re.IGNORECASE)

    def replace_contraction(match):
        word = match.group(0)
        lower_word = word.lower()

        if lower_word in contractions:
            replacement = contractions[lower_word]

            if word[0].isupper():
                replacement = replacement.capitalize()

            return replacement
        else:
            return word

    return pattern.sub(replace_contraction, text)

# Path to the folder containing your original files
folder_path = '/content/drive/MyDrive/TextClassification/Dataset/merged_data'

# Define the new folder path to save processed files
output_folder_path = '/content/drive/MyDrive/TextClassification/Preproccessing/contradictionWords'

# Ensure the output folder exists
if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)

# Define the current list of common contractions
contractions = {
    "can't": "cannot",
    "I'm": "I am",
    "don't": "do not",
    "that's": "that is",
    "didn't": "did not",
    "isn't": "is not",
    "they're": "they are",
    "There's": "There is",
    "doesn't": "does not",
    "hadn't": "had not",
    "It's": "It is",
    "woman's": "woman is",  # Assuming possessive "woman's" means "woman is"
    "won't": "will not",
    "You're": "You are",
    "aren't": "are not",
    "They're": "They are",
    "wasn't": "was not",
    "I'll": "I will",
    "needn't": "need not",
    "wouldn't": "would not",
    "I've": "I have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "what's": "what is",
    "where's": "where is",
    "how's": "how is",
    "who's": "who is",
    "mightn't": "might not",  # Added contraction
    "shouldn't": "should not",  # Added contraction
    "mustn't": "must not"  # Added contraction
}

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):  # Check for CSV files
        file_path = os.path.join(folder_path, filename)

        # Read the CSV file into a pandas DataFrame
        df = pd.read_csv(file_path)

        # Check if the 'TEXT' column exists
        if 'TEXT' in df.columns:
            # Fill NaN values with an empty string and ensure all data is in string format
            df['TEXT'] = df['TEXT'].fillna('').astype(str)


            # Apply contraction expansion to the 'TEXT' column
            df['TEXT'] = df['TEXT'].apply(lambda x: expand_contractions(x, contractions))

            # Define the output file path (same file name but in the new folder)
            output_file_path = os.path.join(output_folder_path, filename)

            # Save the updated DataFrame to the new folder
            df.to_csv(output_file_path, index=False)

            print(f"Processed and saved to {output_file_path}")


Processed and saved to /content/drive/MyDrive/TextClassification/Preproccessing/contradictionWords/Ses05M_impro06.csv
Processed and saved to /content/drive/MyDrive/TextClassification/Preproccessing/contradictionWords/Ses05M_script01_1.csv
Processed and saved to /content/drive/MyDrive/TextClassification/Preproccessing/contradictionWords/Ses05M_impro08.csv
Processed and saved to /content/drive/MyDrive/TextClassification/Preproccessing/contradictionWords/Ses05M_impro07.csv
Processed and saved to /content/drive/MyDrive/TextClassification/Preproccessing/contradictionWords/Ses05M_impro05.csv
Processed and saved to /content/drive/MyDrive/TextClassification/Preproccessing/contradictionWords/Ses05M_script01_1b.csv
Processed and saved to /content/drive/MyDrive/TextClassification/Preproccessing/contradictionWords/Ses05F_script02_2.csv
Processed and saved to /content/drive/MyDrive/TextClassification/Preproccessing/contradictionWords/Ses05F_script02_1.csv
Processed and saved to /content/drive/MyDri

In [None]:
import os

# Define the folder path where your original files are located
folder_path = "/content/drive/MyDrive/TextClassification/Dataset/merged_data"

# List all files in the folder
files_in_folder = [filename for filename in os.listdir(folder_path) if filename.endswith('.csv')]

# Print the filenames and the total count
print("Filenames in the original folder:")
for file in files_in_folder:
    print(file)

print(f"\nTotal number of files: {len(files_in_folder)}")


Filenames in the original folder:
Ses05M_impro06.csv
Ses05M_script01_1.csv
Ses05M_impro08.csv
Ses05M_impro07.csv
Ses05M_impro05.csv
Ses05M_script01_1b.csv
Ses05F_script02_2.csv
Ses05F_script02_1.csv
Ses05F_script01_3.csv
Ses05F_script03_1.csv
Ses05M_impro04.csv
Ses05M_impro01.csv
Ses05M_impro03.csv
Ses05M_impro02.csv
Ses05F_script03_2.csv
Ses05F_script01_2.csv
Ses05F_impro07.csv
Ses05F_impro08.csv
Ses05F_script01_1.csv
Ses05F_impro06.csv
Ses05F_impro03.csv
Ses05F_impro01.csv
Ses05F_impro04.csv
Ses05F_impro02.csv
Ses05F_impro05.csv
Ses05M_script01_2.csv
Ses05M_script01_3.csv
Ses05M_script02_1.csv
Ses05M_script02_2.csv
Ses05M_script03_1.csv
Ses05M_script03_2.csv

Total number of files: 31


In [5]:
import nltk
nltk.data.clear_cache()  # Clear the cache to avoid conflicts
nltk.download('punkt')   # Redownload the 'punkt' tokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
nltk.download('punkt_tab')  # Try downloading the punkt_tab tokenizer just in case

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [7]:
import nltk
print(nltk.data.path)  # This will show the directories NLTK is checking for resources

['/root/nltk_data', '/usr/nltk_data', '/usr/share/nltk_data', '/usr/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


In [8]:
from nltk.tokenize import word_tokenize
import pandas as pd
import os

folder_path = "/content/drive/MyDrive/TextClassification/Preproccessing/contradictionWords"  # Original folder path with expanded contractions
output_path = "/content/drive/MyDrive/TextClassification/Preproccessing/Tokenization"  # New folder for tokenized files

# Ensure the output directory exists
os.makedirs(output_path, exist_ok=True)

def tokenize_text(text):
    """
    Tokenize the text using word_tokenize or other methods.
    For example, using NLTK's word_tokenize
    """
    if isinstance(text, str):
        return word_tokenize(text)
    return []

def check_and_tokenize_in_folder(folder_path, output_path):
    # Loop over files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):  # Adjust if files are in another format (e.g., .json, .xlsx)
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path)  # Read the CSV file

            if 'TEXT' in df.columns:
                # Check each entry in the TEXT column and tokenize if necessary
                for index, text in df['TEXT'].items():
                    if isinstance(text, str) and not pd.isna(text) and not isinstance(text, list):
                        print(f"File: {filename}, Row {index} is not tokenized. Tokenizing now...")
                        # Tokenize the text
                        df.at[index, 'TEXT'] = tokenize_text(text)
                    else:
                        print(f"File: {filename}, Row {index} is already tokenized.")

                # Add print statement here to confirm the file saving path
                output_file_path = os.path.join(output_path, filename)
                print(f"Saving file: {output_file_path}")  # This will print the output file path
                df.to_csv(output_file_path, index=False)  # Save to the new folder

            else:
                print(f"No 'TEXT' column found in {filename}")

# Usage
check_and_tokenize_in_folder(folder_path, output_path)


File: expanded_Ses05M_script03_2.csv, Row 0 is not tokenized. Tokenizing now...
File: expanded_Ses05M_script03_2.csv, Row 1 is not tokenized. Tokenizing now...
File: expanded_Ses05M_script03_2.csv, Row 2 is not tokenized. Tokenizing now...
File: expanded_Ses05M_script03_2.csv, Row 3 is not tokenized. Tokenizing now...
File: expanded_Ses05M_script03_2.csv, Row 4 is not tokenized. Tokenizing now...
File: expanded_Ses05M_script03_2.csv, Row 5 is not tokenized. Tokenizing now...
File: expanded_Ses05M_script03_2.csv, Row 6 is not tokenized. Tokenizing now...
File: expanded_Ses05M_script03_2.csv, Row 7 is not tokenized. Tokenizing now...
File: expanded_Ses05M_script03_2.csv, Row 8 is not tokenized. Tokenizing now...
File: expanded_Ses05M_script03_2.csv, Row 9 is not tokenized. Tokenizing now...
File: expanded_Ses05M_script03_2.csv, Row 10 is not tokenized. Tokenizing now...
File: expanded_Ses05M_script03_2.csv, Row 11 is not tokenized. Tokenizing now...
File: expanded_Ses05M_script03_2.csv, 

In [12]:
import os

# Define the folder path where your original files are located
folder_path = "/content/drive/MyDrive/TextClassification/Preproccessing/Tokenization"

# List all files in the folder
files_in_folder = [filename for filename in os.listdir(folder_path) if filename.endswith('.csv')]

# Print the filenames and the total count
print("Filenames in the original folder:")
for file in files_in_folder:
    print(file)

print(f"\nTotal number of files: {len(files_in_folder)}")


Filenames in the original folder:
Ses05F_impro01.csv
Ses05F_impro05.csv
Ses05F_impro06.csv
Ses05F_script01_3.csv
Ses05F_impro07.csv
Ses05F_impro03.csv
Ses05F_impro04.csv
Ses05F_script01_1.csv
Ses05F_impro02.csv
Ses05F_script01_2.csv
Ses05F_impro08.csv
Ses05F_script02_2.csv
Ses05F_script03_1.csv
Ses05F_script02_1.csv
Ses05M_impro02.csv
Ses05M_impro01.csv
Ses05M_impro03.csv
Ses05M_impro06.csv
Ses05F_script03_2.csv
Ses05M_impro04.csv
Ses05M_impro05.csv
Ses05M_impro07.csv
Ses05M_impro08.csv
Ses05M_script01_1b.csv
Ses05M_script01_1.csv
Ses05M_script01_3.csv
Ses05M_script01_2.csv
Ses05M_script02_2.csv
Ses05M_script02_1.csv
Ses05M_script03_2.csv
Ses05M_script03_1.csv

Total number of files: 31


In [10]:
import pandas as pd

# File paths
file1_path = "/content/drive/MyDrive/TextClassification/Preproccessing/Tokenization/Ses05M_script03_2.csv"
file2_path = "/content/drive/MyDrive/TextClassification/Preproccessing/Tokenization/expanded_Ses05M_script03_2.csv"

# Load the CSV files into pandas DataFrames
df1 = pd.read_csv(file1_path)
df2 = pd.read_csv(file2_path)

# Check if 'TURN_NAME' column exists in both files
if 'TURN_NAME' in df1.columns and 'TURN_NAME' in df2.columns:
    # Compare the 'TURN_NAME' columns between both DataFrames
    if df1['TURN_NAME'].equals(df2['TURN_NAME']):
        print("The TURN_NAME columns are identical in both files.")
    else:
        print("The TURN_NAME columns are different in both files.")
else:
    print("One or both of the files are missing the 'TURN_NAME' column.")

The TURN_NAME columns are identical in both files.


In [11]:
import os

# Define the file path
file_path = "/content/drive/MyDrive/TextClassification/Preproccessing/Tokenization/expanded_Ses05M_script03_2.csv"

# Check if the file exists before deleting
if os.path.exists(file_path):
    os.remove(file_path)
    print(f"{file_path} has been deleted.")
else:
    print(f"{file_path} does not exist.")

/content/drive/MyDrive/TextClassification/Preproccessing/Tokenization/expanded_Ses05M_script03_2.csv has been deleted.


In [13]:
import pandas as pd
import os
import string

# Function to remove punctuation and periods from tokenized text
def remove_punctuation_and_periods(tokens):
    # Remove punctuation and periods
    cleaned_tokens = [token for token in tokens if token not in string.punctuation and token != '.']
    return cleaned_tokens

# Function to check if there are still any punctuation or periods left in the tokenized text
def check_for_punctuation(tokens):
    # Check if there are any punctuation or period tokens left
    remaining_punctuation = [token for token in tokens if token in string.punctuation or token == '.']
    return len(remaining_punctuation) > 0

def process_files_in_folder(input_folder_path, output_folder_path):
    # Ensure the output directory exists
    os.makedirs(output_folder_path, exist_ok=True)

    # Loop over files in the input folder
    for filename in os.listdir(input_folder_path):
        if filename.endswith('.csv'):  # Adjust if files are in another format (e.g., .json, .xlsx)
            file_path = os.path.join(input_folder_path, filename)
            df = pd.read_csv(file_path)  # Read the CSV file

            # Assuming the column with tokenized text is called 'TEXT', adjust as needed
            if 'TEXT' in df.columns:
                # Process each entry in the TEXT column
                for index, text in df['TEXT'].items():
                    # If the text is in string format (tokenized as a string), convert it to a list
                    if isinstance(text, str):
                        try:
                            tokens = eval(text)  # Converts string representation of list into an actual list
                        except Exception as e:
                            print(f"Error evaluating tokens in File: {filename}, Row {index}: {e}")
                            tokens = []
                    else:
                        tokens = text

                    # Remove punctuation and periods from the tokenized text
                    updated_tokens = remove_punctuation_and_periods(tokens)

                    # Check if any punctuation or period still exists
                    if check_for_punctuation(updated_tokens):
                        print(f"File: {filename}, Row {index} still contains punctuation or period.")
                    else:
                        print(f"File: {filename}, Row {index} has no punctuation or period left.")

                    # Update the 'TEXT' column with the cleaned tokens
                    df.at[index, 'TEXT'] = updated_tokens

                # Save the updated dataframe to the output folder (new path)
                output_file_path = os.path.join(output_folder_path, filename)
                try:
                    df.to_csv(output_file_path, index=False)  # Save to the new folder
                    print(f"File {filename} processed and saved to {output_file_path}.")
                except Exception as e:
                    print(f"Error saving File: {filename}: {e}")

            else:
                print(f"No 'TEXT' column found in {filename}")

# Usage
input_folder_path = "/content/drive/MyDrive/TextClassification/Preproccessing/Tokenization"
output_folder_path = "/content/drive/MyDrive/TextClassification/Preproccessing/removingPunctuations"
process_files_in_folder(input_folder_path, output_folder_path)


File: Ses05F_impro01.csv, Row 0 has no punctuation or period left.
File: Ses05F_impro01.csv, Row 1 has no punctuation or period left.
File: Ses05F_impro01.csv, Row 2 has no punctuation or period left.
File: Ses05F_impro01.csv, Row 3 has no punctuation or period left.
File: Ses05F_impro01.csv, Row 4 has no punctuation or period left.
File: Ses05F_impro01.csv, Row 5 has no punctuation or period left.
File: Ses05F_impro01.csv, Row 6 has no punctuation or period left.
File: Ses05F_impro01.csv, Row 7 has no punctuation or period left.
File: Ses05F_impro01.csv, Row 8 has no punctuation or period left.
File: Ses05F_impro01.csv, Row 9 has no punctuation or period left.
File: Ses05F_impro01.csv, Row 10 has no punctuation or period left.
File: Ses05F_impro01.csv, Row 11 has no punctuation or period left.
File: Ses05F_impro01.csv, Row 12 has no punctuation or period left.
File: Ses05F_impro01.csv, Row 13 has no punctuation or period left.
File: Ses05F_impro01.csv, Row 14 has no punctuation or per

In [14]:
import os
import pandas as pd

# Folder paths
folder_path = "/content/drive/MyDrive/TextClassification/Preproccessing/removingPunctuations"  # Folder where tokenized files are
output_path = "/content/drive/MyDrive/TextClassification/Preproccessing/ConvertingIntoLowerCase"  # Folder where the lowercase files will be saved

# Ensure the output directory exists
os.makedirs(output_path, exist_ok=True)

def convert_to_lowercase(text):
    """Converts text to lowercase"""
    if isinstance(text, str):
        return text.lower()
    return text

def process_and_save_in_folder(folder_path, output_path):
    # Loop over files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):  # Only process CSV files
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path)  # Read the CSV file

            if 'TEXT' in df.columns:
                # Apply lowercase conversion to the 'TEXT' column
                df['TEXT'] = df['TEXT'].apply(convert_to_lowercase)

                # Define the output file path (same filename in the new folder)
                output_file_path = os.path.join(output_path, filename)
                print(f"Saving to: {output_file_path}")  # Print where the file will be saved

                # Save the updated DataFrame to the new folder
                df.to_csv(output_file_path, index=False)

            else:
                print(f"No 'TEXT' column found in {filename}")

# Usage
process_and_save_in_folder(folder_path, output_path)

Saving to: /content/drive/MyDrive/TextClassification/Preproccessing/ConvertingIntoLowerCase/Ses05F_impro01.csv
Saving to: /content/drive/MyDrive/TextClassification/Preproccessing/ConvertingIntoLowerCase/Ses05F_impro05.csv
Saving to: /content/drive/MyDrive/TextClassification/Preproccessing/ConvertingIntoLowerCase/Ses05F_impro06.csv
Saving to: /content/drive/MyDrive/TextClassification/Preproccessing/ConvertingIntoLowerCase/Ses05F_impro07.csv
Saving to: /content/drive/MyDrive/TextClassification/Preproccessing/ConvertingIntoLowerCase/Ses05F_impro02.csv
Saving to: /content/drive/MyDrive/TextClassification/Preproccessing/ConvertingIntoLowerCase/Ses05F_impro03.csv
Saving to: /content/drive/MyDrive/TextClassification/Preproccessing/ConvertingIntoLowerCase/Ses05F_impro04.csv
Saving to: /content/drive/MyDrive/TextClassification/Preproccessing/ConvertingIntoLowerCase/Ses05F_impro08.csv
Saving to: /content/drive/MyDrive/TextClassification/Preproccessing/ConvertingIntoLowerCase/Ses05F_script01_1.cs

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [29]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [60]:
import pandas as pd
import os
from nltk.corpus import stopwords
import ast  # Safer alternative to eval

# Initialize the stopwords set
stop_words = set(stopwords.words('english'))

# Function to remove stopwords from tokenized text
def remove_stopwords(tokens):
    # Remove stopwords from the token list
    cleaned_tokens = [token for token in tokens if token.lower() not in stop_words]
    removed_stopwords = [token for token in tokens if token.lower() in stop_words]
    return cleaned_tokens, removed_stopwords

def remove_stopwords_in_folder(input_folder_path, output_folder_path):
    # Ensure the output folder exists
    os.makedirs(output_folder_path, exist_ok=True)

    # Loop over files in the input folder
    for filename in os.listdir(input_folder_path):
        if filename.endswith('.csv'):  # Adjust if files are in another format (e.g., .json, .xlsx)
            file_path = os.path.join(input_folder_path, filename)
            df = pd.read_csv(file_path)  # Read the CSV file

            # Check if the 'TEXT' column exists
            if 'TEXT' in df.columns:
                print(f"\nProcessing File: {filename}")
                stopwords_removed_in_file = False  # Track if stopwords are removed in the file

                # Process each entry in the TEXT column
                for index, text in df['TEXT'].items():
                    # If the text is in string format (tokenized as a string), convert it to a list
                    if isinstance(text, str):
                        try:
                            tokens = ast.literal_eval(text)  # Converts string representation of list into an actual list safely
                        except (ValueError, SyntaxError):
                            tokens = text.split()  # Split text into words if it's not a proper list format
                    else:
                        tokens = text if isinstance(text, list) else text.split()  # If it's already a list, keep it; else split the string

                    # Show the original tokens before removing stopwords
                    print(f"Original Tokens (Row {index}): {tokens}")

                    # Remove stopwords
                    updated_tokens, removed_stopwords = remove_stopwords(tokens)

                    # Show the updated tokens and which stopwords were removed
                    print(f"Updated Tokens (Row {index}): {updated_tokens}")
                    print(f"Stopwords Removed (Row {index}): {removed_stopwords}")

                    # If stopwords were removed, update the flag
                    if removed_stopwords:
                        stopwords_removed_in_file = True
                        print(f"Stopwords were removed for Row {index} in file {filename}.")
                    else:
                        print(f"No stopwords removed for Row {index} in file {filename}.")

                    # Update the 'TEXT' column with the cleaned tokens
                    df.at[index, 'TEXT'] = updated_tokens

                # Define the path for saving the updated file to the output folder
                output_file_path = os.path.join(output_folder_path, filename)

                # After processing all rows, save the updated dataframe back to the output file
                df.to_csv(output_file_path, index=False)  # Save to the new folder

                # If stopwords were removed from any row in the file, display the message
                if stopwords_removed_in_file:
                    print(f"Stopwords were removed in at least one row in the file: {filename}")
                else:
                    print(f"No stopwords removed in the file: {filename}")

                print(f"Updated file saved: {output_file_path}")

            else:
                print(f"No 'TEXT' column found in {filename}")

# Usage
input_folder_path = "/content/drive/MyDrive/TextClassification/Preproccessing/ConvertingIntoLowerCase"
output_folder_path = "/content/drive/MyDrive/TextClassification/Preproccessing/RemovedStopWords"
remove_stopwords_in_folder(input_folder_path, output_folder_path)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Stopwords Removed (Row 45): ['what', 'is', 'you']
Stopwords were removed for Row 45 in file Ses05F_script02_2.csv.
Original Tokens (Row 46): ['there', "'s", 'nothing', 'to', 'it', 'we', 'come', 'down', 'here', 'the', 'grunion', 'arrives', 'and', 'they', 'do', 'their', 'little', 'fish', 'thing', 'and', 'we', 'say', 'oh', 'wow', 'look', 'at', 'the', 'little', 'fish', 'they', 'go', 'home', 'we', 'go', 'home']
Updated Tokens (Row 46): ["'s", 'nothing', 'come', 'grunion', 'arrives', 'little', 'fish', 'thing', 'say', 'oh', 'wow', 'look', 'little', 'fish', 'go', 'home', 'go', 'home']
Stopwords Removed (Row 46): ['there', 'to', 'it', 'we', 'down', 'here', 'the', 'and', 'they', 'do', 'their', 'and', 'we', 'at', 'the', 'they', 'we']
Stopwords were removed for Row 46 in file Ses05F_script02_2.csv.
Original Tokens (Row 47): ['why', 'not']
Updated Tokens (Row 47): []
Stopwords Removed (Row 47): ['why', 'not']
Stopwords were removed fo

In [61]:
import pandas as pd
import os
import ast  # Safer alternative to eval

def remove_empty_rows_in_folder(input_folder_path, output_folder_path):
    # Ensure the output folder exists
    os.makedirs(output_folder_path, exist_ok=True)

    # Loop over files in the input folder
    for filename in os.listdir(input_folder_path):
        if filename.endswith('.csv'):  # Adjust if files are in another format (e.g., .json, .xlsx)
            file_path = os.path.join(input_folder_path, filename)
            df = pd.read_csv(file_path)  # Read the CSV file

            # Check if the 'TEXT' column exists
            if 'TEXT' in df.columns:
                print(f"\nProcessing File: {filename}")

                # Define a function to check if a string is an empty list
                def is_empty_list(text):
                    try:
                        # If text is a string representation of a list (e.g., '[]'), convert it to an actual list
                        evaluated_text = ast.literal_eval(text)
                        return evaluated_text == []  # True if the evaluated text is an empty list
                    except (ValueError, SyntaxError):
                        # If text cannot be evaluated to a list, it's not an empty list
                        return text == ""  # Check if the text is an empty string

                # Identify rows where TEXT is empty or an empty list
                rows_to_delete = df[df['TEXT'].apply(is_empty_list)].index.tolist()

                # If there are rows to delete, display them and remove them
                if rows_to_delete:
                    print(f"\nDeleted Rows in file {filename}:")
                    for row in rows_to_delete:
                        print(f"Row {row} deleted: {df.iloc[row]}")  # Display the deleted row
                    # Drop the rows
                    df.drop(rows_to_delete, inplace=True)

                # Define the path for saving the updated file to the output folder
                output_file_path = os.path.join(output_folder_path, filename)

                # Save the cleaned dataframe back to the output file
                df.to_csv(output_file_path, index=False)  # Save to the new folder

                print(f"Updated file saved: {output_file_path}")
            else:
                print(f"No 'TEXT' column found in {filename}")

# Usage
input_folder_path = "/content/drive/MyDrive/TextClassification/Preproccessing/RemovedStopWords"
output_folder_path = "/content/drive/MyDrive/TextClassification/Preproccessing/AfterDeletingEmptyText"
remove_empty_rows_in_folder(input_folder_path, output_folder_path)


Processing File: Ses05F_impro01.csv

Deleted Rows in file Ses05F_impro01.csv:
Row 15 deleted: % [START_TIME - END_TIME]       [135.3800 - 136.7300]
TURN_NAME                         Ses05F_impro01_F015
EMOTION                                           fru
[V, A, D]                    [2.5000, 3.0000, 3.0000]
TEXT                                               []
Name: 15, dtype: object
Row 18 deleted: % [START_TIME - END_TIME]       [150.9500 - 152.5100]
TURN_NAME                         Ses05F_impro01_F018
EMOTION                                           ang
[V, A, D]                    [1.5000, 4.0000, 4.0000]
TEXT                                               []
Name: 18, dtype: object
Updated file saved: /content/drive/MyDrive/TextClassification/Preproccessing/AfterDeletingEmptyText/Ses05F_impro01.csv

Processing File: Ses05F_impro05.csv

Deleted Rows in file Ses05F_impro05.csv:
Row 25 deleted: % [START_TIME - END_TIME]       [228.3200 - 230.1800]
TURN_NAME                        

In [62]:
import pandas as pd

def delete_rows_by_turn_name(file_path, turn_names_to_delete):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)

    # Check if the 'TURN_NAME' column exists
    if 'TURN_NAME' in df.columns:
        # Delete rows where 'TURN_NAME' matches any of the values in the list
        df = df[~df['TURN_NAME'].isin(turn_names_to_delete)]

        # Save the updated DataFrame back to the same file path
        df.to_csv(file_path, index=False)  # Overwrites the original file
        print(f"Rows with TURN_NAME values {turn_names_to_delete} have been deleted and saved back to {file_path}.")
    else:
        print(f"The column 'TURN_NAME' was not found in {file_path}.")

# Specify the file paths and TURN_NAME values to delete
file1_path = "/content/drive/MyDrive/TextClassification/Preproccessing/AfterDeletingEmptyText/Ses05F_impro02.csv"
turn_names1_to_delete = ["Ses05F_impro02_M012"]  # TURN_NAME values for file1

file2_path = "/content/drive/MyDrive/TextClassification/Preproccessing/AfterDeletingEmptyText/Ses05F_impro04.csv"
turn_names2_to_delete = ["Ses05F_impro04_F026"]  # TURN_NAME values for file2

# Call the function for each file with its corresponding TURN_NAME values
delete_rows_by_turn_name(file1_path, turn_names1_to_delete)
delete_rows_by_turn_name(file2_path, turn_names2_to_delete)

Rows with TURN_NAME values ['Ses05F_impro02_M012'] have been deleted and saved back to /content/drive/MyDrive/TextClassification/Preproccessing/AfterDeletingEmptyText/Ses05F_impro02.csv.
Rows with TURN_NAME values ['Ses05F_impro04_F026'] have been deleted and saved back to /content/drive/MyDrive/TextClassification/Preproccessing/AfterDeletingEmptyText/Ses05F_impro04.csv.


In [65]:
import os
import pandas as pd
import ast

# Specify the folder containing the original files
folder_path = '/content/drive/MyDrive/TextClassification/Preproccessing/AfterDeletingEmptyText'

# Specify the folder where the cleaned files will be saved
new_folder_path = '/content/drive/MyDrive/TextClassification/Preproccessing/AfterDeletingUnwantedWords'

# Ensure the new folder exists
if not os.path.exists(new_folder_path):
    os.makedirs(new_folder_path)

# List of unwanted words to remove
unwanted_words = ["'s", "''", "'m", '--', '``', "'b", '...', '..', "'d", 'i-', 'is-', "'ll", 'c', "n't", "'re", 'it-','is-', 'we-']

# Function to clean text
def clean_text(text):
    # Convert string representation of list into a real list if it's in string format
    if isinstance(text, str):
        text = ast.literal_eval(text)

    # Remove unwanted words
    cleaned_text = [word for word in text if word not in unwanted_words]

    return cleaned_text

# Process all CSV files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):  # Adjust if you're working with other file types
        file_path = os.path.join(folder_path, filename)
        print(f"Processing file: {filename}")

        # Read the CSV file
        try:
            df = pd.read_csv(file_path)
            print(f"Read {filename} successfully.")
        except Exception as e:
            print(f"Error reading {filename}: {e}")
            continue

        # Check if "TEXT" column exists (all uppercase now)
        if 'TEXT' in df.columns:
            print(f"Found 'TEXT' column in {filename}. Cleaning...")

            # Apply cleaning function to the "TEXT" column
            df['TEXT'] = df['TEXT'].apply(clean_text)

            # Create the path for the cleaned file in the new folder
            new_file_path = os.path.join(new_folder_path, filename)

            try:
                # Save the cleaned file to the new folder
                df.to_csv(new_file_path, index=False)
                print(f"Processed and saved {filename} to {new_folder_path}")
            except Exception as e:
                print(f"Error saving {filename}: {e}")
        else:
            print(f"'TEXT' column not found in {filename}. Skipping...")

print("Cleaning complete for all files.")


Processing file: Ses05F_impro01.csv
Read Ses05F_impro01.csv successfully.
Found 'TEXT' column in Ses05F_impro01.csv. Cleaning...
Processed and saved Ses05F_impro01.csv to /content/drive/MyDrive/TextClassification/Preproccessing/AfterDeletingUnwantedWords
Processing file: Ses05F_impro05.csv
Read Ses05F_impro05.csv successfully.
Found 'TEXT' column in Ses05F_impro05.csv. Cleaning...
Processed and saved Ses05F_impro05.csv to /content/drive/MyDrive/TextClassification/Preproccessing/AfterDeletingUnwantedWords
Processing file: Ses05F_impro06.csv
Read Ses05F_impro06.csv successfully.
Found 'TEXT' column in Ses05F_impro06.csv. Cleaning...
Processed and saved Ses05F_impro06.csv to /content/drive/MyDrive/TextClassification/Preproccessing/AfterDeletingUnwantedWords
Processing file: Ses05F_impro07.csv
Read Ses05F_impro07.csv successfully.
Found 'TEXT' column in Ses05F_impro07.csv. Cleaning...
Processed and saved Ses05F_impro07.csv to /content/drive/MyDrive/TextClassification/Preproccessing/AfterDe

In [66]:
import pandas as pd

def delete_rows_by_turn_name(file_path, turn_names_to_delete):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)

    # Debugging: Print columns to ensure 'TURN_NAME' exists and is correct
    print(f"Columns in {file_path}: {df.columns}")

    # Check if the 'TURN_NAME' column exists
    if 'TURN_NAME' in df.columns:
        # Strip any extra spaces from column names
        df.columns = df.columns.str.strip()

        # Delete rows where 'TURN_NAME' matches any of the values in the list
        df = df[~df['TURN_NAME'].isin(turn_names_to_delete)]

        # Save the updated DataFrame back to the same file path
        df.to_csv(file_path, index=False)  # Overwrites the original file
        print(f"Rows with TURN_NAME values {turn_names_to_delete} have been deleted and saved back to {file_path}.")
    else:
        print(f"The column 'TURN_NAME' was not found in {file_path}.")

# Specify the file paths and TURN_NAME values to delete
file3_path = "/content/drive/MyDrive/TextClassification/Preproccessing/AfterDeletingUnwantedWords/Ses05M_script02_1.csv"
turn_names3_to_delete = ["Ses05M_script02_1_F016"]  # TURN_NAME values for file3

# Call the function for each file with its corresponding TURN_NAME values
delete_rows_by_turn_name(file3_path, turn_names3_to_delete)


Columns in /content/drive/MyDrive/TextClassification/Preproccessing/AfterDeletingUnwantedWords/Ses05M_script02_1.csv: Index(['% [START_TIME - END_TIME]', 'TURN_NAME', 'EMOTION', '[V, A, D]',
       'TEXT'],
      dtype='object')
Rows with TURN_NAME values ['Ses05M_script02_1_F016'] have been deleted and saved back to /content/drive/MyDrive/TextClassification/Preproccessing/AfterDeletingUnwantedWords/Ses05M_script02_1.csv.


In [67]:
import os
import pandas as pd
import spacy

# Initialize the spaCy model for lemmatization
nlp = spacy.load('en_core_web_sm')

# Folder paths
source_folder = '/content/drive/MyDrive/TextClassification/Preproccessing/AfterDeletingUnwantedWords'
destination_folder = '/content/drive/MyDrive/TextClassification/Preproccessing/Lemmatization'

# Function to lemmatize the text
def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

# Loop through all files in the source folder
for filename in os.listdir(source_folder):
    file_path = os.path.join(source_folder, filename)

    if file_path.endswith('.csv'):  # Check for CSV files, adjust for other formats as needed
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)

        # Ensure the column name is correct
        if 'TEXT' in df.columns:  # Replace 'TEXT' with the actual column name if needed
            # Apply lemmatization to the 'TEXT' column
            df['TEXT'] = df['TEXT'].apply(lemmatize_text)

            # Create a new folder if it doesn't exist
            if not os.path.exists(destination_folder):
                os.makedirs(destination_folder)

            # Save the DataFrame to a new CSV in the destination folder
            output_file_path = os.path.join(destination_folder, filename)
            df.to_csv(output_file_path, index=False)
            print(f"Lemmatized file saved: {output_file_path}")



Lemmatized file saved: /content/drive/MyDrive/TextClassification/Preproccessing/Lemmatization/Ses05F_impro01.csv
Lemmatized file saved: /content/drive/MyDrive/TextClassification/Preproccessing/Lemmatization/Ses05F_impro05.csv
Lemmatized file saved: /content/drive/MyDrive/TextClassification/Preproccessing/Lemmatization/Ses05F_impro06.csv
Lemmatized file saved: /content/drive/MyDrive/TextClassification/Preproccessing/Lemmatization/Ses05F_impro07.csv
Lemmatized file saved: /content/drive/MyDrive/TextClassification/Preproccessing/Lemmatization/Ses05F_impro02.csv
Lemmatized file saved: /content/drive/MyDrive/TextClassification/Preproccessing/Lemmatization/Ses05F_impro03.csv
Lemmatized file saved: /content/drive/MyDrive/TextClassification/Preproccessing/Lemmatization/Ses05F_impro04.csv
Lemmatized file saved: /content/drive/MyDrive/TextClassification/Preproccessing/Lemmatization/Ses05F_impro08.csv
Lemmatized file saved: /content/drive/MyDrive/TextClassification/Preproccessing/Lemmatization/Se

In [None]:
______________________________________________