In [1]:
# Mount Google Drive at /content/drive so the dataset folders used below are readable.
# The google.colab import means this cell only runs inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
#Import necessary libraries
import os
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [34]:
# Dataset paths on the mounted Drive: the raw transcription folder and the
# EmoEvaluation folder that this notebook treats as the label source.
transcriptions_path = "/content/drive/MyDrive/TextClassification/Dataset/transcriptions"
label_path = "/content/drive/MyDrive/TextClassification/Dataset/EmoEvaluation"

# Function to count files and display the first line (header) of .txt files
def count_files_and_show_headers(directory):
    """Count the regular files in ``directory`` and collect the first line of each .txt file.

    The scan is not recursive: sub-directories are neither counted nor entered.

    Args:
        directory: Path of the folder to scan.

    Returns:
        A ``(file_count, headers)`` tuple. ``file_count`` is the number of
        regular files of any extension. ``headers`` contains the
        whitespace-stripped first line of every ``.txt`` file, in the order
        ``os.listdir`` yields the files.
    """
    headers = []
    file_count = 0

    for name in os.listdir(directory):
        full_path = os.path.join(directory, name)

        # Anything that is not a regular file is skipped entirely.
        if not os.path.isfile(full_path):
            continue
        file_count += 1

        # Files with other extensions are counted but contribute no header.
        if not name.endswith('.txt'):
            continue

        with open(full_path, 'r', encoding='utf-8') as handle:
            headers.append(handle.readline().strip())

    return file_count, headers

# Scan the raw transcription and label folders
transcriptions_files_count, transcriptions_headers = count_files_and_show_headers(
    transcriptions_path
)
label_files_count, label_headers = count_files_and_show_headers(label_path)

# Report the file count and collected first lines for each folder
for summary_line in (
    f"Number of files in 'transcriptions' folder: {transcriptions_files_count}",
    f"Headers in 'transcriptions' files: {transcriptions_headers}",
    f"Number of files in 'EmoEvaluation' folder: {label_files_count}",
    f"Headers in 'EmoEvaluation' files: {label_headers}",
):
    print(summary_line)

Number of files in 'transcriptions' folder: 31
Headers in 'transcriptions' files: ['Ses05F_impro07_F000 [002.7258-004.6600]: So guess what.', 'Ses05M_script03_1_M000 [003.1147-005.5100]: Oh Good God.', 'Ses05F_impro03_F000 [002.8098-004.5479]: Okay, so big news.', 'Ses05F_impro08_F000 [001.9734-003.7200]: Hi, sir.  How can I help you?', 'Ses05M_script01_2_F000 [004.9869-008.2400]: Why did you invite her here?', 'Ses05M_impro03_M000 [005.1000-007.0500]: Guess what?', 'Ses05F_impro02_F000 [005.1700-007.6605]: baby, I need you to sit down.', "Ses05M_script02_1_M000 [003.7900-024.8400]: What time is it?  They're supposed to run around midnight.  Oh, this is great, isn't it?  Look at the night we've got here.  It couldn't be better.  Actually, I wanted to go a little further up the coast you know to get away from all the lights and the people.  I was afraid we might miss it though, so how are you doing?", "Ses05M_impro04_M000 [005.3295-012.8500]: Uh, God.  I don't know what to do anymore.  

In [31]:
# Dataset paths, re-assigned with the same values as the earlier "Dataset paths" cell,
# so the conversion cells below can run after executing this cell alone.
transcriptions_path = "/content/drive/MyDrive/TextClassification/Dataset/transcriptions"
label_path = "/content/drive/MyDrive/TextClassification/Dataset/EmoEvaluation"

In [24]:
# Output folders for the converted CSV files, located inside the dataset folder
converted_transcriptions_path = "/content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions"
converted_labels_path = "/content/drive/MyDrive/TextClassification/Dataset/converted_labels"

# exist_ok=True keeps this cell safe to re-run once the folders already exist
for output_dir in (converted_transcriptions_path, converted_labels_path):
    os.makedirs(output_dir, exist_ok=True)

In [25]:
# Convert every .txt file in the transcriptions folder into a CSV of the same
# base name inside the 'converted_transcriptions' folder.
# isdir (rather than exists) so a stray file at this path cannot reach os.listdir.
if os.path.isdir(transcriptions_path):
    print(f"Transcriptions dataset is located at: {transcriptions_path}")

    # Only .txt files are converted; anything else in the folder is ignored
    transcriptions_files = [f for f in os.listdir(transcriptions_path) if f.endswith('.txt')]

    if transcriptions_files:
        for file in transcriptions_files:
            file_path = os.path.join(transcriptions_path, file)

            # Explicit UTF-8 (matching the header-preview cell) so decoding does
            # not depend on the platform's default locale encoding
            with open(file_path, 'r', encoding='utf-8') as f:
                transcription_data = f.read().strip()

            # NOTE(review): the whole file text is stored in ONE cell, so each CSV
            # has a single data row under the "Transcription" column
            transcription_df = pd.DataFrame([transcription_data], columns=["Transcription"])

            # Swap only the trailing extension; str.replace would also rewrite
            # any '.txt' occurring earlier in the file name
            csv_name = os.path.splitext(file)[0] + '.csv'
            output_file_path = os.path.join(converted_transcriptions_path, csv_name)
            transcription_df.to_csv(output_file_path, index=False)
            print(f"Saved {file} as CSV in {converted_transcriptions_path}.")
    else:
        print("No .txt files found in the transcriptions directory.")
else:
    print("Transcriptions dataset folder not found.")



Transcriptions dataset is located at: /content/drive/MyDrive/TextClassification/Dataset/transcriptions
Saved Ses05F_impro07.txt as CSV in /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions.
Saved Ses05M_script03_1.txt as CSV in /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions.
Saved Ses05F_impro03.txt as CSV in /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions.
Saved Ses05F_impro08.txt as CSV in /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions.
Saved Ses05M_script01_2.txt as CSV in /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions.
Saved Ses05M_impro03.txt as CSV in /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions.
Saved Ses05F_impro02.txt as CSV in /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions.
Saved Ses05M_script02_1.txt as CSV in /content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions.


In [26]:
# Convert every .txt file in the labels folder into a CSV of the same base name
# inside the 'converted_labels' folder.
# isdir (rather than exists) so a stray file at this path cannot reach os.listdir.
if os.path.isdir(label_path):
    print(f"Labels dataset is located at: {label_path}")

    # Only .txt files are converted; anything else in the folder is ignored
    label_files = [f for f in os.listdir(label_path) if f.endswith('.txt')]

    if label_files:
        for file in label_files:
            file_path = os.path.join(label_path, file)

            # Explicit UTF-8 (matching the header-preview cell) so decoding does
            # not depend on the platform's default locale encoding
            with open(file_path, 'r', encoding='utf-8') as f:
                label_data = f.read().strip()

            # NOTE(review): the whole file text is stored in ONE cell, so each CSV
            # has a single data row under the "Label" column
            label_df = pd.DataFrame([label_data], columns=["Label"])

            # Swap only the trailing extension; str.replace would also rewrite
            # any '.txt' occurring earlier in the file name
            csv_name = os.path.splitext(file)[0] + '.csv'
            output_file_path = os.path.join(converted_labels_path, csv_name)
            label_df.to_csv(output_file_path, index=False)
            print(f"Saved {file} as CSV in {converted_labels_path}.")
    else:
        print("No .txt files found in the labels directory.")
else:
    print("Labels dataset folder not found.")


Labels dataset is located at: /content/drive/MyDrive/TextClassification/Dataset/EmoEvaluation
Saved Ses05F_script01_2.txt as CSV in /content/drive/MyDrive/TextClassification/Dataset/converted_labels.
Saved Ses05F_impro02.txt as CSV in /content/drive/MyDrive/TextClassification/Dataset/converted_labels.
Saved Ses05F_impro05.txt as CSV in /content/drive/MyDrive/TextClassification/Dataset/converted_labels.
Saved Ses05M_impro03.txt as CSV in /content/drive/MyDrive/TextClassification/Dataset/converted_labels.
Saved Ses05M_script02_1.txt as CSV in /content/drive/MyDrive/TextClassification/Dataset/converted_labels.
Saved Ses05F_script03_2.txt as CSV in /content/drive/MyDrive/TextClassification/Dataset/converted_labels.
Saved Ses05M_script01_3.txt as CSV in /content/drive/MyDrive/TextClassification/Dataset/converted_labels.
Saved Ses05M_impro01.txt as CSV in /content/drive/MyDrive/TextClassification/Dataset/converted_labels.
Saved Ses05F_impro07.txt as CSV in /content/drive/MyDrive/TextClassifi

In [41]:
# Paths of the folders that hold the converted .csv files.
# NOTE(review): these carry the same values as converted_transcriptions_path /
# converted_labels_path assigned in the directory-creation cell, under a different naming style.
convertedTranscriptions_path = "/content/drive/MyDrive/TextClassification/Dataset/converted_transcriptions"
convertedLabel_path = "/content/drive/MyDrive/TextClassification/Dataset/converted_labels"

# Function to count files and show first rows of .csv files
def count_files_and_show_headers(directory):
    """Count the regular files in ``directory`` and sample the first cell of each .csv file.

    This definition reuses the name of the earlier .txt-based helper, so once
    this cell runs the earlier version is no longer reachable.

    Args:
        directory: Path of the folder to scan (sub-directories are skipped).

    Returns:
        A ``(file_count, headers)`` tuple. ``file_count`` is the number of
        regular files of any extension. ``headers`` contains, for every
        ``.csv`` file that could be read, the value in the first column of
        its first data row (not the CSV header line).
    """
    file_count = 0
    headers = []

    for name in os.listdir(directory):
        full_path = os.path.join(directory, name)

        # Anything that is not a regular file is skipped entirely.
        if not os.path.isfile(full_path):
            continue
        file_count += 1

        # Files with other extensions are counted but never parsed.
        if not name.endswith('.csv'):
            continue

        try:
            # nrows=1 keeps the read to a single data row
            first_row = pd.read_csv(full_path, nrows=1)
            first_cell = first_row.iloc[0, 0]
        except Exception as e:
            # Best effort: an unreadable or empty CSV is reported and skipped
            print(f"Error reading {name}: {e}")
        else:
            headers.append(first_cell)

    return file_count, headers

# Scan the converted (.csv) transcription and label folders
transcriptions_files_count, transcriptions_headers = count_files_and_show_headers(
    convertedTranscriptions_path
)
label_files_count, label_headers = count_files_and_show_headers(convertedLabel_path)

# Report the results.
# NOTE(review): the printed labels still name the raw 'transcriptions' and
# 'EmoEvaluation' folders although the converted folders are what was scanned.
for summary_line in (
    f"Number of files in 'transcriptions' folder: {transcriptions_files_count}",
    f"Headers in 'transcriptions' files: {transcriptions_headers}",
    f"Number of files in 'EmoEvaluation' folder: {label_files_count}",
    f"Headers in 'EmoEvaluation' files: {label_headers}",
):
    print(summary_line)

Number of files in 'transcriptions' folder: 31
Headers in 'transcriptions' files: ["Ses05F_impro07_F000 [002.7258-004.6600]: So guess what.\nSes05F_impro07_M000 [004.2400-005.2953]: what\nSes05F_impro07_F001 [005.0200-007.8100]: I got into college.\nSes05F_impro07_M001 [007.1500-010.0800]: Shut up.  What'd you get, where'd you get accepted to? please\nSes05F_impro07_F002 [009.2300-010.2500]: U.S.C..\nSes05F_impro07_M002 [010.1100-012.3400]: Oh, sweet.\nSes05F_impro07_F003 [011.0800-012.5700]: yeah.\nSes05F_impro07_M003 [012.3600-014.2600]: So you're not going to leave me.\nSes05F_impro07_F004 [013.6500-014.7900]: No.\nSes05F_impro07_M004 [014.4500-017.7449]: oh, good good good good good good.\nSes05F_impro07_F005 [014.9700-016.7000]: I'll stay in town.\nSes05F_impro07_M005 [017.7762-019.9400]: Are you going to live on campus?  Where are you going to live?\nSes05F_impro07_F006 [019.3700-021.6800]: Uh- I don't know.\nSes05F_impro07_M006 [021.0600-025.8200]: Uh! stay over here.  Like trav

In [None]:

# TODO: next step? combinations