# Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Define paths using raw strings for clarity
text_path = r'C:\Users\wangshuqi\Desktop\ML_Group1\Tweets\Group1\Data\dataset\text.csv'
emotion_stimulus_path = r'C:\Users\wangshuqi\Desktop\ML_Group1\Tweets\Group1\Data\dataset\emotion-stimulus.csv'
dailydialog_path = r'C:\Users\wangshuqi\Desktop\ML_Group1\Tweets\Group1\Data\dataset\dailydialog.csv'
emotion_sentimen_dataset_path = r'C:\Users\wangshuqi\Desktop\ML_Group1\Tweets\Group1\Data\dataset\emotion_sentimen_dataset.csv'

sampled_text_path = r'C:\Users\wangshuqi\Desktop\ML_Group1\Tweets\Group1\Data\Subdataset\sampled_text.csv'
sampled_dailydialog_path = r'C:\Users\wangshuqi\Desktop\ML_Group1\Tweets\Group1\Data\Subdataset\sampled_dailydialog.csv'
sampled_emotion_sentimen_dataset_path = r'C:\Users\wangshuqi\Desktop\ML_Group1\Tweets\Group1\Data\Subdataset\sampled_emotion_sentimen_dataset_path.csv'

combined_data_path = r'C:\Users\wangshuqi\Desktop\ML_Group1\Tweets\Group1\Data\Subdataset\combined_dataset.csv'
combined_data_path2 = r'C:\Users\wangshuqi\Desktop\ML_Group1\Tweets\Group1\Data\Subdataset\combined_dataset2.csv'
final_combined_data_path = r'C:\Users\wangshuqi\Desktop\ML_Group1\Tweets\Group1\Data\Subdataset\final_combined_dataset.csv'


# Sampling Text.csv dataset

In [2]:
# Load the dataset
data = pd.read_csv(text_path)

# Define the labels to extract and the number of samples for each label
selected_labels = [0, 1, 3, 4, 5]
samples_per_label = {0: 9000, 1: 3000, 3: 11000, 4: 18100, 5: 20000}  # Adjusted number for 'fear'

# Check if each label has enough samples
counts = data['label'].value_counts()
sufficient_data = all(counts.get(label, 0) >= samples_per_label[label] for label in selected_labels)

# Extract data based on labels
if sufficient_data:
    sampled_data = pd.concat([data[data['label'] == label].sample(samples_per_label[label], random_state=1) for label in selected_labels])
else:
    sampled_data = pd.concat([data[data['label'] == label].sample(min(counts.get(label, 0), samples_per_label[label]), random_state=1) for label in selected_labels])

# Replace numeric labels with emotion descriptors
emotion_map = {0: 'sadness', 1: 'joy', 3: 'anger', 4: 'fear', 5: 'surprise'}
sampled_data['label'] = sampled_data['label'].map(emotion_map)

# Keep only the 'label' and 'text' columns, and adjust the order
sampled_data = sampled_data[['label', 'text']]

# Save the extracted data to a new CSV file
sampled_data.to_csv(sampled_text_path, index=False)


# Combine with emotion-stimulus.csv dataset

In [3]:
import pandas as pd

# Load the emotion-stimulus dataset
emotion_data = pd.read_csv(emotion_stimulus_path)

# Filter the emotion-stimulus dataset to include only specified emotions
required_emotions = ['happy', 'sad', 'anger', 'fear', 'surprise']
emotion_data = emotion_data[emotion_data['Emotion'].isin(required_emotions)]

# Replace the 'Emotion' column labels to match the existing labels
emotion_map_new = {
    'happy': 'happiness',
    'sad': 'sadness',
    'anger': 'anger',
    'fear': 'fear',
    'surprise': 'surprise'
}
emotion_data['Emotion'] = emotion_data['Emotion'].map(emotion_map_new)

# Rename columns to match the combined dataset format
emotion_data.rename(columns={'Emotion': 'label', 'Text': 'text'}, inplace=True)

# Load the previously created sampled dataset
sampled_text_data = pd.read_csv(sampled_text_path)

# Concatenate the balanced dataset with the filtered emotion-stimulus dataset
combined_data = pd.concat([sampled_text_data, emotion_data], ignore_index=True)

# Save the combined dataset to a new CSV file
combined_data.to_csv(combined_data_path, index=False)


# Sampling dailydialog.csv dataset

In [4]:
import pandas as pd

# Load the dataset
data = pd.read_csv(dailydialog_path)

# Define the labels to extract and their corresponding emotion descriptors
label_map = {
    0: 'neutral',
    1: 'anger',
    3: 'fear',
    4: 'joy',  
    5: 'sadness',
    6: 'surprise'
}
selected_labels = list(label_map.keys())
samples_per_label = 13000

# Filter and sample data based on specified labels
sampled_data = pd.DataFrame()
for label in selected_labels:
    filtered_data = data[data['Emotion'] == label]
    if len(filtered_data) >= samples_per_label:
        sampled = filtered_data.sample(n=samples_per_label, random_state=1)
    else:
        sampled = filtered_data
    sampled_data = pd.concat([sampled_data, sampled])

# Map the numerical labels to emotion descriptors
sampled_data['Emotion'] = sampled_data['Emotion'].map(label_map)

# Rename columns to match the expected format
sampled_data.rename(columns={'Emotion': 'label', 'Text': 'text'}, inplace=True)

# Save the extracted and mapped data to a new CSV file
sampled_data.to_csv(sampled_dailydialog_path, index=False)


# Combine with sampled dailydialog.csv dataset

In [5]:

# Load the datasets
combined_data = pd.read_csv(combined_data_path)
dailydialog_data = pd.read_csv(sampled_dailydialog_path)

# Concatenate the two datasets
combined_data2 = pd.concat([combined_data, dailydialog_data], ignore_index=True)

# Save the final combined dataset to a new CSV file
combined_data2.to_csv(combined_data_path2, index=False)


# Sampling emotion_sentimen_dataset

In [6]:
# Load the dataset
data = pd.read_csv(emotion_sentimen_dataset_path)

# Remove the first column
data_cleaned = data.drop(data.columns[0], axis=1)

# Swap the 'text' and 'Emotion' columns
data_cleaned = data_cleaned[['Emotion', 'text']]

# Rename the columns
data_cleaned.columns = ['label', 'text']

# Filter out rows where 'label' is 'neutral', 'boredom', or 'empty'
labels_to_remove = ['neutral', 'boredom', 'empty']
data_filtered = data_cleaned[~data_cleaned['label'].isin(labels_to_remove)]

# Replace 'fun' with 'joy'
data_filtered['label'] = data_filtered['label'].replace('fun', 'joy')

# Save the filtered dataset to a new CSV file
data_filtered.to_csv(sampled_emotion_sentimen_dataset_path, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['label'] = data_filtered['label'].replace('fun', 'joy')


# final combined dataset

In [10]:

# Load the datasets
combined_data2 = pd.read_csv(combined_data_path2)
sampled_emotion_sentimen_dataset = pd.read_csv(sampled_emotion_sentimen_dataset_path)

# Concatenate the two datasets
final_combined_data = pd.concat([combined_data2,sampled_emotion_sentimen_dataset], ignore_index=True)

# List of emotions to remove
emotions_to_remove = ['worry', 'relief', 'enthusiasm', 'hate']

# Filter out rows with these emotions
final_combined_data = final_combined_data[~final_combined_data['label'].isin(emotions_to_remove)]

# Save the final combined dataset to a new CSV file
final_combined_data.to_csv(final_combined_data_path, index=False)


# Split and Save Datasets

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv(final_combined_data_path)

# remove duplicates
data_cleaned = data.drop_duplicates()

# Split the data into training and remaining data with 80% for training
train_data, remaining_data = train_test_split(data_cleaned, test_size=0.2, random_state=42)

# Split the remaining data into validation and test data equally
validation_data, test_data = train_test_split(remaining_data, test_size=0.5, random_state=42)

# Save the datasets to CSV files
train_data.to_csv('C:\\Users\\wangshuqi\\Desktop\\ML_Group1\\Tweets\\Group1\\Data\\train_data.csv', index=False)
validation_data.to_csv('C:\\Users\\wangshuqi\\Desktop\\ML_Group1\\Tweets\\Group1\\Data\\validation_data.csv', index=False)
test_data.to_csv('C:\\Users\\wangshuqi\\Desktop\\ML_Group1\\Tweets\\Group1\\Data\\test_data.csv', index=False)
