In [1]:
pip install pandas pyarrow


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

# Source Parquet file and the CSV file it gets converted to
parquet_file = 'train-00000-of-00001.parquet'
csv_file = 'train.csv'

# Read the Parquet data into a DataFrame, then write it out as CSV
# without the index column
data = pd.read_parquet(parquet_file)
data.to_csv(csv_file, index=False)

print(f"File converted to {csv_file}")


File converted to train.csv


In [3]:

# FileLink produces a link to a local file in the notebook output
from IPython.display import FileLink

# Keep FileLink as the final expression of the cell so the notebook displays it.
# csv_file is not assigned in this cell; it must already be defined by the
# Parquet-to-CSV conversion cell.
FileLink(csv_file)



In [4]:
import pandas as pd

# Load both emotion datasets from their CSV files
emotions1_df = pd.read_csv('emotions.csv')
emotions2_df = pd.read_csv('train.csv')

# Preview the top rows of each frame, each under its own heading
print("Emotions1 DataFrame Head:", emotions1_df.head(), sep="\n")
print("\nEmotions2 DataFrame Head:", emotions2_df.head(), sep="\n")


Emotions1 DataFrame Head:
                                                text  label
0      i just feel really helpless and heavy hearted      4
1  ive enjoyed being able to slouch about relax a...      0
2  i gave up my internship with the dmrg and am f...      4
3                         i dont know i feel so lost      0
4  i am a kindergarten teacher and i am thoroughl...      4

Emotions2 DataFrame Head:
                                                text  emotion
0  i should have been at the pub instead of which...  sadness
1  ill just have to make some local friends i can...    anger
2                                i didnt feel so hot     love
3  i would further suggest people might feel more...     love
4                i am feeling irritable cranky often    anger


In [5]:
# Mapping from the numeric label codes used in emotions1_df to emotion words
label_mapping = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise'
}

# Translate only while the column still holds numeric codes. Re-running this
# cell after the labels are already words would otherwise turn every label into
# NaN, because Series.map yields NaN for values that are not keys of the mapping.
if pd.api.types.is_numeric_dtype(emotions1_df['label']):
    # Fail loudly on codes the mapping does not know instead of silently
    # producing NaN labels; the check runs before the column is modified.
    unknown_codes = set(emotions1_df['label'].dropna().unique()) - set(label_mapping)
    if unknown_codes:
        raise ValueError(
            f"Unmapped label codes in emotions1_df: {sorted(unknown_codes)}"
        )
    emotions1_df['label'] = emotions1_df['label'].map(label_mapping)

# Print the first few rows to verify the change
print("Emotions DataFrame with Categorical Labels:")
print(emotions1_df.head())

Emotions DataFrame with Categorical Labels:
                                                text    label
0      i just feel really helpless and heavy hearted     fear
1  ive enjoyed being able to slouch about relax a...  sadness
2  i gave up my internship with the dmrg and am f...     fear
3                         i dont know i feel so lost  sadness
4  i am a kindergarten teacher and i am thoroughl...     fear


In [6]:
# Call the 'emotion' column of emotions2_df 'label' instead
emotions2_df = emotions2_df.rename({'emotion': 'label'}, axis='columns')

# Show the first rows to confirm the new column name
print(emotions2_df.head())


                                                text    label
0  i should have been at the pub instead of which...  sadness
1  ill just have to make some local friends i can...    anger
2                                i didnt feel so hot     love
3  i would further suggest people might feel more...     love
4                i am feeling irritable cranky often    anger


In [7]:
# Record the size of each dataset before they are concatenated
emotions1_count = len(emotions1_df)
emotions2_count = len(emotions2_df)

# Label each count with the actual variable name so the output can be traced
# back to the frame it describes
print(f"Number of rows in emotions1_df: {emotions1_count}")
print(f"Number of rows in emotions2_df: {emotions2_count}")

Number of rows in emotions_df: 416809
Number of rows in train_df: 3432


In [8]:
# Both frames must expose the same columns; otherwise pd.concat would silently
# create extra columns padded with NaN (e.g. if 'emotion' was never renamed to
# 'label' in emotions2_df).
if set(emotions1_df.columns) != set(emotions2_df.columns):
    raise ValueError(
        "Column mismatch between datasets: "
        f"{list(emotions1_df.columns)} vs {list(emotions2_df.columns)}"
    )

# Stack the two datasets and renumber the rows from 0
combined_df = pd.concat([emotions1_df, emotions2_df], ignore_index=True)

# Print the first few rows of the combined dataframe
print("Combined DataFrame:")
print(combined_df.head())

# Report the combined size under the combined frame's own name
emotions_count = combined_df.shape[0]
print(f"Number of rows in combined_df: {emotions_count}")

Combined DataFrame:
                                                text    label
0      i just feel really helpless and heavy hearted     fear
1  ive enjoyed being able to slouch about relax a...  sadness
2  i gave up my internship with the dmrg and am f...     fear
3                         i dont know i feel so lost  sadness
4  i am a kindergarten teacher and i am thoroughl...     fear
Number of rows in emotions_df: 420241


In [9]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Fetch the NLTK resources used below: the stop word lists and WordNet
nltk.download('stopwords')
nltk.download('wordnet')

# English stop words, held in a set
# NOTE(review): preprocess_text as written does not reference stop_words;
# confirm whether stop word removal was intended.
stop_words = set(stopwords.words('english'))

# Word normalizers shared by the preprocessing function
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Define the text preprocessing function with stemming
def preprocess_text(text):
    """Normalize a raw text string into a space-separated string of stemmed tokens.

    Steps: lowercase, delete every character other than a-z and whitespace,
    split on whitespace, drop one-letter tokens, then lemmatize and stem each
    remaining token. Stop words are not removed.

    Args:
        text: The raw text. Values that are not strings (for example None or
            the NaN that pandas uses for missing cells) are treated as empty.

    Returns:
        The processed tokens joined by single spaces; '' if nothing remains.
    """
    # Missing values arrive as None/NaN when this function is applied to a
    # DataFrame column; they have no .lower(), so map them to an empty result
    # instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    # Lowercase, then strip digits, punctuation and any other non-letter characters
    cleaned = re.sub(r'[^a-z\s]', '', text.lower())

    # Lemmatize first, then stem; one-letter tokens (like a stray "t") are skipped
    processed_words = [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in cleaned.split()
        if len(word) > 1
    ]

    # Join words back into a single string
    return ' '.join(processed_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/katherine/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/katherine/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
# Run every raw text through preprocess_text and store the results in a new
# 'preprocessed_text' column of the combined DataFrame
combined_df['preprocessed_text'] = combined_df['text'].map(preprocess_text)

# Preview the top rows, now including the processed column
print("\nUpdated Combined DataFrame with processed text:", combined_df.head(), sep="\n")


Updated Combined DataFrame with processed text:
                                                text    label  \
0      i just feel really helpless and heavy hearted     fear   
1  ive enjoyed being able to slouch about relax a...  sadness   
2  i gave up my internship with the dmrg and am f...     fear   
3                         i dont know i feel so lost  sadness   
4  i am a kindergarten teacher and i am thoroughl...     fear   

                                   preprocessed_text  
0          just feel realli helpless and heavi heart  
1  ive enjoy be abl to slouch about relax and unw...  
2  gave up my internship with the dmrg and am fee...  
3                             dont know feel so lost  
4  am kindergarten teacher and am thoroughli wear...  


In [11]:
# Write the combined dataset to disk. index=False keeps the auto-generated row
# numbers out of the file (matching the earlier train.csv export), so reading
# it back does not produce a stray "Unnamed: 0" column.
combined_df.to_csv('processed_emotions_dataset.csv', index=False)