In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import os

# Fetch the NLTK resources this script relies on: the tokenizer data
# ('punkt' / 'punkt_tab') and the stopword corpus.
for _resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_resource)

# Lookup sets consulted by preprocess_text when discarding tokens.
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one feedback string for sentiment training.

    The text is lower-cased, split with NLTK's ``word_tokenize``, and then
    stripped of English stopwords and of tokens made up solely of
    punctuation characters.

    Args:
        text: Raw feedback text. Any non-string value (for example NaN)
            is treated as missing.

    Returns:
        The surviving tokens joined by single spaces, or ``""`` when
        ``text`` is not a string or no token survives filtering.
    """
    if not isinstance(text, str):
        return ""
    kept = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        # `punctuation` holds single characters only, so a plain
        # `token in punctuation` test lets multi-character punctuation
        # tokens ("...", "--", "``", "''") through. Check every character.
        if all(ch in punctuation for ch in token):
            continue
        kept.append(token)
    return " ".join(kept)

# Load raw data
# NOTE: these are absolute, machine-specific locations.
input_path = r"D:\Python\ResuMetrics\data\training_data\sentiment_training.csv"
output_path = r"D:\Python\ResuMetrics\data\training_data\sentiment_training_processed.csv"
df = pd.read_csv(input_path)

# Remove nulls
df = df.dropna()

# Preprocess text
df['feedback'] = df['feedback'].apply(preprocess_text)

# Normalize labels so that case or stray-whitespace variants
# (e.g. "Positive", "negative ") are not silently discarded below.
df['sentiment'] = df['sentiment'].astype(str).str.strip().str.lower()

# Validate labels, and drop rows whose feedback became empty after
# preprocessing: an empty string is written as an empty CSV field, which
# pd.read_csv loads back as NaN, so the saved file would contain missing
# values even though the check below reports none.
valid_labels = ['positive', 'negative', 'neutral']
keep_mask = df['sentiment'].isin(valid_labels) & (df['feedback'] != "")
df = df[keep_mask]

# Save processed data
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)

# Verify
print("Processed Data:")
print(df.head())
print("\nSentiment Distribution:")
print(df['sentiment'].value_counts())
print("\nMissing Values:")
print(df.isnull().sum())

[nltk_data] Downloading package punkt to C:\Users\ACER/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ACER/nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Processed Data:
                                            feedback sentiment
0  team collaboration fantastic feel supported ev...  positive
1  workload overwhelming 's little recognition ef...  negative
2       enjoy flexible hours trust company places us  positive
3  management ignores suggestions feels demotivating  negative
4  new project management tools made work much ea...  positive

Sentiment Distribution:
sentiment
positive    108
negative    108
Name: count, dtype: int64

Missing Values:
feedback     0
sentiment    0
dtype: int64
