In [2]:
# Import necessary libraries
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk


# Define constants
POSITIVE = 1
NEGATIVE = 0
NEUTRAL = -1

# Define lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to preprocess the text data
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize the text
    words = word_tokenize(text)
    
    # Lemmatize and remove stop words
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    # Join the words back into a single string
    return ' '.join(words)

# Load the data
train_data = pd.read_csv('/Users/meetsmacbook/Downloads/archive (10)/train.csv')
test_data = pd.read_csv('/Users/meetsmacbook/Downloads/archive (10)/test.csv')

# Combine title and description into a single text column
train_data['text'] = train_data['Title'] + ' ' + train_data['Description']
test_data['text'] = test_data['Title'] + ' ' + test_data['Description']

# Preprocess the text columns in the data
train_data['processed_text'] = train_data['text'].apply(preprocess_text)
test_data['processed_text'] = test_data['text'].apply(preprocess_text)




In [3]:
# Instantiate the SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Define a function to get sentiment labels using VADER
def get_sentiment(text):
    # Calculate sentiment scores using VADER
    sentiment_scores = analyzer.polarity_scores(text)
    compound_score = sentiment_scores['compound']
    
    # Define sentiment thresholds
    if compound_score >= 0.05:
        return POSITIVE
    elif compound_score <= -0.05:
        return NEGATIVE
    else:
        return NEUTRAL

# Apply the function to the processed text column
train_data['sentiment_label'] = train_data['processed_text'].apply(get_sentiment)
test_data['sentiment_label'] = test_data['processed_text'].apply(get_sentiment)


# Print the first few rows to verify the sentiment labels
print(train_data[['text', 'sentiment_label']].head())
print(test_data[['text', 'sentiment_label']].head())

                                                text  sentiment_label
0  Wall St. Bears Claw Back Into the Black (Reute...               -1
1  Carlyle Looks Toward Commercial Aerospace (Reu...                1
2  Oil and Economy Cloud Stocks' Outlook (Reuters...                0
3  Iraq Halts Oil Exports from Main Southern Pipe...                0
4  Oil prices soar to all-time record, posing new...                0
                                                text  sentiment_label
0  Fears for T N pension after talks Unions repre...                0
1  The Race is On: Second Private Team Sets Launc...                1
2  Ky. Company Wins Grant to Study Peptides (AP) ...                1
3  Prediction Unit Helps Forecast Wildfires (AP) ...                1
4  Calif. Aims to Limit Farm-Related Smog (AP) AP...                0


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Class Index      120000 non-null  int64 
 1   Title            120000 non-null  object
 2   Description      120000 non-null  object
 3   text             120000 non-null  object
 4   processed_text   120000 non-null  object
 5   sentiment_label  120000 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 5.5+ MB


In [4]:

# Save the updated data with sentiment labels to CSV files
train_data.to_csv('/Users/meetsmacbook/Downloads/train_with_sentiment.csv', index=False)
test_data.to_csv('/Users/meetsmacbook/Downloads/test_with_sentiment.csv', index=False)

In [None]:
df