# Sentiment Analysis Notebook
This notebook performs sentiment analysis using a logistic regression model with TF-IDF vectorization.

In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK resources used later in this notebook: 'wordnet' backs
# WordNetLemmatizer and 'stopwords' backs stopwords.words('english').
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /Users/mymac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mymac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1. Load and Explore Data
Load the training and test datasets, and display basic information.

In [4]:
# Read the training and test splits from the local dataset folder
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")

# For each split, report its dimensions followed by a preview of the first rows
for heading, frame in (("Train shape:", train_df), ("\nTest shape:", test_df)):
    print(heading, frame.shape)
    print(frame.head())

Train shape: (24732, 4)
       textID                                               text  \
0  28ac06f416                        good luck with your auction   
1  92098cf9a7  Hmm..You can`t judge a book by looking at its ...   
2  7858ff28f2   Hello, yourself. Enjoy London. Watch out for ...   
3  b0c9c67f32         We can`t even call you from belgium  sucks   
4  7b36e9e7a5                                 not so good mood..   

                                       selected_text sentiment  
0                        good luck with your auction  positive  
1  Hmm..You can`t judge a book by looking at its ...   neutral  
2                                    They`re mental.  negative  
3                                            m  suck  negative  
4                                 not so good mood..  negative  

Test shape: (2748, 3)
       textID                                               text  \
0  102f98e5e2                          Happy Mother`s Day hahaha   
1  033b399113  Sor

## 2. Preprocess Data
Define a function to clean and preprocess the text data, then apply it to the training text.

In [5]:
# Compiled once so the patterns are not looked up again for every row.
_URL_PATTERN = re.compile(r'http\S+')
_NON_ALPHA_PATTERN = re.compile(r'[^a-zA-Z\s]')

# Filled on first use with the default lemmatizer and stop-word set, so the
# NLTK corpora are read once instead of once per call.
_PREPROCESS_DEFAULTS = {}


def _default_text_resources():
    """Return the shared (lemmatizer, stop-word set), creating them on first use."""
    if not _PREPROCESS_DEFAULTS:
        lemmatizer = WordNetLemmatizer()
        stop_words = frozenset(stopwords.words('english'))
        # Store both together so a failure above never leaves a half-filled cache.
        _PREPROCESS_DEFAULTS.update(lemmatizer=lemmatizer, stop_words=stop_words)
    return _PREPROCESS_DEFAULTS['lemmatizer'], _PREPROCESS_DEFAULTS['stop_words']


def preprocess_text(text, lemmatizer=None, stop_words=None):
    """Clean a single piece of text for vectorization.

    Steps, in order: strip URLs, delete every character that is not an ASCII
    letter or whitespace, lowercase, drop stop words, lemmatize what remains.

    Parameters
    ----------
    text : str or missing value
        Raw text. ``None`` and float NaN (how pandas represents an empty CSV
        cell) yield an empty string; any other non-string is converted with
        ``str()`` first.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to a shared ``WordNetLemmatizer`` instance.
    stop_words : container of str, optional
        Words to discard. Defaults to the NLTK English stop-word list.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    # `text != text` is true only for NaN; avoids a TypeError inside re.sub.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    if lemmatizer is None or stop_words is None:
        default_lemmatizer, default_stop_words = _default_text_resources()
        if lemmatizer is None:
            lemmatizer = default_lemmatizer
        if stop_words is None:
            stop_words = default_stop_words

    text = _URL_PATTERN.sub('', text)  # Remove URLs
    # Characters are deleted, not replaced by a space, so "a.b" becomes "ab".
    text = _NON_ALPHA_PATTERN.sub('', text)  # Remove non-alphabetic characters
    text = text.lower()  # Convert to lowercase
    return ' '.join(
        lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words
    )

# Labels are taken as-is; the tweet text is cleaned before it becomes the feature column
y = train_df['sentiment']
X = train_df['text'].apply(preprocess_text)

## 3. Split Data
Split the training data into training and validation sets.

In [6]:
# Hold out 20% of the rows for validation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

## 4. Build and Train Model
Create a pipeline with TF-IDF vectorization and logistic regression, and perform hyperparameter tuning.

In [7]:
# TF-IDF features feeding a logistic-regression classifier. The values given
# here are starting points; the grid search below overrides the tuned ones.
pipeline = Pipeline(steps=[
    (
        "tfidf",
        TfidfVectorizer(
            lowercase=True,
            stop_words="english",
            ngram_range=(1, 2),
            max_features=5000,
        ),
    ),
    (
        "clf",
        LogisticRegression(C=1.0, max_iter=1000, random_state=42),
    ),
])

# 2 n-gram ranges x 2 vocabulary sizes x 3 regularization strengths = 12 candidates
param_grid = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],
    tfidf__max_features=[5000, 10000],
    clf__C=[0.1, 1, 10],
)

# Exhaustive search over the grid, scored by macro-averaged F1 with 3-fold CV
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the pipeline refit with the best-scoring parameter combination
best_model = grid_search.best_estimator_

Fitting 3 folds for each of 12 candidates, totalling 36 fits


## 5. Evaluate Model
Evaluate the model on the validation set using the Macro F1 score.

In [8]:
# Score the tuned pipeline on the held-out validation split
y_val_pred = best_model.predict(X_val)
f1_macro = f1_score(y_true=y_val, y_pred=y_val_pred, average='macro')

# Report the validation score together with the winning grid-search settings
print(f"\nF1 macro after tuning: {f1_macro:.4f}")
print("Best parameters:", grid_search.best_params_)


F1 macro after tuning: 0.6910
Best parameters: {'clf__C': 1, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2)}


## 6. Predict on Test Data
Use the trained model to predict sentiment for the test dataset.

In [9]:
# Clean the test text with the same function used on the training text, then classify it
X_test = test_df['text'].map(preprocess_text)
test_predictions = best_model.predict(X_test)

# Show the first few predicted labels as a quick sanity check
print("\nExample of 10 sentiment predictions for test data:")
print(test_predictions[:10])


Example of 10 sentiment predictions for test data:
['positive' 'negative' 'positive' 'negative' 'positive' 'negative'
 'neutral' 'negative' 'positive' 'neutral']


## 7. Save Predictions
Save the predictions to a CSV file for submission.

In [10]:
# Keep the identifying columns of the test set and attach the predicted labels
submission_df = test_df[['textID', 'text', 'selected_text']].assign(
    predicted_sentiment=test_predictions
)

# Write the submission without the index column and confirm on stdout
submission_df.to_csv("my_submission.csv", index=False)
print("\nFile 'my_submission.csv' has been saved.")


File 'my_submission.csv' has been saved.
