# Flexible Sentiment Analysis using LSTM
This notebook can handle binary or multi-class sentiment datasets automatically.

Requirements:
- A CSV dataset with one text column and one label column (the column names can be anything).
- Labels can be integers (0,1,..) or strings ('positive', 'negative', etc.).

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Load dataset
Upload a CSV file with one text column and one label column. This cell uses `google.colab.files`, so it only runs in Google Colab.

In [None]:
# Upload a CSV through the Colab file picker and load it into `df`.
from google.colab import files

uploaded = files.upload()  # Works in Google Colab; keys are the uploaded file names

# Fail with a clear message when nothing was uploaded, instead of a NameError
# on `df` a few lines further down.
if not uploaded:
    raise ValueError('No file was uploaded; please upload a CSV file.')

# Only one file can become `df`. Keep the historical choice (the last
# uploaded file) but read just that one instead of parsing every upload and
# throwing all but the last away.
fn = list(uploaded)[-1]
if len(uploaded) > 1:
    print('Multiple files uploaded; using:', fn)
df = pd.read_csv(fn)

print('Dataset shape:', df.shape)
df.head()

## Detect text and label columns automatically
We will assume the text column contains strings and the label column is categorical or numeric.

In [None]:
# Detect the text column and the label column.
#
# String-typed columns are candidates for the text column; every other column
# is a candidate for the label column.
string_columns = [
    col for col in df.columns
    if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col])
]
other_columns = [col for col in df.columns if col not in string_columns]

if not string_columns:
    raise ValueError('No string column found to use as the text column.')

if other_columns:
    # Numeric (or otherwise non-string) labels: first string column is the
    # text, first non-string column is the label.
    text_column = string_columns[0]
    label_column = other_columns[0]
elif len(string_columns) >= 2:
    # String labels such as 'positive' / 'negative': every column is
    # string-typed, so dtype alone cannot tell them apart. Treat the column
    # with the longest average content as the text, and the remaining column
    # with the fewest distinct values as the label.
    mean_lengths = {
        col: df[col].astype(str).str.len().mean() for col in string_columns
    }
    text_column = max(string_columns, key=mean_lengths.get)
    label_column = min(
        (col for col in string_columns if col != text_column),
        key=lambda col: df[col].nunique(),
    )
else:
    raise ValueError(
        'Could not find a label column: the dataset needs one text column '
        'and one label column.'
    )

print('Text column:', text_column)
print('Label column:', label_column)

In [None]:
# Prepare text and labels
#
# Rows with a missing text or label are dropped first: astype(str) would turn
# a missing text into the literal string 'nan', and a missing label would
# either become a class of its own or make LabelEncoder fail on mixed types.
valid_rows = df[[text_column, label_column]].dropna()
dropped_rows = len(df) - len(valid_rows)
if dropped_rows:
    print('Dropped rows with missing text/label:', dropped_rows)

texts = valid_rows[text_column].astype(str).values
labels_raw = valid_rows[label_column].values

# Encode labels as consecutive integers 0..num_classes-1 (works for both
# numeric and string labels).
le = LabelEncoder()
labels_encoded = le.fit_transform(labels_raw)

# Determine number of classes
num_classes = len(le.classes_)
print('Number of classes:', num_classes)

# One-hot encode if multi-class; binary labels stay a 1-D 0/1 vector because
# the binary model ends in a single sigmoid unit.
if num_classes > 2:
    labels = to_categorical(labels_encoded, num_classes=num_classes)
else:
    labels = labels_encoded

print('Labels shape:', labels.shape)

In [None]:
# Hyperparameters (also read by the model-building and training cells)
epochs = 5
batch_size = 64
embedding_dim = 128  # size of each word vector
maxlen = 200         # every sequence is padded/truncated to this length
vocab_size = 10000   # the tokenizer keeps only the most frequent words

# Learn the vocabulary from the corpus, then map each text to word indices
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Fixed-length integer matrix for the network; labels are used as prepared
X, y = pad_sequences(sequences, maxlen=maxlen), labels

print('X shape:', X.shape)
print('y shape:', y.shape)

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print('Training samples:', len(X_train))
print('Testing samples:', len(X_test))

In [None]:
# Build LSTM model
#
# The output layer and the loss must agree with the label encoding: one-hot
# labels (more than two classes) need softmax + categorical cross-entropy,
# 0/1 labels need a single sigmoid unit + binary cross-entropy. Decide once so
# the two cannot drift apart.
is_multiclass = num_classes > 2
if is_multiclass:
    output_layer = Dense(num_classes, activation='softmax')
    loss_name = 'categorical_crossentropy'
else:
    output_layer = Dense(1, activation='sigmoid')
    loss_name = 'binary_crossentropy'

model = Sequential()
# `input_length` is deprecated on the Keras 3 Embedding layer, so it is no
# longer passed; the sequence length is supplied via model.build() below.
model.add(Embedding(vocab_size, embedding_dim))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(output_layer)

# Build explicitly so summary() can report output shapes and parameter counts
# before the first call to fit().
model.build(input_shape=(None, maxlen))

model.compile(
    loss=loss_name,
    optimizer='adam',
    metrics=['accuracy']
)
model.summary()

In [None]:
# Fit the network on the training split; 20% of the training data is held out
# by Keras to report validation metrics after each epoch.
fit_options = {
    'epochs': epochs,
    'batch_size': batch_size,
    'validation_split': 0.2,
}
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Score the held-out test split. evaluate() returns the loss followed by the
# metrics given to compile() (here: accuracy).
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics

print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

In [None]:
# Plot the training curves: one panel per tracked metric, each showing the
# training and validation series side by side.
panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))

plt.figure(figsize=(12, 5))
for position, (metric, title) in enumerate(panels, start=1):
    plt.subplot(1, len(panels), position)
    plt.plot(history.history[metric], label=f'train_{metric}')
    plt.plot(history.history[f'val_{metric}'], label=f'val_{metric}')
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(title)
    plt.legend()

plt.show()