In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.optimizers import Adam


nltk.download('punkt')


file_path = r'C:\Users\User\Desktop\Tamil_sentiments.csv'
df = pd.read_csv(file_path, encoding='ISO-8859-1')


df_cleaned = df.dropna(axis=1, how='all')


def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower()  # Convert to lowercase
    text = word_tokenize(text)  # Tokenize
    text = [word for word in text if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(text)


df_cleaned['cleaned_text'] = df_cleaned['Text'].apply(preprocess_text)

# Handle NaN sentiments by filling with a placeholder
df_cleaned['Sentiment'] = df_cleaned['Sentiment'].fillna('unknown_state')

# Encode the labels
sentiment_mapping = {
    'Negative': 0,
    'Positive': 1,
    'Mixed_feelings': 2,
    'not-Tamil': 3,
    'unknown_state': 4
}
df_cleaned['label'] = df_cleaned['Sentiment'].map(sentiment_mapping)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_cleaned['cleaned_text'], df_cleaned['label'], test_size=0.2, random_state=42)

# Define maximum number of words and maximum sequence length
MAX_WORDS = 5000
MAX_SEQUENCE_LENGTH = 100

# Tokenize the text
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure uniform input size
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['cleaned_text'] = df_cleaned['Text'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Sentiment'] = df_cleaned['Sentiment'].fillna('unknown_state')
A value is tryi

In [2]:
# Define the model
model = Sequential()
model.add(Embedding(MAX_WORDS, 100, input_length=MAX_SEQUENCE_LENGTH))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(5, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Train the model
history = model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_split=0.2, verbose=1)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=1)
print(f'Test Accuracy: {accuracy}')

# Make predictions
y_pred = model.predict(X_test_pad)
y_pred_classes = np.argmax(y_pred, axis=1)

# Classification report
print(classification_report(y_test, y_pred_classes, target_names=sentiment_mapping.keys()))


Epoch 1/5




[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 115ms/step - accuracy: 0.6361 - loss: 1.1560 - val_accuracy: 0.6868 - val_loss: 0.9280
Epoch 2/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 119ms/step - accuracy: 0.6970 - loss: 0.8643 - val_accuracy: 0.7027 - val_loss: 0.8471
Epoch 3/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 130ms/step - accuracy: 0.7380 - loss: 0.7340 - val_accuracy: 0.7031 - val_loss: 0.8688
Epoch 4/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 125ms/step - accuracy: 0.7875 - loss: 0.6109 - val_accuracy: 0.6820 - val_loss: 0.9186
Epoch 5/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 132ms/step - accuracy: 0.8187 - loss: 0.5187 - val_accuracy: 0.6844 - val_loss: 0.9893
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - accuracy: 0.6710 - loss: 0.9922
Test Accuracy: 0.6887122392654419
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [8]:
def predict_sentiment(text):
    processed_text = preprocess_text(text)
    seq = tokenizer.texts_to_sequences([processed_text])
    padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
    pred = model.predict(padded)
    label = np.argmax(pred, axis=1)[0]
    for sentiment, index in sentiment_mapping.items():
        if index == label:
            return sentiment

# Take user input for a new Tamil sentence
new_sentence = input("Enter a Tamil sentence in Roman script: ")
predicted_sentiment = predict_sentiment(new_sentence)
print(f'Sentiment: {predicted_sentiment}')


Enter a Tamil sentence in Roman script:  Thala mass  Hvy sprt kerala Surya anna fans'


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Sentiment: Positive
