In [1]:
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv('https://raw.githubusercontent.com/GuviMentor88/Training-Datasets/refs/heads/main/twitter_training.csv', header=None)
df.columns = ["tweet_id", "entity", "sentiment", "tweet"]
df = df[["sentiment", "tweet"]]
# Download stopwords dataset
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:


# ... (rest of the code)

def clean_text(text):
    # Check if text is a string before applying lowercase conversion
    if isinstance(text, str):
        text = text.lower()  # Convert to lowercase
        text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Remove URLs
        text = re.sub(r"@\w+|\#", "", text)  # Remove mentions (@username) and hashtags
        text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
        text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
        return text
    else:
        # Handle non-string values (e.g., return an empty string or NaN)
        return ""  # or return float('nan')

# Apply cleaning to tweets, handling potential errors
df["clean_tweet"] = df["tweet"].apply(clean_text)

print(df[["tweet", "clean_tweet"]].head(10))  # Show cleaned tweets

                                               tweet  \
0  im getting on borderlands and i will murder yo...   
1  I am coming to the borders and I will kill you...   
2  im getting on borderlands and i will kill you ...   
3  im coming on borderlands and i will murder you...   
4  im getting on borderlands 2 and i will murder ...   
5  im getting into borderlands and i can murder y...   
6  So I spent a few hours making something for fu...   
7  So I spent a couple of hours doing something f...   
8  So I spent a few hours doing something for fun...   
9  So I spent a few hours making something for fu...   

                                         clean_tweet  
0                      im getting borderlands murder  
1                                coming borders kill  
2                        im getting borderlands kill  
3                       im coming borderlands murder  
4                    im getting borderlands 2 murder  
5                      im getting borderlands murder 

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert cleaned tweets into numerical features
vectorizer = TfidfVectorizer(max_features=5000)  # Use top 5000 words
X = vectorizer.fit_transform(df["clean_tweet"]).toarray()

print("Shape of TF-IDF transformed data:", X.shape)


Shape of TF-IDF transformed data: (74682, 5000)


In [5]:
print(df["sentiment"].isna().sum())  # Check number of NaN values
print(df[df["sentiment"].isna()])  # Display rows with NaN values




0
Empty DataFrame
Columns: [sentiment, tweet, clean_tweet]
Index: []


In [6]:
df['sentiment'].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [None]:
# Convert string labels to integers, then remove NaN values
#df["sentiment"] = df["sentiment"].map(label_map)

# Drop NaN rows after mapping
df.dropna(subset=["sentiment"], inplace=True)

# Convert to integer type safely
#df["sentiment"] = df["sentiment"].astype(int)

print(df["sentiment"].isna().sum())  # Should print 0
print(df["sentiment"].dtype)  # Should be int


0
object


In [None]:
# Convert string labels to integers
label_map = {"Irrelevant":3,"Positive": 2, "Neutral": 1, "Negative": 0}
df["sentiment"] = df["sentiment"].map(label_map)

# Drop rows where sentiment is NaN after mapping
df.dropna(subset=["sentiment"], inplace=True)

# Convert to integer type
df["sentiment"] = df["sentiment"].astype(int)


In [None]:
print(df["sentiment"].isna().sum())  # Should print 0
print(df["sentiment"].dtype)  # Should be int
print(df["sentiment"].unique())  # Should show [0, 1, 2] only


0
int64
[2 1 0 3]


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Embedding, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import pickle
# Define vocabulary size and sequence length
vocab_size = 5000
max_length = 100

# Tokenize text
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(df["clean_tweet"])
X_seq = tokenizer.texts_to_sequences(df["clean_tweet"])
X_padded = pad_sequences(X_seq, maxlen=max_length, padding="post")

# Convert labels to integers
#label_map = {"Positive": 2, "Neutral": 1, "Negative": 0}
#df["sentiment"] = df["sentiment"].map(label_map).astype(int)
# Save tokenizer
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

print("Tokenizer saved as tokenizer.pkl")
# Convert labels to categorical format
y = tf.keras.utils.to_categorical(df["sentiment"], num_classes=4)

# Split data into training and validation
X_train, X_val, y_train, y_val = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Define CNN Model
model = Sequential([
    Embedding(vocab_size, 128, input_length=max_length),
    SpatialDropout1D(0.2),
    Conv1D(128, 5, activation="relu"),  # 1D Convolution Layer
    GlobalMaxPooling1D(),  # Reduces sequence length
    Dense(64, activation="relu"),
    Dropout(0.5),
    Dense(4, activation="softmax")  # 3-class classification
])

# Compile the model
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32, verbose=1)

# Save model
model.save("cnn_sentiment_model.h5")


Tokenizer saved as tokenizer.pkl
Epoch 1/10




[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 50ms/step - accuracy: 0.4928 - loss: 1.1471 - val_accuracy: 0.7088 - val_loss: 0.7496
Epoch 2/10
[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 50ms/step - accuracy: 0.7576 - loss: 0.6484 - val_accuracy: 0.7775 - val_loss: 0.5810
Epoch 3/10
[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 52ms/step - accuracy: 0.8436 - loss: 0.4174 - val_accuracy: 0.8104 - val_loss: 0.5269
Epoch 4/10
[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 50ms/step - accuracy: 0.8804 - loss: 0.3142 - val_accuracy: 0.8168 - val_loss: 0.5253
Epoch 5/10
[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 50ms/step - accuracy: 0.8982 - loss: 0.2627 - val_accuracy: 0.8210 - val_loss: 0.5619
Epoch 6/10
[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 50ms/step - accuracy: 0.9126 - loss: 0.2221 - val_accuracy: 0.8256 - val_loss: 0.5496
Epoch 7/10



In [None]:
import tensorflow as tf
model = tf.keras.models.load_model("cnn_sentiment_model.h5")
# Specify either .keras or .h5 extension
model.save("saved_model.keras")  # Or model.save("saved_model.h5")



In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Load test dataset (Replace with actual test data)
X_test = [...]  # Your test sequences
y_test = [...]  # True labels

# Predict sentiment classes
y_pred = np.argmax(model.predict(X_test), axis=1)

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test), multi_class="ovr")

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")


ValueError: Unrecognized data type: x=[Ellipsis] (of type <class 'list'>)

In [None]:
pip install streamlit


Collecting streamlit
  Downloading streamlit-1.43.1-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.43.1-py2.py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[

In [None]:
import mysql.connector

# Replace with your actual database details
db_config = {
    "host": "sentiment-db.cbkegyy2203z.ap-south-1.rds.amazonaws.com",
    "user": "admin",
    "password": "harikrishnanaero",
    "database": "sentiment_analysis"
}

try:
    conn = mysql.connector.connect(**db_config)
    cursor = conn.cursor()
    cursor.execute("SELECT NOW();")
    result = cursor.fetchone()
    print(f"✅ Successfully connected to MySQL! Server time: {result[0]}")
    conn.close()
except Exception as e:
    print(f"❌ Connection failed: {e}")


✅ Successfully connected to MySQL! Server time: 2025-03-09 19:59:16


In [None]:
import streamlit as st
import mysql.connector
from datetime import datetime

# Database connection
db_config = {
    "host": "sentiment-db.cbkegyy2203z.ap-south-1.rds.amazonaws.com",
    "user": "admin",
    "password": "your_password",
    "database": "sentiment_analysis"
}

def log_user_login(username):
    try:
        conn = mysql.connector.connect(**db_config)
        cursor = conn.cursor()
        query = "INSERT INTO user_logins (username, login_time) VALUES (%s, %s);"
        cursor.execute(query, (username, datetime.now()))
        conn.commit()
        conn.close()
        st.success(f"✅ User {username} login recorded in MySQL!")
    except Exception as e:
        st.error(f"❌ Error logging user login: {e}")

# Streamlit UI
st.title("Sentiment Analysis Web App")

username = st.text_input("Enter Username:")
if st.button("Login"):
    log_user_login(username)


