In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the preprocessed CSV, then keep only the rows that have no missing value
# in any column. The raw frame stays available as `dataset`.
dataset = pd.read_csv(filepath_or_buffer="/content/reddit_preprocessing.csv")
cleaned_dataset = dataset.dropna(axis=0, how="any")

In [3]:
# Model input is the comment text column; the target is the category column.
X_cleaned, y_cleaned = (
    cleaned_dataset['clean_comment'],
    cleaned_dataset['category'],
)

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train_cleaned, X_test_cleaned, y_train_cleaned, y_test_cleaned = train_test_split(
    X_cleaned,
    y_cleaned,
    test_size=0.2,
    random_state=42,
)

In [5]:
# TF-IDF over 1- to 3-grams, with the vocabulary capped at 10,000 features.
tfidf_cleaned = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=10000,
)
# Fit on the training split only, then reuse the fitted vectorizer on the test split.
X_train_tfidf_cleaned = tfidf_cleaned.fit_transform(X_train_cleaned)
X_test_tfidf_cleaned = tfidf_cleaned.transform(X_test_cleaned)

In [6]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [7]:
import lightgbm as lgb
import optuna
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split , cross_val_score, GridSearchCV


In [8]:

def objective(trial):
  """Optuna objective: mean 3-fold CV accuracy of a LightGBM classifier.

  Samples ``learning_rate``, ``n_estimators`` and ``max_depth`` from ``trial``,
  builds an ``LGBMClassifier`` with them, and scores it with cross-validation on
  the module-level ``X_train_tfidf_cleaned`` / ``y_train_cleaned``.

  Args:
    trial: the Optuna trial object used to suggest hyperparameter values.

  Returns:
    The mean accuracy across the 3 cross-validation folds.
  """
  param = {
    "objective": "multiclass",
    "num_class": 3,
    # The key must be the lowercase "learning_rate". The previous
    # "Learning_rate" spelling is not a LightGBM parameter name, so the
    # sampled value never reached the model and the default rate was used.
    "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1),
    "n_estimators": trial.suggest_int("n_estimators", 50, 500),
    "max_depth": trial.suggest_int("max_depth", 3, 20),
    "metric": "multi_logloss",
    "is_unbalance": True,
    "class_weight": "balanced",
  }
  # Define the LightGBM model with the trial parameters
  model = lgb.LGBMClassifier(**param)
  # Perform cross-validation
  scores = cross_val_score(model, X_train_tfidf_cleaned, y_train_cleaned, cv=3, scoring='accuracy')
  # Return the average score across folds
  return scores.mean()

In [None]:
# Seed the sampler so the hyperparameter search proposes the same trials on
# every Restart & Run All (same seed as the train/test split).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

In [None]:
# Parameter dict of the best trial recorded by `study` (requires the
# optimisation cell above to have been run in this kernel session).
best_params = study.best_params
print("Best hyperparameters:", best_params)

In [31]:
# Final classifier. The hyperparameters are entered by hand here; they are not
# read from `study.best_params`.
best_model = lgb.LGBMClassifier(
    # Task definition
    objective="multiclass",
    num_class=3,
    metric="multi_logloss",
    # Class-imbalance handling
    is_unbalance=True,
    class_weight="balanced",
    # Boosting schedule and tree depth
    n_estimators=367,
    learning_rate=0.08,
    max_depth=20,
    # L1 / L2 regularisation
    reg_alpha=0.1,
    reg_lambda=0.1,
)
best_model.fit(X=X_train_tfidf_cleaned, y=y_train_cleaned)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.136907 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 131995
[LightGBM] [Info] Number of data points in the train set: 29329, number of used features: 4439
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [32]:
# Predictions on the same matrix the model was fitted on (for training-set metrics).
y_train_pred = best_model.predict(X=X_train_tfidf_cleaned)



In [33]:
# Share of training comments whose predicted label equals the true label.
accuracy_train = accuracy_score(y_true=y_train_cleaned, y_pred=y_train_pred)
print("Training Accuracy:", accuracy_train)

Training Accuracy: 0.9271369634150499


In [34]:
# Per-class precision / recall / F1 on the training split.
report_train = classification_report(y_true=y_train_cleaned, y_pred=y_train_pred)
print("Training Classification Report:\n", report_train)

Training Classification Report:
               precision    recall  f1-score   support

          -1       0.91      0.90      0.91      6601
           0       0.88      0.98      0.93     10134
           1       0.98      0.90      0.94     12594

    accuracy                           0.93     29329
   macro avg       0.92      0.93      0.92     29329
weighted avg       0.93      0.93      0.93     29329



In [35]:
# Predictions on the held-out test matrix.
y_pred = best_model.predict(X=X_test_tfidf_cleaned)



In [36]:
# Share of held-out comments whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test_cleaned, y_pred=y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.8633574253375154


In [37]:
# Per-class precision / recall / F1 on the held-out test split.
report = classification_report(y_true=y_test_cleaned, y_pred=y_pred)
print("Test Classification Report:\n", report)

Test Classification Report:
               precision    recall  f1-score   support

          -1       0.80      0.78      0.79      1647
           0       0.84      0.97      0.90      2510
           1       0.92      0.82      0.87      3176

    accuracy                           0.86      7333
   macro avg       0.85      0.86      0.85      7333
weighted avg       0.87      0.86      0.86      7333



In [40]:
import re
import numpy as np

def preprocess_comment(comment):
    """Normalise a raw comment string before vectorisation.

    Steps, in order: lower-case the text; remove URL-like tokens (anything
    starting with ``http`` or ``www`` up to the next whitespace); replace every
    non-word character with a space (word characters, i.e. letters, digits and
    underscore, are kept); collapse whitespace runs to one space and trim.

    Args:
        comment: the raw comment text (``str``).

    Returns:
        The cleaned, single-spaced, lower-case string.
    """
    without_urls = re.sub(r"http\S+|www\S+|https\S+", '', comment.lower(), flags=re.MULTILINE)
    words_only = re.sub(r'\W', ' ', without_urls)
    return re.sub(r'\s+', ' ', words_only).strip()

def predict_sentiment(comment, tfidf_vectorizer, lgbm_model):
  """Classify a single raw comment and report the model's confidence.

  The model is run once: the class probabilities are computed, the most
  probable column is selected, and that column index is mapped back to its
  label through ``lgbm_model.classes_``. The returned label and confidence
  therefore always refer to the same class.

  Args:
    comment: raw comment text; it is cleaned with ``preprocess_comment`` first.
    tfidf_vectorizer: fitted vectorizer exposing ``transform``.
    lgbm_model: fitted classifier exposing ``predict_proba`` and ``classes_``.

  Returns:
    A dict with ``'sentiment_class'`` (predicted label as ``int``) and
    ``'confidence'`` (probability of that label as ``float``).
  """
  cleaned_comment = preprocess_comment(comment)
  comment_tfidf = tfidf_vectorizer.transform([cleaned_comment])
  # Only one comment is passed in, so row 0 holds its class probabilities.
  class_probabilities = lgbm_model.predict_proba(comment_tfidf)[0]
  # argmax yields a column index, not a label; translate it via classes_.
  best_index = int(np.argmax(class_probabilities))

  return {
      'sentiment_class': int(lgbm_model.classes_[best_index]),
      'confidence': float(class_probabilities[best_index])
  }


# comment = "I absolutely love this video!."
# result = predict_sentiment(comment, tfidf_cleaned, best_model)
# print(f"Predicted Sentiment: {result['sentiment_class']}, Confidence:{result['confidence']}")

In [51]:
# Sample comments for manual spot checks. Only `comment10` is scored below;
# swap the first argument of `predict_sentiment` to try another one.
comment1 = "I absolutely love this videol"
comment2 = "The explanations were confusing and the video quality was poor."
comment3 = "I didn't learn anything useful. Really disappointed."
comment4 = "Wow, the explanation was so clear and helpful. Definitely subscribing!"
comment5 = "This is the worst video I've seen on this topic, very misleading"
comment6 = "Not much to say about this, just a standard video."
comment7 = "The video is okay, but I expected more depth in the content."
comment8 = "Superb content! Mazaa aa gaya dekh ke. Best video on this topic!"
comment9 = "Poor video quality aur explanation bhi weak tha."
comment10 = "Yeh video theek tha, but I was expecting more depth."

result = predict_sentiment(
    comment10,
    tfidf_vectorizer=tfidf_cleaned,
    lgbm_model=best_model,
)
print(f"Predicted Sentiment: {result['sentiment_class']}, Confidence: {result['confidence']}")

Predicted Sentiment: 0, Confidence: 0.7813419986979802


