In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

# Load your dataset
df = pd.read_csv('labelled_data.csv')

# Prepare text data
texts = df['Comment Text']
labels = df['Sentiment Label']

# Map the sentiment labels to numerical values
label_mapping = {'NEG': 0, 'NEU': 1, 'POS': 2}
labels = labels.map(label_mapping)

# Split data
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Create TF-IDF vectorizer and transform the text data
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize and train the XGBoost classifier with progress bar
classifier = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
for _ in tqdm(range(1), desc="Training Progress"):
    classifier.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred = classifier.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.

Training Progress: 100%|██████████| 1/1 [00:36<00:00, 36.02s/it]

              precision    recall  f1-score   support

           0       0.82      0.43      0.56      1045
           1       0.88      0.96      0.92     11197
           2       0.89      0.79      0.83      4339

    accuracy                           0.88     16581
   macro avg       0.86      0.73      0.77     16581
weighted avg       0.88      0.88      0.87     16581






In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from tqdm import tqdm

# Load your dataset
df = pd.read_csv('labelled_data.csv')

# Prepare text data
texts = df['Comment Text']
labels = df['Sentiment Label']

# Map the sentiment labels to numerical values
label_mapping = {'NEG': 0, 'NEU': 1, 'POS': 2}
labels = labels.map(label_mapping)

# Split data
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Create TF-IDF vectorizer and transform the text data
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize the XGBoost classifier
classifier = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# Perform grid search with progress bar
with tqdm(total=1, desc="Grid Search Progress"):
    grid_search.fit(X_train_tfidf, y_train)
    tqdm.write(f"Best parameters found: {grid_search.best_params_}")

# Use the best estimator from the grid search
best_classifier = grid_search.best_estimator_

# Predict and evaluate
y_pred = best_classifier.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


Grid Search Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting 5 folds for each of 81 candidates, totalling 405 fits


Parameters: { "use_label_encoder" } are not used.

Grid Search Progress:   0%|          | 0/1 [3:22:12<?, ?it/s]


Best parameters found: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.7}
              precision    recall  f1-score   support

           0       0.78      0.52      0.63      1045
           1       0.90      0.95      0.93     11197
           2       0.89      0.83      0.86      4339

    accuracy                           0.89     16581
   macro avg       0.86      0.77      0.80     16581
weighted avg       0.89      0.89      0.89     16581



In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid
from sklearn.metrics import classification_report
from tqdm import tqdm

# Load your dataset
df = pd.read_csv('labelled_data.csv')

# Prepare text data
texts = df['Comment Text']
labels = df['Sentiment Label']

# Map the sentiment labels to numerical values
label_mapping = {'NEG': 0, 'NEU': 1, 'POS': 2}
labels = labels.map(label_mapping)

# Split data
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Create TF-IDF vectorizer and transform the text data
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize the Random Forest classifier
classifier = RandomForestClassifier(random_state=42)

# Define a narrower parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=0)

# Perform grid search with progress bar
param_combinations = list(ParameterGrid(param_grid))
with tqdm(total=len(param_combinations), desc="Grid Search Progress") as pbar:
    for params in param_combinations:
        grid_search.estimator.set_params(**params)
        grid_search.fit(X_train_tfidf, y_train)
        pbar.update(1)

print(f"Best parameters found: {grid_search.best_params_}")

# Use the best estimator from the grid search
best_classifier = grid_search.best_estimator_

# Predict and evaluate
y_pred = best_classifier.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


Grid Search Progress: 100%|██████████| 16/16 [2:46:48<00:00, 625.56s/it]  
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best parameters found: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1045
           1       0.69      1.00      0.82     11197
           2       0.99      0.09      0.16      4339

    accuracy                           0.70     16581
   macro avg       0.56      0.36      0.32     16581
weighted avg       0.73      0.70      0.59     16581



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
import joblib

# Load your dataset
df = pd.read_csv('labelled_data.csv')

# Prepare text data
texts = df['Comment Text']
labels = df['Sentiment Label']

# Map the sentiment labels to numerical values
label_mapping = {'NEG': 0, 'NEU': 1, 'POS': 2}
labels = labels.map(label_mapping)

# Split data
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Create TF-IDF vectorizer and transform the text data
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize and train the XGBoost classifier with the best parameters and progress bar
best_params = {
    'learning_rate': 0.2,
    'max_depth': 7,
    'n_estimators': 300,
    'subsample': 0.7,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
    'random_state': 42
}
classifier = XGBClassifier(**best_params)
for _ in tqdm(range(1), desc="Training Progress"):
    classifier.fit(X_train_tfidf, y_train)

# Save the model and vectorizer
joblib.dump(classifier, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

# Predict and evaluate
y_pred = classifier.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.

Training Progress: 100%|██████████| 1/1 [01:50<00:00, 110.11s/it]


              precision    recall  f1-score   support

           0       0.78      0.52      0.63      1045
           1       0.90      0.95      0.93     11197
           2       0.89      0.83      0.86      4339

    accuracy                           0.89     16581
   macro avg       0.86      0.77      0.80     16581
weighted avg       0.89      0.89      0.89     16581

