In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# Load your dataset
df = pd.read_csv('labelled_data.csv')

# Prepare text data
texts = df['Comment Text']
labels = df['Sentiment Label']

# Map the sentiment labels to numerical values
label_mapping = {'NEG': 0, 'NEU': 1, 'POS': 2}
labels = labels.map(label_mapping)

# Split data
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Create TF-IDF vectorizer and transform the text data
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize individual models
log_clf = LogisticRegression(max_iter=1000)
rf_clf = RandomForestClassifier(n_estimators=100)

# Initialize and train the XGBoost classifier with progress bar
xgb_clf = XGBClassifier(learning_rate=0.2, max_depth=7, n_estimators=300, subsample=0.7, use_label_encoder=False, eval_metric='logloss', random_state=42)
for _ in tqdm(range(1), desc="Training XGBoost"):
    xgb_clf.fit(X_train_tfidf, y_train)

# Create the ensemble model
ensemble_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rf_clf), ('xgb', xgb_clf)],
    voting='soft'  # 'soft' voting uses predicted probabilities for classification
)

# Train the ensemble model
ensemble_clf.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = ensemble_clf.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['NEG', 'NEU', 'POS'])

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


Parameters: { "use_label_encoder" } are not used.

Training XGBoost: 100%|██████████| 1/1 [02:16<00:00, 136.41s/it]
Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8971714613111392
Classification Report:
              precision    recall  f1-score   support

         NEG       0.83      0.51      0.63      1045
         NEU       0.90      0.96      0.93     11197
         POS       0.89      0.84      0.86      4339

    accuracy                           0.90     16581
   macro avg       0.87      0.77      0.81     16581
weighted avg       0.90      0.90      0.89     16581



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# Load your dataset
df = pd.read_csv('labelled_data.csv')

# Prepare text data
texts = df['Comment Text']
labels = df['Sentiment Label']

# Map the sentiment labels to numerical values
label_mapping = {'NEG': 0, 'NEU': 1, 'POS': 2}
labels = labels.map(label_mapping)

# Split data
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Create TF-IDF vectorizer and transform the text data
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize individual models
log_clf = LogisticRegression(max_iter=1000)
rf_clf = RandomForestClassifier(n_estimators=100)

# Initialize and train the XGBoost classifier with progress bar
xgb_clf = XGBClassifier(learning_rate=0.2, max_depth=7, n_estimators=300, subsample=0.7, use_label_encoder=False, eval_metric='logloss', random_state=42)
for _ in tqdm(range(1), desc="Training XGBoost"):
    xgb_clf.fit(X_train_tfidf, y_train)

# Create the ensemble model
ensemble_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rf_clf), ('xgb', xgb_clf)],
    voting='soft'  # 'soft' voting uses predicted probabilities for classification
)

# Define k-fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform k-fold cross-validation
cv_results = cross_val_score(ensemble_clf, X_train_tfidf, y_train, cv=kfold, scoring='accuracy')

# Print cross-validation results
print(f'Cross-validation accuracy scores: {cv_results}')
print(f'Mean cross-validation accuracy: {cv_results.mean()}')

# Train the ensemble model on the entire training set
ensemble_clf.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = ensemble_clf.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['NEG', 'NEU', 'POS'])

print(f'Accuracy on test set: {accuracy}')
print('Classification Report:')
print(report)


Parameters: { "use_label_encoder" } are not used.

Training XGBoost: 100%|██████████| 1/1 [01:49<00:00, 109.25s/it]
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-validation accuracy scores: [0.89807765 0.8977761  0.8928006  0.88986054 0.89882388]
Mean cross-validation accuracy: 0.8954677546005911


Parameters: { "use_label_encoder" } are not used.



Accuracy on test set: 0.8975333212713347
Classification Report:
              precision    recall  f1-score   support

         NEG       0.83      0.51      0.64      1045
         NEU       0.90      0.96      0.93     11197
         POS       0.89      0.84      0.86      4339

    accuracy                           0.90     16581
   macro avg       0.87      0.77      0.81     16581
weighted avg       0.90      0.90      0.89     16581

