In [2]:
#pip install transformers datasets torch
#pip install fasttext

In [None]:
import pandas as pd
import fasttext
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import os

In [1]:
class FastTextSentiment:
    def __init__(self):
        self.model = None
        
    def prepare_fasttext_format(self, df, output_file, text_column='review_text', label_column='sentiment'):
        """
        Convert DataFrame to FastText format
        Format: __label__positive This is a great product!
        """
        print(f"Preparing FastText format for {len(df)} samples...")
        
        with open(output_file, 'w', encoding='utf-8') as f:
            for _, row in df.iterrows():
                text = str(row[text_column]).replace('\n', ' ').strip()
                label = f"__label__{row[label_column]}"
                f.write(f"{label} {text}\n")
        print(f"Saved FastText data to: {output_file}")
    
    def train_model(self, train_file, model_save_path='sentiment_model.bin'):
        """Train FastText model"""
        print("Training FastText model...")
        
        self.model = fasttext.train_supervised(
            input=train_file,
            epoch=25,
            lr=1.0,
            wordNgrams=2,  # Uses bigrams for better context
            dim=300,
            loss='softmax',
            verbose=2
        )
        
        # Save the model
        self.model.save_model(model_save_path)
        print(f"Model saved to: {model_save_path}")
        
        return self.model
    
    def predict_sentiment(self, texts):
        """Predict sentiment for a list of texts"""
        if not self.model:
            raise ValueError("Model not trained yet!")
        
        if isinstance(texts, str):
            texts = [texts]
        
        predictions = []
        confidences = []
        
        for text in texts:
            # FastText expects clean text
            clean_text = str(text).replace('\n', ' ').strip()
            preds, scores = self.model.predict(clean_text, k=1)  # k=1 for top prediction
            
            # Extract label and confidence
            label = preds[0].replace('__label__', '')
            confidence = scores[0]
            
            predictions.append(label)
            confidences.append(confidence)
        
        return predictions, confidences
    
    def evaluate_model(self, test_file):
        """Comprehensive evaluation on test data"""
        print("\n" + "="*50)
        print("EVALUATION RESULTS")
        print("="*50)
        
        # Load test data for evaluation
        test_results = self.model.test(test_file)
        print(f"FastText Test Results: Precision@1: {test_results[1]:.3f}, Recall@1: {test_results[2]:.3f}")
        
        # More detailed evaluation
        true_labels = []
        pred_labels = []
        
        with open(test_file, 'r', encoding='utf-8') as f:
            for line in f:
                true_label = line.split()[0].replace('__label__', '')
                text = ' '.join(line.split()[1:])
                
                pred_label, _ = self.predict_sentiment(text)
                
                true_labels.append(true_label)
                pred_labels.append(pred_label[0])
        
        # Detailed metrics
        print(f"\nAccuracy: {accuracy_score(true_labels, pred_labels):.3f}")
        print(f"\nClassification Report:")
        print(classification_report(true_labels, pred_labels, target_names=['negative', 'neutral', 'positive']))
        
        # Confusion matrix
        print(f"\nConfusion Matrix:")
        print(confusion_matrix(true_labels, pred_labels, labels=['negative', 'neutral', 'positive']))
        
        return true_labels, pred_labels

# Main execution function
def run_fasttext_experiment(csv_file, text_col='review_text', label_col='sentiment'):
    """
    Complete pipeline: Load data, train FastText, evaluate on test set
    """
    # Load your dataset
    print("Loading dataset...")
    df = pd.read_csv(csv_file)
    print(f"Dataset loaded: {len(df)} total reviews")
    print(f"Label distribution:\n{df[label_col].value_counts()}")
    
    # Split data (80% train, 20% test)
    from sklearn.model_selection import train_test_split
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df[label_col])
    
    print(f"Training set: {len(train_df)} samples")
    print(f"Test set: {len(test_df)} samples")
    
    # Initialize FastText trainer
    ft = FastTextSentiment()
    
    # Prepare FastText format files
    train_file = "fasttext_train.txt"
    test_file = "fasttext_test.txt"
    
    ft.prepare_fasttext_format(train_df, train_file, text_col, label_col)
    ft.prepare_fasttext_format(test_df, test_file, text_col, label_col)
    
    # Train model
    model_path = "sentiment_model.bin"
    ft.train_model(train_file, model_path)
    
    # Evaluate on test set
    true_labels, pred_labels = ft.evaluate_model(test_file)
    
    # Test with some examples
    print("\n" + "="*50)
    print("SAMPLE PREDICTIONS")
    print("="*50)
    
    sample_texts = test_df[text_col].head(5).tolist()
    sample_true = test_df[label_col].head(5).tolist()
    
    predictions, confidences = ft.predict_sentiment(sample_texts)
    
    for i, (text, true, pred, conf) in enumerate(zip(sample_texts, sample_true, predictions, confidences)):
        print(f"\nSample {i+1}:")
        print(f"Text: {text[:100]}...")
        print(f"True: {true} | Predicted: {pred} | Confidence: {conf:.3f}")
        print(f"✓ Correct" if true == pred else "✗ Wrong")
    
    return ft, true_labels, pred_labels

# If you want to load a pre-trained model later
def load_and_test_model(model_path, test_csv_file, text_col='review_text', label_col='sentiment'):
    """Load pre-trained model and test on new data"""
    ft = FastTextSentiment()
    ft.model = fasttext.load_model(model_path)
    
    test_df = pd.read_csv(test_csv_file)
    
    print(f"Testing on {len(test_df)} samples...")
    
    predictions, confidences = ft.predict_sentiment(test_df[text_col].tolist())
    true_labels = test_df[label_col].tolist()
    
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Accuracy on test data: {accuracy:.3f}")
    print("\nClassification Report:")
    print(classification_report(true_labels, predictions))
    
    return predictions, confidences, true_labels

In [19]:
# Run the complete experiment
if __name__ == "__main__":
    # Replace with your actual CSV file path
    csv_file = "all_reviews.csv"
    
    # Run complete training and evaluation
    model, true_labels, pred_labels = run_fasttext_experiment(
        csv_file, 
        text_col='review_text',  # change if your text column has different name
        label_col='sentiment'    # change if your label column has different name
    )

Loading dataset...
Dataset loaded: 131669 total reviews
Label distribution:
sentiment
pos    87138
neu    24704
neg    19827
Name: count, dtype: int64
Training set: 105335 samples
Test set: 26334 samples
Preparing FastText format for 105335 samples...
Saved FastText data to: fasttext_train.txt
Preparing FastText format for 26334 samples...
Saved FastText data to: fasttext_test.txt
Training FastText model...


Read 35M words
Number of words:  1578209
Number of labels: 3
Progress: 100.0% words/sec/thread:  465326 lr:  0.000000 avg.loss:  0.063214 ETA:   0h 0m 0s words/sec/thread:  496991 lr:  0.995774 avg.loss:  0.763226 ETA:   0h 9m54sm52s lr:  0.940355 avg.loss:  0.524938 ETA:   0h 8m57sh 8m53s% words/sec/thread:  515901 lr:  0.927077 avg.loss:  0.501388 ETA:   0h 8m53s 506414 lr:  0.913104 avg.loss:  0.479386 ETA:   0h 8m55s  8.8% words/sec/thread:  505969 lr:  0.911976 avg.loss:  0.476139 ETA:   0h 8m55s ETA:   0h 8m54s 504126 lr:  0.906478 avg.loss:  0.461753 ETA:   0h 8m53s words/sec/thread:  502472 lr:  0.900295 avg.loss:  0.447105 ETA:   0h 8m52s ETA:   0h 8m46s 499028 lr:  0.882821 avg.loss:  0.416308 ETA:   0h 8m45s 0.414190 ETA:   0h 8m43s 0.870309 avg.loss:  0.394961 ETA:   0h 8m35s 503202 lr:  0.862573 avg.loss:  0.379193 ETA:   0h 8m29s 507037 lr:  0.829559 avg.loss:  0.327062 ETA:   0h 8m 5s avg.loss:  0.295235 ETA:   0h 7m51s words/sec/thread:  505274 lr:  0.793475 avg.loss:  

Model saved to: sentiment_model.bin

EVALUATION RESULTS
FastText Test Results: Precision@1: 0.812, Recall@1: 0.812

Accuracy: 0.813

Classification Report:
              precision    recall  f1-score   support

    negative       0.79      0.71      0.75      3965
     neutral       0.55      0.48      0.51      4941
    positive       0.88      0.93      0.90     17428

    accuracy                           0.81     26334
   macro avg       0.74      0.71      0.72     26334
weighted avg       0.80      0.81      0.81     26334


Confusion Matrix:


ValueError: At least one label specified must be in y_true