In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Read the expense records from the CSV file and preview the first rows.
print("=== LOADING IMPROVED DATASET ===")
df = pd.read_csv("personal_expense_classification.csv")
print(f"Dataset shape: {df.shape}")
# One print call with a newline separator emits the heading and the preview
# exactly as two consecutive print calls would.
print("\nFirst 5 rows:", df.head(), sep="\n")

# Exploration: number of rows per category and the set of category labels.
print("\n=== DATA EXPLORATION ===", "Category distribution:", sep="\n")
category_counts = df['category'].value_counts()
print(category_counts)
print(f"\nTotal categories: {df['category'].nunique()}")
print(f"Categories: {df['category'].unique()}")

# Data quality: total count of missing cells and of fully duplicated rows.
print("\n=== DATA QUALITY CHECK ===")
print(f"Missing values: {df.isnull().sum().sum()}")
print(f"Duplicate rows: {df.duplicated().sum()}")

# Eyeball check that descriptions look consistent with their category:
# print the first three descriptions of each category, in the order the
# categories first appear in the data.
print("\nSample descriptions per category:")
for category in df['category'].unique():
    print(f"\n{category.upper()}:")
    belongs_to_category = df['category'] == category
    sample_descriptions = df.loc[belongs_to_category, 'description'].head(3).tolist()
    # Every category taken from unique() has at least one row, so this
    # always prints one or more lines.
    print(*(f"  - {desc}" for desc in sample_descriptions), sep="\n")

# Data preprocessing
print("\n=== DATA PREPROCESSING ===")

# Drop the columns that are not used by the text classifier below; only the
# free-text description and the category label are kept.
df_clean = df.drop(columns=['expense_id', 'amount', 'merchant'])

# Integer code assigned to each category label. The codes are used as the
# classification target and, later, as positions into the category name list,
# so they must stay contiguous and start at 0.
category_mapping = {
    'shopping': 0,
    'technology': 1,
    'food': 2,
    'entertainment': 3,
    'transport': 4,
}
df_clean['category_encoded'] = df_clean['category'].map(category_mapping)

# Series.map() yields NaN for any label that is absent from the mapping
# (including missing labels). Fail fast with a clear message instead of
# letting NaN targets reach the train/test split and the models.
unmapped_labels = df_clean.loc[df_clean['category_encoded'].isna(), 'category']
if not unmapped_labels.empty:
    raise ValueError(
        "Categories missing from category_mapping: "
        f"{sorted(unmapped_labels.astype(str).unique())}"
    )
# All labels are mapped at this point, so the codes can safely be integers.
df_clean['category_encoded'] = df_clean['category_encoded'].astype(int)

print("Category encoding:")
for cat, code in category_mapping.items():
    count = (df_clean['category_encoded'] == code).sum()
    print(f"  {cat}: {code} ({count} samples)")

# Prepare features (raw description text) and target (integer category code)
X = df_clean['description']
y = df_clean['category_encoded']

print(f"\nFeature shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Feature extraction comparison
print("\n=== FEATURE EXTRACTION COMPARISON ===")

# Split the raw text BEFORE fitting any vectorizer. Fitting on the full corpus
# would let vocabulary and IDF statistics from the test rows leak into
# training and make the reported accuracies optimistic.
# stratify=y keeps the class proportions equal in both parts.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Method 1: Count Vectorizer (fitted on the training text only)
count_vectorizer = CountVectorizer(stop_words='english', max_features=1000)
X_train_count = count_vectorizer.fit_transform(X_train_text)
X_test_count = count_vectorizer.transform(X_test_text)

# Method 2: TF-IDF Vectorizer (fitted on the training text only)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)

# Whole-corpus matrices, produced with the already-fitted vectorizers. They
# are kept only for the shape reporting here and in the summary; the models
# are trained and evaluated on the train/test matrices above.
X_count = count_vectorizer.transform(X)
X_tfidf = tfidf_vectorizer.transform(X)

print(f"Count Vectorizer shape: {X_count.shape}")
print(f"TF-IDF Vectorizer shape: {X_tfidf.shape}")

# Show some vocabulary (the first 20 feature names of the TF-IDF vectorizer)
print("\nTop 20 words in vocabulary:")
feature_names = tfidf_vectorizer.get_feature_names_out()
print(feature_names[:20])

# Model testing
print("\n=== MODEL COMPARISON ===")

# Candidate classifiers. Each entry is (estimator, train matrix, test matrix),
# so every model is paired with the feature representation it is evaluated on.
count_features = (X_train_count, X_test_count)
tfidf_features = (X_train_tfidf, X_test_tfidf)
models = {
    'Naive Bayes (Count)': (MultinomialNB(), *count_features),
    'Naive Bayes (TF-IDF)': (MultinomialNB(), *tfidf_features),
    'Logistic Regression': (LogisticRegression(random_state=42, max_iter=1000), *tfidf_features),
    'Random Forest': (RandomForestClassifier(n_estimators=100, random_state=42), *count_features),
    'SVM': (SVC(random_state=42), *tfidf_features)
}

# Accuracy per model, plus the running best (name, score, test predictions).
results = {}
best_score, best_model_name, best_predictions = 0, "", None

table_rule = "-" * 60
print("Model Performance:")
print(table_rule)
print(f"{'Model':<25} {'Accuracy':<12} {'Percentage':<12}")
print(table_rule)

for model_name, (model, X_train, X_test) in models.items():
    # fit() returns the fitted estimator, so training and prediction chain.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    results[model_name] = accuracy

    print(f"{model_name:<25} {accuracy:<12.4f} {accuracy*100:<12.2f}%")

    # Strict comparison: on a tie the earlier model stays the best one.
    if accuracy > best_score:
        best_score, best_model_name, best_predictions = accuracy, model_name, y_pred

print(table_rule)
print(f"Best Model: {best_model_name} with {best_score*100:.2f}% accuracy")

# Detailed analysis of best model
print(f"\n=== DETAILED ANALYSIS - {best_model_name} ===")

# Derive the label codes and their display names from category_mapping so the
# report can never drift out of sync with the encoding used for the target.
# category_names is ordered by code, i.e. category_names[code] is the name.
category_labels = sorted(category_mapping.values())
category_names = sorted(category_mapping, key=category_mapping.get)

# Passing labels= explicitly keeps one row/column per known category even if
# a category happens to be absent from the test split or the predictions;
# without it the number of target names may not match the labels found.
print("\nClassification Report:")
print(classification_report(y_test, best_predictions,
                            labels=category_labels,
                            target_names=category_names,
                            digits=4))

# Confusion matrix (rows: actual category, columns: predicted category)
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, best_predictions, labels=category_labels)
cm_df = pd.DataFrame(cm, index=category_names, columns=category_names)
print(cm_df)

# Show some example predictions
print("\n=== PREDICTION EXAMPLES ===")
print("Sample correct and incorrect predictions:")

# y_test is a pandas Series taken from the dataframe, so its index holds the
# row LABELS of the test samples. They are used below to look up the original
# description of each test sample.
test_indices = y_test.index

max_examples = 5
correct_predictions = []
incorrect_predictions = []

# Collect up to max_examples hits and misses as (position in test set,
# actual code, predicted code), in test-set order.
for i, (actual, predicted) in enumerate(zip(y_test, best_predictions)):
    bucket = correct_predictions if actual == predicted else incorrect_predictions
    if len(bucket) < max_examples:
        bucket.append((i, actual, predicted))
    # Stop scanning as soon as both lists are full.
    if len(correct_predictions) >= max_examples and len(incorrect_predictions) >= max_examples:
        break

print("\n✅ CORRECT PREDICTIONS:")
for i, actual, predicted in correct_predictions:
    # test_indices[i] is an index label, so the lookup must be label-based
    # (.loc); positional .iloc is only equivalent for a default RangeIndex.
    description = X.loc[test_indices[i]]
    print(f"'{description}' → {category_names[actual]} ✓")

print("\n❌ INCORRECT PREDICTIONS:")
for i, actual, predicted in incorrect_predictions:
    description = X.loc[test_indices[i]]
    print(f"'{description}' → Actual: {category_names[actual]}, Predicted: {category_names[predicted]} ✗")

# Feature importance: only reported when the best model is the Random Forest.
if 'Random Forest' in best_model_name:
    print(f"\n=== FEATURE IMPORTANCE ===")
    # The fitted estimator is the first element of the model's tuple.
    best_rf_model = models[best_model_name][0]
    # The Random Forest entry uses the count-vectorized matrices, so the
    # importances line up with the count vectorizer's feature names.
    feature_names = count_vectorizer.get_feature_names_out()

    importances = best_rf_model.feature_importances_
    # Feature positions ordered from most to least important.
    indices = np.argsort(importances)[::-1]

    print("Top 15 most important words:")
    # Slicing caps the listing at 15 entries (fewer if the vocabulary is smaller).
    for rank, word_idx in enumerate(indices[:15], start=1):
        print(f"{rank:2d}. {feature_names[word_idx]:<15} ({importances[word_idx]:.4f})")

# Summary
print("\n=== SUMMARY ===")
print(f"Dataset size: {df.shape[0]} samples")
print(f"Features: {X_count.shape[1]} unique words")
print(f"Categories: {len(category_names)}")
print(f"Best model: {best_model_name}")
print(f"Best accuracy: {best_score*100:.2f}%")

# Chance-level accuracy when guessing uniformly among the categories. It is
# derived from the category count instead of being hard-coded, so it stays
# correct if categories are added or removed (1/5 = 0.2 for five categories).
random_baseline = 1 / len(category_names)
print(f"Improvement over random guessing: {(best_score - random_baseline)*100:.2f} percentage points")

# Closing section: a fixed list of suggestions, printed one per line.
print(f"\n=== TIPS FOR FURTHER IMPROVEMENT ===")
improvement_tips = (
    "1. Collect more training data (especially for underrepresented categories)",
    "2. Use word embeddings (Word2Vec, GloVe) instead of bag-of-words",
    "3. Try ensemble methods combining multiple models",
    "4. Add merchant information as an additional feature",
    "5. Use more sophisticated text preprocessing (stemming, lemmatization)",
    "6. Consider using neural networks for better text understanding",
)
for tip in improvement_tips:
    print(tip)

=== LOADING IMPROVED DATASET ===
Dataset shape: (200, 5)

First 5 rows:
  expense_id  amount   merchant                     description       category
0     EXP001   45.50     Amazon    wireless headphones purchase       shopping
1     EXP002   12.99    Netflix  monthly streaming subscription  entertainment
2     EXP003    8.75  Starbucks       morning coffee and muffin           food
3     EXP004   15.20       Uber         ride to downtown office      transport
4     EXP005   89.99   Best Buy     smartphone screen protector     technology

=== DATA EXPLORATION ===
Category distribution:
category
shopping         40
entertainment    40
food             40
transport        40
technology       40
Name: count, dtype: int64

Total categories: 5
Categories: ['shopping' 'entertainment' 'food' 'transport' 'technology']

=== DATA QUALITY CHECK ===
Missing values: 0
Duplicate rows: 0

Sample descriptions per category:

SHOPPING:
  - wireless headphones purchase
  - household cleaning supplies
 