# Text Preprocessing and Feature Extraction

In this notebook, we walk through text preprocessing and TF-IDF feature extraction on a dataset of IMDB reviews, then train and evaluate a sentiment classifier on the resulting features.

In [None]:
import pandas as pd
from src.preprocessing import preprocess_text
from src.feature_extraction import extract_tfidf_features
from src.sentiment_classifier import train_classifier, evaluate_classifier

# Load the review dataset from CSV.
df = pd.read_csv('data/imdb_reviews.csv')

# Clean every raw review and keep the result alongside the original text.
df['cleaned_review'] = df['review'].apply(preprocess_text)
y = df['sentiment']

# Split the cleaned text BEFORE extracting TF-IDF features. Fitting the
# vectorizer on the full corpus would let the held-out reviews influence the
# learned vocabulary / IDF weights (train/test leakage) and inflate the
# reported accuracy. 20% of the rows are held out; the fixed seed keeps the
# split reproducible.
from sklearn.model_selection import train_test_split
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_review'], y, test_size=0.2, random_state=42
)

# Build the TF-IDF features from the training reviews only.
X_train, vectorizer = extract_tfidf_features(text_train)

# Project the held-out reviews into the same feature space.
# NOTE(review): assumes the returned vectorizer is a fitted scikit-learn-style
# object exposing .transform() -- confirm against src.feature_extraction.
X_test = vectorizer.transform(text_test)

# Train the sentiment classifier on the training features.
model = train_classifier(X_train, y_train)

# Evaluate on the held-out features and print the results.
accuracy, report = evaluate_classifier(model, X_test, y_test)
print(f'Accuracy: {accuracy * 100:.2f}%')
print(report)