In [5]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import string

# Load data
# NOTE(review): absolute, machine-specific path — presumably only valid on the
# original author's machine; confirm before running elsewhere.
file_path = '/Users/harshdave/Downloads/data_stories_one_shot.csv'
df = pd.read_csv(file_path)

# Step 1: Basic Cleaning (Lowercase + Punctuation Removal)
# Build the punctuation-deletion table once instead of once per row.
punctuation_table = str.maketrans('', '', string.punctuation)
# A missing or non-string 'Sentence' cell would make a plain `.lower()` call
# raise; treat missing cells as empty text and coerce everything else to str.
df['processed'] = (
    df['Sentence']
    .fillna('')
    .astype(str)
    .str.lower()
    .str.translate(punctuation_table)
)

# Step 2: Label Setup
# Stage == 1 is labelled 'Show'; every other value (including missing) is 'Tell'.
df['label'] = df['Stage'].apply(lambda x: 'Show' if x == 1 else 'Tell')
# Keep the fitted encoder so integer predictions can be mapped back to the
# label names later via `label_encoder.inverse_transform`.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Step 3: Define Models
# Maps a display name to an unfitted classifier; each one is later used as the
# final step of a TF-IDF pipeline.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(kernel='linear'),
    "Naive Bayes": MultinomialNB(),
    # Fixed seed: the forest is randomised, so without it the reported
    # accuracy changes from run to run.
    "Random Forest": RandomForestClassifier(random_state=42),
}

# Step 4: TF-IDF Vectorization + 5-Fold Cross-Validation (Manual Loop)
# results_cv maps each model name to its mean accuracy over the 5 folds.
results_cv = {}
skf = StratifiedKFold(n_splits=5)

# The text and target columns are the same for every model and every fold.
texts = df['processed']
targets = df['label_encoded']

for name, model in models.items():
    # The vectoriser is part of the pipeline, so it is fitted on the training
    # portion of each fold only.
    pipeline = Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', model)])
    accuracies = []
    for train_index, test_index in skf.split(texts, targets):
        X_train = texts.iloc[train_index]
        y_train = targets.iloc[train_index]
        X_test = texts.iloc[test_index]
        y_test = targets.iloc[test_index]

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))

    # Unweighted mean of the per-fold accuracies.
    results_cv[name] = sum(accuracies) / len(accuracies)

# Step 5: Leave-One-Plot-Out Cross-Validation (Logistic Regression)
# One accuracy score per distinct 'Plot_Name': train on every other plot,
# evaluate on the held-out one.
scores_lopo = []
unique_plots = df['Plot_Name'].unique()

for plot in unique_plots:
    # Rows of the held-out plot form the test set; all remaining rows train.
    is_held_out = df['Plot_Name'] == plot
    train_data = df[~is_held_out]
    test_data = df[is_held_out]

    X_train = train_data['processed']
    y_train = train_data['label_encoded']
    X_test = test_data['processed']
    y_test = test_data['label_encoded']

    # A fresh pipeline per plot, fitted on the training plots only.
    pipeline_lr = Pipeline(steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression(max_iter=1000)),
    ])
    pipeline_lr.fit(X_train, y_train)
    y_pred = pipeline_lr.predict(X_test)

    scores_lopo.append(accuracy_score(y_test, y_pred))

# Step 6: Display Results
# Print the mean 5-fold accuracy of each model, one per line, to 4 decimals.
print("📊 5-Fold Cross-Validation Accuracy:")
for model_name, acc in results_cv.items():
    print(f"{model_name}: {acc:.4f}")

# Unweighted mean of the per-plot accuracies: every plot counts equally,
# however many sentences it contains.
lopo_mean = sum(scores_lopo) / len(scores_lopo)
print(f"\n🔁 Leave-One-Plot-Out Accuracy (Logistic Regression): {lopo_mean:.4f}")


📊 5-Fold Cross-Validation Accuracy:
Logistic Regression: 0.6846
SVM: 0.8385
Naive Bayes: 0.7308
Random Forest: 0.6923

🔁 Leave-One-Plot-Out Accuracy (Logistic Regression): 0.6796
