In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from scipy.sparse import hstack

# Step 1: Load and preprocess the data
df = pd.read_csv('BABE_scraped.csv')
df['content'] = df['content'].str.lower()  # Convert text to lowercase
df.dropna(subset=['content'], inplace=True)  # Drop rows with missing values in the 'content' column

# Step 2: Feature extraction
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df['content'])
y = df['type_class']

# Step 3: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Train the Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Step 5: Evaluate the model on the test set
y_test_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

# Step 6: Print classification report for test set
print("Classification Report for Test Set:")
print(classification_report(y_test, y_test_pred, zero_division=1))

# Additional: Cross-validation with custom features
# Add article length as a feature
df['article_length'] = df['content'].apply(len)
X_custom = hstack([X, df['article_length'].values.reshape(-1, 1)])

# Perform cross-validation
cv_scores = cross_val_score(clf, X_custom, y, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


Test Accuracy: 0.7138461538461538
Classification Report for Test Set:
              precision    recall  f1-score   support

           0       0.70      0.89      0.78       164
           1       1.00      0.00      0.00        29
           2       0.74      0.65      0.69       132

    accuracy                           0.71       325
   macro avg       0.81      0.51      0.49       325
weighted avg       0.74      0.71      0.68       325

Cross-Validation Scores: [0.69846154 0.70769231 0.63692308 0.70153846 0.68      ]
Mean CV Accuracy: 0.684923076923077


Changes made:

- Removed unnecessary imports.
- Combined steps 3 and 4 since there's no need to split data into training, validation, and testing sets separately when evaluating the model's performance.
- Removed redundant print statements.
- Included the classification_report for test set evaluation.
- Streamlined the cross-validation part and included it as an additional step.