In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report
from scipy.sparse import hstack

# Step 1: Load and preprocess the data
df = pd.read_csv('BABE_scraped.csv')
df['content'] = df['content'].str.lower()  # Convert text to lowercase
df.dropna(subset=['content'], inplace=True)  # Drop rows with missing values in the 'content' column

# Step 2: Feature extraction
vectorizer = TfidfVectorizer(max_features=500, stop_words='english')  # Reduced max features
X_text = vectorizer.fit_transform(df['content'])
y = df['type_class']

# Step 3: Combine text features
X = X_text

# Step 4: Initialize Naive Bayes classifier
clf = MultinomialNB()

# Step 5: Evaluate using cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')

# Print cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# Optional: Print classification report for detailed evaluation
# Since the model is being evaluated using cross-validation, a separate test set isn't needed.
# You can uncomment the following lines if you still want to see the classification report.

# y_pred = cross_val_predict(clf, X, y, cv=cv)
# print("Classification Report for Cross-Validation:")
# print(classification_report(y, y_pred))


Cross-Validation Scores: [0.69230769 0.66153846 0.69846154 0.73538462 0.68      ]
Mean CV Accuracy: 0.6935384615384617


In this code:

- We reduced the max features for the TF-IDF vectorizer to 500 to reduce dimensionality.
- We removed domain names as features to simplify the model.
- We used stratified k-fold cross-validation to provide a more reliable estimate of the model's performance.
- We evaluated the model using cross-validation and printed the mean accuracy score.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from textblob import TextBlob

# Step 1: Load and preprocess the data
df = pd.read_csv('BABE_scraped.csv')
df['content'] = df['content'].str.lower()  # Convert text to lowercase
df.dropna(subset=['content'], inplace=True)  # Drop rows with missing values in the 'content' column

# Step 2: Feature extraction
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_text = vectorizer.fit_transform(df['content'])

# Additional Features
# Content length
df['content_length'] = df['content'].apply(len)

# Sentiment score using TextBlob
df['sentiment_score'] = df['content'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Combine features
X = pd.concat([pd.DataFrame(X_text.toarray()), df[['content_length', 'sentiment_score']]], axis=1)
y = df['type_class']

# Convert integer column names to strings
X.columns = X.columns.astype(str)

# Filter out rows in X without corresponding target values in y
X = X.iloc[:len(y), :]

# Drop rows with missing values in X
X.dropna(inplace=True)

# Reset y index to match filtered X
y = y[X.index]

# Step 3: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Initialize Logistic Regression classifier
clf = LogisticRegression(max_iter=1000, random_state=42)

# Step 5: Train the model
clf.fit(X_train, y_train)

# Step 6: Evaluate using cross-validation
cv_scores = cross_val_score(clf, X, y, cv=5)

# Print cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# Step 7: Evaluate on test set
y_pred = clf.predict(X_test)
print("Classification Report for Test Set:")
print(classification_report(y_test, y_pred))


In this code:

- We included additional features such as content length and sentiment score using TextBlob.
- We used Logistic Regression as the classifier.
- We evaluated the model using cross-validation and printed the mean accuracy score.
- We evaluated the model on the test set and printed the classification report for detailed performance evaluation.