In [None]:
import numpy as np
from scipy import sparse
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split


# Semi-supervised pseudo-labeling demo on 20 Newsgroups:
#   1. train a linear SVM on a small labeled subset,
#   2. pseudo-label the remaining pool and keep the confident predictions,
#   3. retrain on labeled + pseudo-labeled documents,
#   4. score the retrained model against the pool's held-back true labels.

# Minimum winning decision score for a pseudo-label to be kept.
# NOTE(review): SVC's default decision_function output for multi-class problems
# is a one-vs-rest-shaped score derived from one-vs-one votes, so with 20
# classes the maximum is probably far above 0.5 for almost every sample and
# this threshold likely filters very little -- verify and tune.
CONFIDENCE_THRESHOLD = 0.5

data = fetch_20newsgroups(subset='all', categories=None, shuffle=True, random_state=42)

# 20% of the documents form the labeled set; the other 80% are treated as the
# unlabeled pool. The pool's true labels are kept ONLY for the final
# evaluation and are never passed to the classifier. (train_test_split
# shuffles, so these labels cannot be recovered later by slicing data.target.)
X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(
    data.data, data.target, test_size=0.8, random_state=42
)

# Vectorize the text data (vocabulary and IDF weights come from the labeled set only)
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X_labeled = vectorizer.fit_transform(X_labeled)
X_unlabeled = vectorizer.transform(X_unlabeled)

# Train the initial model on labeled data
clf = SVC(kernel='linear', random_state=42)
clf.fit(X_labeled, y_labeled)

# Pseudo-labeling: predict labels for the unlabeled pool and keep only the
# predictions whose best decision score clears the confidence threshold
pseudo_labels = clf.predict(X_unlabeled)
confidence = clf.decision_function(X_unlabeled).max(axis=1)
high_confidence_mask = confidence > CONFIDENCE_THRESHOLD
X_pseudo_labeled = X_unlabeled[high_confidence_mask]
y_pseudo_labels = pseudo_labels[high_confidence_mask]

# Retrain the model with pseudo-labeled data added to labeled data.
# The matrices stay sparse: densifying the TF-IDF features with .toarray()
# would allocate a (n_documents x 10000) float array for no benefit.
X_combined = sparse.vstack([X_labeled, X_pseudo_labeled], format='csr')
y_combined = np.concatenate([y_labeled, y_pseudo_labels])
clf.fit(X_combined, y_combined)

# Evaluate the retrained model on the unlabeled pool against its true labels.
# Caveat: the model has already seen these documents (with pseudo-labels), so
# this is a transductive score; split off a separate held-out set for an
# unbiased estimate.
X_test = X_unlabeled
y_test = clf.predict(X_test)  # NOTE: y_test holds the *predictions*; y_unlabeled is the ground truth

# Calculate metrics (ground truth first, predictions second)
accuracy = accuracy_score(y_unlabeled, y_test)
precision = precision_score(y_unlabeled, y_test, average='weighted')
recall = recall_score(y_unlabeled, y_test, average='weighted')
f1 = f1_score(y_unlabeled, y_test, average='weighted')

# Print metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Bag-of-words Naive Bayes sentiment classifier on a movie review CSV.

# Load the movie review dataset (you can replace this with your dataset)
# Assuming the dataset has 'text' and 'label' columns where label is 0 (negative) or 1 (positive)
data = pd.read_csv('movie_reviews.csv')

# Fail early with a readable message if the expected columns are absent
missing_columns = {'text', 'label'} - set(data.columns)
if missing_columns:
    raise KeyError(f"movie_reviews.csv is missing required column(s): {sorted(missing_columns)}")

y = data['label']

# Split the RAW text into training and test sets before any fitting, so the
# test documents cannot influence the vocabulary learned by the vectorizer
text_train, text_test, y_train, y_test = train_test_split(
    data['text'], y, test_size=0.2, random_state=42
)

# Vectorize the text data: learn the vocabulary on the training split only,
# then apply that fixed vocabulary to the test split
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Initialize and train Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Predict on test set
y_pred = nb_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Discuss the assumption of feature independence in Naive Bayes
print("\nAssumption of Feature Independence:")
print("Naive Bayes assumes that features (words) are conditionally independent given the class label.")
print("This means that the occurrence of a particular word in a document is independent of the occurrence of other words,")
print("given whether the document is positive or negative. Despite this simplifying assumption, Naive Bayes often performs")
print("well in practice for text classification tasks, especially when paired with techniques like TF-IDF or word embeddings.")
