In [3]:
#Feature extraction with bag of words
from sklearn.feature_extraction.text import CountVectorizer

# Sample input documents
documents = [
    "I love this movie",
    "This movie is great",
    "I don't like this movie",
    "This movie is terrible"
]

# Step 1: Create an instance of the CountVectorizer
vectorizer = CountVectorizer()

# Step 2: Fit the vectorizer on the documents and transform the documents into a bag-of-words matrix
X = vectorizer.fit_transform(documents)

# Step 3: Print the feature names (words in the vocabulary)
feature_names = vectorizer.get_feature_names_out()
print("Vocabulary:")
print(feature_names)

# Step 4: Print the bag-of-words matrix
print("\nBag-of-Words Matrix:")
print(X.toarray())

Vocabulary:
['don' 'great' 'is' 'like' 'love' 'movie' 'terrible' 'this']

Bag-of-Words Matrix:
[[0 0 0 0 1 1 0 1]
 [0 1 1 0 0 1 0 1]
 [1 0 0 1 0 1 0 1]
 [0 0 1 0 0 1 1 1]]


In [5]:
# feature extraction with TF_IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample input documents
documents = [
    "I love this movie",
    "This movie is great",
    "I don't like this movie",
    "This movie is terrible"
]

# Step 1: Create an instance of the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Step 2: Fit the vectorizer on the documents and transform the documents into a TF-IDF matrix
X = vectorizer.fit_transform(documents)

# Step 3: Print the feature names (words in the vocabulary)
feature_names = vectorizer.get_feature_names_out()
print("Vocabulary:")
print(feature_names)

# Step 4: Print the TF-IDF matrix
print("\nTF-IDF Matrix:")
print(X.toarray())


Vocabulary:
['don' 'great' 'is' 'like' 'love' 'movie' 'terrible' 'this']

TF-IDF Matrix:
[[0.         0.         0.         0.         0.8046125  0.41988018
  0.         0.41988018]
 [0.         0.67943473 0.53567415 0.         0.         0.35455723
  0.         0.35455723]
 [0.62688384 0.         0.         0.62688384 0.         0.32713399
  0.         0.32713399]
 [0.         0.         0.53567415 0.         0.         0.35455723
  0.67943473 0.35455723]]


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Sample input documents and corresponding labels
documents = [
    "I love this movie",
    "This movie is great",
    "I don't like this movie",
    "This movie is terrible"
]

labels = ["positive", "positive", "negative", "negative"]

# Step 1: Split the dataset into a training set and a validation set
X_train, X_val, y_train, y_val = train_test_split(documents, labels, test_size=0.2, random_state=42)

# Step 2: Create an instance of the CountVectorizer and fit it on the training data
vectorizer = CountVectorizer()
X_train_transformed = vectorizer.fit_transform(X_train)

# Step 3: Train the Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_transformed, y_train)

# Step 4: Transform the validation data using the fitted vectorizer
X_val_transformed = vectorizer.transform(X_val)

# Step 5: Predict the sentiment labels for the validation data
y_pred = classifier.predict(X_val_transformed)
print(f"Actual class {y_val}")
print(f"Predicted class {y_pred}")
# Step 6: Evaluate the performance of the classifier
accuracy = (y_pred == y_val).mean()
print("Accuracy:", accuracy)

Actual class ['positive']
Predicted class ['negative']
Accuracy: 0.0


In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Sample input documents and corresponding labels
documents = [
    "I love this movie",
    "This movie is great",
    "I don't like this movie",
    "This movie is terrible"
]

labels = ["positive", "positive", "negative", "negative"]

# Step 1: Create an instance of the CountVectorizer and fit it on the documents
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Step 2: Train the Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X, labels)

# Step 3: Predict the sentiment for a new document
new_document = "I love this great movie"
new_document_transformed = vectorizer.transform([new_document])
predicted_sentiment = classifier.predict(new_document_transformed)

print("Predicted sentiment:", predicted_sentiment)

Predicted sentiment: ['positive']


In [21]:
#calculating probabilities

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Sample input documents and corresponding labels
documents = [
    "I love this movie",
    "This movie is great",
    "I don't like this movie",
    "This movie is terrible"
]

labels = ["positive", "positive", "negative", "negative"]

# Step 1: Create an instance of the CountVectorizer and fit it on the documents
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Step 2: Train the Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X, labels)

# Step 3: Calculate the probabilities for a new document
new_document = "I love this great movie"
new_document_transformed = vectorizer.transform([new_document])

# Step 4: Calculate the probability of each sentiment category
probabilities = classifier.predict_proba(new_document_transformed)

# Step 5: Print the probabilities
sentiment_categories = classifier.classes_
for category, probability in zip(sentiment_categories, probabilities[0]):
    print(f"Probability of {category}: {probability}")

Probability of negative: 0.1618606703349758
Probability of positive: 0.8381393296650242


In [31]:
#Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assuming y_val contains the true labels and y_pred contains the predicted labels

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_val, y_pred, average='macro')
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_val, y_pred, average='macro')
print("Recall:", recall)

# Calculate F1-score
f1 = f1_score(y_val, y_pred, average='macro')
print("F1-score:", f1)

Accuracy: 0.0
Precision: 0.0
Recall: 0.0
F1-score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


NameError: name 'y' is not defined

In [None]:
from sklearn.model_selection import cross_val_score

# Assuming X contains the feature matrix and y contains the labels

# Create an instance of the classifier
classifier = MultinomialNB()

# Perform cross-validation and calculate accuracy scores
cv_scores = cross_val_score(classifier, X, y , cv=5)

# Print the average accuracy across cross-validation folds
print("Average Accuracy:", cv_scores.mean())

In [35]:
#Statistical significance Testing (Non parametric test)
import numpy as np
from sklearn.utils import resample

# Evaluation metrics of Classifier A and Classifier B
metric_classifier_a = [0.82, 0.86, 0.88, 0.84, 0.86]
metric_classifier_b = [0.78, 0.84, 0.83, 0.87, 0.85]

# Calculate the observed difference in performance
observed_difference = np.mean(metric_classifier_a) - np.mean(metric_classifier_b)

# Set the number of bootstrap iterations
n_iterations = 1000

# Perform the paired bootstrap test
differences = np.subtract(metric_classifier_a, metric_classifier_b)
bootstrap_differences = []
for _ in range(n_iterations):
    resampled_a = resample(metric_classifier_a)
    resampled_b = resample(metric_classifier_b)
    resampled_difference = np.mean(resampled_a) - np.mean(resampled_b)
    bootstrap_differences.append(resampled_difference)

# Calculate the p-value
p_value = (np.abs(bootstrap_differences) >= np.abs(observed_difference)).mean()

# Determine statistical significance
alpha = 0.05  # Significance level

print(f"p-value {p_value} and alpha value {alpha}")
if p_value < alpha:
    print("The difference in performance is statistically significant.")
else:
    print("The difference in performance is not statistically significant.")


p-value 0.52 and alpha value 0.05
The difference in performance is not statistically significant.
