In [49]:
import numpy as np 
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [50]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


In [51]:
data = {
    'text': [
        "I love this movie, it's fantastic!",
        "What a terrible film, I hated it",
        "Absolutely wonderful acting and story",
        "Worst movie I've ever seen",
        "Brilliant, will watch again!",
        "So boring and dull",
        "Incredible visuals and great script",
        "The plot made no sense at all",
        "A must-watch, truly inspiring",
        "Not worth the time",
        "Exceptional performance by the lead actor",
        "I regret watching this film",
        "Heartwarming and beautifully shot",
        "The dialogues were cringeworthy",
        "Loved every second of it!",
        "I walked out halfway through",
        "Gripping story with strong characters",
        "The ending was disappointing",
        "It's a cinematic masterpiece",
        "Very predictable and clichéd",
        "So moving, I cried",
        "It felt like a waste of two hours",
        "A beautiful blend of emotion and thrill",
        "Painfully slow and boring",
        "Five stars from me!"
    ],
    'label': [
        'positive', 'negative', 'positive', 'negative', 'positive', 'negative',
        'positive', 'negative', 'positive', 'negative',
        'positive', 'negative', 'positive', 'negative', 'positive', 'negative',
        'positive', 'negative', 'positive', 'negative',
        'positive', 'negative', 'positive', 'negative', 'positive'
    ]
}

df = pd.DataFrame(data)


In [52]:
vectorizer = CountVectorizer(ngram_range=(1, 2)) 
X = vectorizer.fit_transform(df['text'])
y = df['label']


In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [54]:
model = MultinomialNB()
model.fit(X_train, y_train)


In [55]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.75

Classification Report:
               precision    recall  f1-score   support

    negative       0.80      0.80      0.80         5
    positive       0.67      0.67      0.67         3

    accuracy                           0.75         8
   macro avg       0.73      0.73      0.73         8
weighted avg       0.75      0.75      0.75         8


Confusion Matrix:
 [[4 1]
 [1 2]]


In [48]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Define a sample text dataset and corresponding labels.
documents = [
    "The new restaurant in town is getting great reviews.",
    "I'm so disappointed with the service I received.",
    "The weather forecast says it will be sunny tomorrow.",
    "This coffee is terrible, it's too bitter.",
    "The hotel room was clean and comfortable.",
    "I'm not a fan of the new policy changes.",
    "The concert was amazing, the band was incredible.",
    "The traffic on the way home was awful.",
    "The new employee is doing a great job so far.",
    "I'm really looking forward to my vacation next week.",
    "The food at the festival was delicious and diverse.",
    "The customer service representative was very helpful.",
    "The movie was boring, I fell asleep halfway through.",
    "The hotel staff was friendly and accommodating.",
    "The new smartphone is getting a lot of buzz online.",
    "The traffic in the city is getting worse every day.",
    "I love this movie",
    "This film was terrible",
    "Great acting and plot", 
    "Worst movie ever",
    "I enjoyed the film",
    "Awful direction and script"
]

# Sentiment labels corresponding to the documents (positive/negative)
sentiments = [
    'positive', 'negative', 'positive', 'negative', 
    'positive', 'negative', 'positive', 'negative',
    'positive', 'positive', 'positive', 'positive', 
    'negative', 'positive', 'positive', 'negative',
    'positive', 'negative', 'positive', 'negative', 
    'positive', 'negative'
]

# Split data while preserving the class distribution
X_train, X_test, y_train, y_test = train_test_split(
    documents, sentiments, test_size=0.3, random_state=42, stratify=sentiments)

# Define a list of n-gram ranges to experiment with.
ngram_settings = [(1, 1), (1, 2), (1, 3)]

for ngram in ngram_settings:
    print("=" * 40)
    print(f"Results using ngram_range = {ngram}")

    # Create a pipeline that converts text into n-gram counts then applies Multinomial Naïve Bayes.
    nb_pipeline = make_pipeline(
        CountVectorizer(ngram_range=ngram),
        MultinomialNB()
    )
    
    # Train the model
    nb_pipeline.fit(X_train, y_train)
    
    # Make predictions on the hold-out set
    predictions = nb_pipeline.predict(X_test)
    
    # Compute accuracy
    acc = accuracy_score(y_test, predictions)
    print(f"Accuracy: {acc:.2f}")
    
    # Show full metrics report
    print("Classification Report:")
    print(classification_report(y_test, predictions))

Results using ngram_range = (1, 1)
Accuracy: 0.43
Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         3
    positive       0.50      0.75      0.60         4

    accuracy                           0.43         7
   macro avg       0.25      0.38      0.30         7
weighted avg       0.29      0.43      0.34         7

Results using ngram_range = (1, 2)
Accuracy: 0.57
Classification Report:
              precision    recall  f1-score   support

    negative       0.50      0.33      0.40         3
    positive       0.60      0.75      0.67         4

    accuracy                           0.57         7
   macro avg       0.55      0.54      0.53         7
weighted avg       0.56      0.57      0.55         7

Results using ngram_range = (1, 3)
Accuracy: 0.57
Classification Report:
              precision    recall  f1-score   support

    negative       0.50      0.33      0.40         3
    positive      