In [None]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Fetch the hockey-vs-space subset of 20 Newsgroups; the `remove` argument
# asks scikit-learn to strip headers, footers and quoted text from each post.
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=['rec.sport.hockey', 'sci.space'],
    remove=('headers', 'footers', 'quotes'),
)

# Keep the documents and their integer labels side by side in one frame
df = pd.DataFrame(dict(text=newsgroups.data, label=newsgroups.target))

# Features (X) are the raw texts, the target (y) is the integer label
X, y = df['text'], df['label']

# Hold out 40% of the documents for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Learn the TF-IDF vocabulary/weights on the training texts only, then
# apply the same fitted transformation to the test texts
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a Logistic Regression classifier with default settings (fit() returns the estimator)
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Score the held-out documents
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)

# Report overall accuracy followed by per-class precision/recall/F1
print(
    f'Accuracy: {accuracy:.2f}',
    'Classification Report:',
    classification_report(y_test, y_pred),
    sep='\n',
)

# Function to get top words per class
def get_top_words_per_class(vectorizer, model, class_labels, n=10):
    """Print the ``n`` most indicative vocabulary terms for each class.

    Works with a fitted vectorizer exposing ``get_feature_names_out()`` and a
    fitted linear model exposing ``coef_``.

    Binary model (``coef_`` has a single row):
        The first label in ``class_labels`` gets the terms with the most
        negative coefficients (most negative first); every other label gets
        the terms with the most positive coefficients (in ascending order,
        i.e. strongest last).
        NOTE(review): this assumes ``class_labels`` is ordered like
        ``model.classes_`` -- confirm at the call site.

    Multi-class model (``coef_`` has one row per class):
        Label ``i`` gets the terms with the largest coefficients in row ``i``,
        strongest first.

    Parameters
    ----------
    vectorizer : object with ``get_feature_names_out()``
    model : object with a ``coef_`` array (1-D, or 2-D of shape
        ``(1, n_features)`` / ``(n_classes, n_features)``)
    class_labels : sequence of printable labels
    n : int, default 10
        Number of terms to print per class. Values <= 0 print no terms;
        values larger than the vocabulary print the whole vocabulary.

    Raises
    ------
    ValueError
        If the model is multi-class and more labels are supplied than
        ``coef_`` has rows.
    """
    feature_names = np.asarray(vectorizer.get_feature_names_out())
    coef = np.atleast_2d(model.coef_)
    # Clamp so that n <= 0 means "no words" instead of producing odd slices.
    n = max(n, 0)
    is_binary = coef.shape[0] == 1

    if is_binary:
        ascending = np.argsort(coef[0])
        lowest = ascending[:n]
        # Explicit start index: a plain `[-n:]` would return *everything* for n == 0.
        highest = ascending[max(ascending.size - n, 0):]
    elif len(class_labels) > coef.shape[0]:
        raise ValueError(
            f'got {len(class_labels)} class labels but model.coef_ has only '
            f'{coef.shape[0]} rows'
        )

    for i, class_label in enumerate(class_labels):
        if is_binary:
            top_indices = lowest if i == 0 else highest
        else:
            # Each class has its own coefficient row; largest weights first.
            top_indices = np.argsort(coef[i])[::-1][:n]
        top_features = feature_names[top_indices]
        print(f'Top words for class {class_label}:')
        print(', '.join(top_features))
        print()

# Show the most indicative words for each newsgroup, labelling the classes
# with the dataset's category names instead of the integer targets
get_top_words_per_class(
    vectorizer=vectorizer,
    model=model,
    class_labels=newsgroups.target_names,
)

Accuracy: 0.93
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.91      0.93       388
           1       0.92      0.95      0.93       407

    accuracy                           0.93       795
   macro avg       0.93      0.93      0.93       795
weighted avg       0.93      0.93      0.93       795

Top words for class rec.sport.hockey:
game, hockey, he, team, games, nhl, play, his, espn, season

Top words for class sci.space:
data, launch, is, earth, nasa, shuttle, orbit, moon, of, space



In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Read the IMDB reviews CSV with pandas' default parser settings.
# If the file does not parse cleanly, pd.read_csv also accepts a custom
# delimiter (e.g. sep=';') and on_bad_lines='skip' to drop malformed rows.
data = pd.read_csv('IMDB Dataset.csv')

# Quick look at the size and the first few rows
print("Dataset shape:", data.shape)
print("Sample data:", data.head(), sep="\n")

# 80/20 train/test split of review text vs. sentiment label (fixed seed).
# Note: this rebinds X_train / X_test / y_train / y_test from the first cell.
reviews = data['review']
sentiments = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    sentiments,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: English stop words removed, vocabulary capped at 7000 terms.
# The vectorizer is fitted on the training reviews only.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=7000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Despite the `nb_` prefix this is a Logistic Regression model, not Naive Bayes.
# LogisticRegression is imported in the first cell of this notebook.
nb_classifier = LogisticRegression()
nb_classifier.fit(X_train_tfidf, y_train)

# Predict sentiment for the held-out reviews
y_pred = nb_classifier.predict(X_test_tfidf)

# Overall accuracy plus per-class precision/recall/F1
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Dataset shape: (50000, 2)
Sample data:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Accuracy: 0.889
Classification Report:
               precision    recall  f1-score   support

    negative       0.90      0.87      0.89      4961
    positive       0.88      0.91      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [None]:
# Sanity-check the model on a few hand-written reviews.
# Relies on `nb_classifier` (the LogisticRegression fitted in the previous cell,
# despite its `nb_` prefix) and on the fitted `tfidf_vectorizer`.

new_reviews = [
    "This movie was absolutely fantastic! The plot was engaging, the acting was superb, and the special effects were breathtaking. I highly recommend it!",
    "I was thoroughly disappointed with this film. The story was predictable, the characters were poorly developed, and the pacing was slow. It's a waste of time.",
    "The movie had its moments, but overall it was just okay. It wasn't terrible, but it wasn't great either. I'm indifferent.",
    "The acting was terrible, and the storyline was confusing. I would not recommend to watch it.",
    "I enjoyed the movie. While it may not be the most excellent movie, it was entertaining enough for a movie night. It's certainly watchable and worth your time.",
]

# Reuse the already-fitted vocabulary; do not refit on the new texts
new_reviews_tfidf = tfidf_vectorizer.transform(new_reviews)
predictions = nb_classifier.predict(new_reviews_tfidf)

# Hand-assigned expectation per review position; any position not listed
# here counts as 'neutral' (the model itself only predicts the labels it was trained on).
expected_by_position = {0: 'positive', 4: 'positive', 1: 'negative', 3: 'negative'}

# Print predictions and compare with expected sentiments
for i, prediction in enumerate(predictions):
    expected = expected_by_position.get(i, 'neutral')
    print(f"Review {i+1}: Predicted Sentiment: {prediction}, Expected Sentiment: {expected}")

Review 1: Predicted Sentiment: positive, Expected Sentiment: positive
Review 2: Predicted Sentiment: negative, Expected Sentiment: negative
Review 3: Predicted Sentiment: negative, Expected Sentiment: neutral
Review 4: Predicted Sentiment: negative, Expected Sentiment: negative
Review 5: Predicted Sentiment: positive, Expected Sentiment: positive


In [None]:
# Inspect which vocabulary terms carry the largest weights in the fitted
# LogisticRegression (`nb_classifier`), using the vocabulary of `tfidf_vectorizer`.

# Vocabulary terms, aligned with the columns of the TF-IDF matrix
feature_names = tfidf_vectorizer.get_feature_names_out()

# Binary classification (positive/negative): take the first row of coef_
coefs = nb_classifier.coef_[0]

# Sort the coefficients once, ascending, and slice both ends of the ranking
ranked = coefs.argsort()

# Ten largest coefficients (ascending, so the strongest positive word comes last)
top_positive_indices = ranked[-10:]
top_positive_words = [feature_names[idx] for idx in top_positive_indices]

# Ten smallest coefficients (the strongest negative word comes first)
top_negative_indices = ranked[:10]
top_negative_words = [feature_names[idx] for idx in top_negative_indices]

# Print the results
print("Top 10 words for Positive Sentiment:", top_positive_words, sep="\n")
print("\nTop 10 words for Negative Sentiment:", top_negative_words, sep="\n")

Top 10 words for Positive Sentiment:
['hilarious', 'loved', 'favorite', 'brilliant', 'amazing', 'perfect', 'wonderful', 'best', 'excellent', 'great']

Top 10 words for Negative Sentiment:
['worst', 'waste', 'awful', 'bad', 'boring', 'poor', 'terrible', 'dull', 'poorly', 'worse']
