In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation

In [2]:
# Example text
text = """
Natural language processing (NLP) is a subfield of artificial intelligence (AI) 
that focuses on the interaction between computers and humans through natural language. 
It involves the development of algorithms and models to understand, interpret, and 
generate human-like language.

Tokenization is an important step in NLP. It involves breaking down text into individual words or tokens. 
After tokenization, lemmatization can be applied to reduce words to their base form. 
Stopwords, common words that don't carry much meaning, are often removed during preprocessing.

NLP techniques, such as one-hot encoding, bag-of-words, and TF-IDF, help convert text data into a format 
that can be used by machine learning models. Word embeddings, such as Word2Vec and GloVe, provide 
dense vector representations for words, capturing semantic relationships.

Understanding these techniques is crucial for working with text data and building effective NLP applications.
"""

In [3]:
# Tokenization
tokens = word_tokenize(text)

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

In [4]:
# Remove stopwords and punctuation
stop_words = set(stopwords.words('english'))
filtered_tokens = [token.lower() for token in lemmatized_tokens if token.lower() not in stop_words and token not in punctuation]



# Convert the list of tokens into a string (required for CountVectorizer)
processed_text = ' '.join(filtered_tokens)

In [5]:
# Bag-of-Words Representation
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform([processed_text])


In [6]:
bag_of_words

<1x73 sparse matrix of type '<class 'numpy.int64'>'
	with 73 stored elements in Compressed Sparse Row format>

In [7]:
vocabulary = vectorizer.get_feature_names_out()
print("Vocabulary:", vocabulary)
print("Bag-of-Words Representation:\n", bag_of_words.toarray())

Vocabulary: ['ai' 'algorithm' 'application' 'applied' 'artificial' 'bag' 'base'
 'breaking' 'building' 'capturing' 'carry' 'common' 'computer' 'convert'
 'crucial' 'data' 'dense' 'development' 'effective' 'embeddings'
 'encoding' 'focus' 'form' 'format' 'generate' 'glove' 'help' 'hot'
 'human' 'idf' 'important' 'individual' 'intelligence' 'interaction'
 'interpret' 'involves' 'language' 'learning' 'lemmatization' 'like'
 'machine' 'meaning' 'model' 'much' 'natural' 'nlp' 'of' 'often' 'one'
 'preprocessing' 'processing' 'provide' 'reduce' 'relationship' 'removed'
 'representation' 'semantic' 'step' 'stopwords' 'subfield' 'technique'
 'text' 'tf' 'token' 'tokenization' 'understand' 'understanding' 'used'
 'vector' 'word' 'word2vec' 'words' 'working']
Bag-of-Words Representation:
 [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2
  3 1 1 1 1 1 2 1 2 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 1 1 2 1 1 1 1 5 1 1
  1]]


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [9]:

positive_reviews = ["I loved the movie. The storyline was amazing.",
                    "Great acting and a compelling plot.",
                    "One of the best movies I've seen in years."]

negative_reviews = ["The movie was terrible. I didn't enjoy it at all.",
                    "Poor acting and a confusing storyline.",
                    "I regret watching this film."]

labels = [1] * len(positive_reviews) + [0] * len(negative_reviews)


In [10]:
# Combine positive and negative reviews
all_reviews = positive_reviews + negative_reviews

In [11]:
all_reviews

['I loved the movie. The storyline was amazing.',
 'Great acting and a compelling plot.',
 "One of the best movies I've seen in years.",
 "The movie was terrible. I didn't enjoy it at all.",
 'Poor acting and a confusing storyline.',
 'I regret watching this film.']

In [13]:

# Step 2: Text Preprocessing
def preprocess_text(text):
    # Tokenization, lemmatization, and removal of stopwords
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    processed_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in stop_words and token not in punctuation]
    return ' '.join(processed_tokens) 




# Apply preprocessing to all reviews
processed_reviews = [preprocess_text(review) for review in all_reviews]



In [14]:
# Step 3: Bag-of-Words Representation
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(processed_reviews)

In [15]:
# Step 4: Text Classification
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)


In [18]:
# Build and train a Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

In [19]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.0
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# small project with with BOW 

In [21]:
import pandas as pd
import random

# Generate synthetic data for positive and negative reviews
positive_reviews = [
    "I loved the movie. The storyline was amazing.",
    "Great acting and a compelling plot.",
    "One of the best movies I've seen in years.",
    "Amazing cinematography and excellent performances.",
    "A heartwarming and touching film.",
    "The characters were well-developed, and the dialogue was engaging.",
    "A must-watch for all movie enthusiasts.",
    "I couldn't get enough of this movie. Highly recommended.",
    "Thoroughly enjoyed every moment of the film.",
    "The director did a fantastic job bringing the story to life.",
    "Incredible visual effects and a gripping narrative.",
    "A masterpiece of storytelling and filmmaking.",
    "The soundtrack added a lot to the overall experience.",
    "An instant classic. I will definitely watch it again.",
    "The movie exceeded my expectations. A true gem.",
    "A brilliant and thought-provoking piece of cinema.",
    "I was on the edge of my seat throughout the entire film.",
    "The humor in the movie was spot-on and had me laughing non-stop.",
    "The twists and turns kept me guessing until the end.",
    "Fantastic chemistry among the cast members.",
]

negative_reviews = [
    "The movie was terrible. I didn't enjoy it at all.",
    "Poor acting and a confusing storyline.",
    "I regret watching this film.",
    "Disappointing and predictable. Not worth the hype.",
    "Lackluster performances and a weak plot.",
    "I found it boring and uninteresting.",
    "The characters were poorly developed.",
    "A complete waste of time. I wouldn't recommend it.",
    "The dialogue felt forced, and the pacing was off.",
    "I expected more from this movie, but it fell short.",
    "The film lacked originality and creativity.",
    "I couldn't connect with the characters.",
    "The special effects were subpar.",
    "I was disappointed by the lack of depth in the story.",
    "The movie was overhyped and did not live up to expectations.",
    "A letdown from start to finish.",
    "I couldn't wait for it to be over.",
    "The plot was confusing, and I lost interest quickly.",
    "The acting was wooden and unconvincing.",
    "A forgettable experience. I wouldn't watch it again.",
]

# Labels (1 for positive, 0 for negative)
labels = [1] * len(positive_reviews) + [0] * len(negative_reviews)

# Combine positive and negative reviews
all_reviews = positive_reviews + negative_reviews

# Shuffle the data
combined_data = list(zip(all_reviews, labels))
random.shuffle(combined_data)

# Create a DataFrame
df = pd.DataFrame(combined_data, columns=['text', 'review'])

# Save to CSV
df.to_csv('movie_reviews_dataset.csv', index=False)

# Display the DataFrame
print(df.head())


                                                text  review
0       Thoroughly enjoyed every moment of the film.       1
1  The movie was overhyped and did not live up to...       0
2                Great acting and a compelling plot.       1
3                  A heartwarming and touching film.       1
4      A masterpiece of storytelling and filmmaking.       1


In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [23]:
# Step 1: Read the data from the CSV file
df = pd.read_csv('movie_reviews_dataset.csv')

In [24]:
df

Unnamed: 0,text,review
0,Thoroughly enjoyed every moment of the film.,1
1,The movie was overhyped and did not live up to...,0
2,Great acting and a compelling plot.,1
3,A heartwarming and touching film.,1
4,A masterpiece of storytelling and filmmaking.,1
5,The special effects were subpar.,0
6,I couldn't wait for it to be over.,0
7,"The characters were well-developed, and the di...",1
8,The director did a fantastic job bringing the ...,1
9,A must-watch for all movie enthusiasts.,1


In [26]:
# Step 2: Text Preprocessing
def preprocess_text(text):
    # Tokenization, lemmatization, and removal of stopwords
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    processed_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in stop_words and token not in punctuation]
    return ' '.join(processed_tokens) 



# Apply preprocessing to all reviews
df['processed_text'] = df['text'].apply(preprocess_text)


In [27]:
# Step 3: Bag-of-Words Representation
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['processed_text'])
y = df['review']

In [28]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


classifier = MultinomialNB()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

In [29]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Step 8: Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.50      0.60         6
           1       0.25      0.50      0.33         2

    accuracy                           0.50         8
   macro avg       0.50      0.50      0.47         8
weighted avg       0.62      0.50      0.53         8



In [31]:

sample_review = "The movie had a captivating storyline and exceptional performances."

# Preprocess the sample review
processed_sample_review = preprocess_text(sample_review)

# Convert the processed sample review to a Bag-of-Words representation
sample_review_vectorized = vectorizer.transform([processed_sample_review])


prediction = classifier.predict(sample_review_vectorized)[0]


print("Original Review:")
print(sample_review)
print("\nPredicted Sentiment:")
if prediction == 1:
    print("Positive")
else:
    print("Negative")


Original Review:
The movie had a captivating storyline and exceptional performances.

Predicted Sentiment:
Positive
