In [1]:
import numpy as np
#import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
import math
#nltk.download(stopwords)

In [2]:
# Step 1: Dummy Dataset
# Four toy reviews, paired index-by-index with their sentiment labels.
texts = [
    "I love this product",
    "I hate this product",
    "This is the best product",
    "This is the worst product",
]
sentiments = [1, 0, 1, 0]  # 1 = positive, 0 = negative

In [3]:
# Step 2: Preprocess the Data
def preprocess(text):
    """Turn a raw sentence into a list of stemmed, stopword-free tokens.

    Steps: lowercase the text, replace every non-word character with a
    space, split on whitespace, drop English stopwords, then Porter-stem
    the remaining tokens.

    Args:
        text: The raw input string.

    Returns:
        A list of stemmed tokens in their original order.
    """
    # Fetch the stopword list once per call and keep it in a set, so the
    # membership test below is O(1) instead of re-fetching and scanning the
    # list for every single token.
    english_stopwords = set(stopwords.words('english'))
    ps = PorterStemmer()

    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    return [ps.stem(word) for word in text.split() if word not in english_stopwords]

# Tokenize, filter and stem every review; the result is one token list per text.
processed_texts = [preprocess(text) for text in texts]
print(processed_texts)

[['love', 'product'], ['hate', 'product'], ['best', 'product'], ['worst', 'product']]


In [4]:
# Step 3: Feature Extraction - Bag of Words
def create_bag_of_words(processed_texts):
    """Collect the vocabulary (distinct tokens) across all tokenized documents.

    Args:
        processed_texts: An iterable of token sequences, one per document.
            Each document may be any iterable of hashable tokens (list,
            tuple, ...).

    Returns:
        A dict keys view over the distinct tokens, ordered by first
        occurrence.
    """
    # Stream tokens straight into the Counter. Flattening with
    # sum(lists, []) copies the growing list once per document (quadratic)
    # and only works when every document is a list.
    bag = Counter(word for tokens in processed_texts for word in tokens)
    return bag.keys()

# Vocabulary of distinct tokens; its iteration order defines the order of the feature columns.
vocab = create_bag_of_words(processed_texts)
print(vocab)

dict_keys(['love', 'product', 'hate', 'best', 'worst'])


In [5]:
def text_to_vector(text, vocab):
    """Encode one tokenized document as word counts aligned with ``vocab``.

    Args:
        text: Iterable of tokens for a single document.
        vocab: Iterable of vocabulary words; its order fixes the order of
            the returned counts.

    Returns:
        A list holding, for each vocabulary word in order, how many times
        it occurs in ``text`` (0 when absent).
    """
    occurrences = Counter(text)
    vector = []
    for term in vocab:
        # Counter lookups yield 0 for unseen keys without inserting them.
        vector.append(occurrences[term])
    return vector

# Stack one count vector per document into a (documents x vocabulary) matrix.
features = np.array([text_to_vector(text, vocab) for text in processed_texts])
print(features)

[[1 1 0 0 0]
 [0 1 1 0 0]
 [0 1 0 1 0]
 [0 1 0 0 1]]


In [6]:
# Step 4: Create Target Variable
# Sentiment labels as a NumPy array, in the same order as the entries of `texts`.
labels = np.array(sentiments)

In [24]:
# Step 5: Split the Dataset
# test_size=0.25 on four samples holds out a single review; random_state pins the split.
# NOTE: in the recorded run the held-out review is the only one whose third
# feature column ('hate') is non-zero, so that feature never appears in training.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)
print(X_train,y_train,X_test,y_test)

[[0 1 0 0 1]
 [1 1 0 0 0]
 [0 1 0 1 0]] [0 1 1] [[0 1 1 0 0]] [0]


In [14]:
# Step 6: Initialize Parameters
# One zero weight per feature column of X_train, plus a scalar bias.
weights = np.zeros(X_train.shape[1])
bias = 0

In [15]:
# Step 7: Define the Sigmoid Function
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)).

    Args:
        z: A scalar or array-like of real values.

    Returns:
        The element-wise sigmoid of ``z``: a NumPy scalar for scalar input,
        otherwise an array of the same shape as ``z``.
    """
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.floating):
        z = z.astype(float)

    # exp(-|z|) is always in (0, 1], so it cannot overflow, unlike exp(-z)
    # for large negative z.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)). Same value,
    # each written in its overflow-free form.
    out = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Unwrap 0-d arrays so scalar input still yields a scalar.
    return out[()] if out.ndim == 0 else out

In [16]:
# Step 8 & 9: Calculate Loss (binary cross-entropy between labels and predicted probabilities)
def compute_loss(y, y_hat, eps=1e-15):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Args:
        y: Array of true labels (0 or 1); its first dimension is the
            sample count.
        y_hat: Array of predicted probabilities, same shape as ``y``.
        eps: Predictions are clipped to ``[eps, 1 - eps]`` before the
            logarithm is taken.

    Returns:
        The average cross-entropy loss as a float.
    """
    m = y.shape[0]
    # A prediction of exactly 0 or 1 would make log() return -inf, and
    # 0 * -inf then poisons the sum with NaN; clipping keeps the loss finite.
    y_hat = np.clip(y_hat, eps, 1 - eps)
    return -(1/m) * np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))

In [17]:
# Step 10: Gradient Descent
def update_weights(X, y, weights, bias, learning_rate):
    """Perform one gradient-descent step for logistic regression.

    Args:
        X: Feature matrix with one row per sample.
        y: Array of true labels, one per row of ``X``.
        weights: Current weight vector, one entry per column of ``X``.
        bias: Current scalar bias.
        learning_rate: Step size applied to both gradients.

    Returns:
        A ``(weights, bias)`` tuple with the updated parameters. The input
        ``weights`` array is left untouched.
    """
    m = X.shape[0]
    y_hat = sigmoid(np.dot(X, weights) + bias)
    error = y_hat - y
    d_weight = (1/m) * np.dot(X.T, error)
    d_bias = (1/m) * np.sum(error)
    # Build new values rather than using `-=`: an in-place update would
    # silently modify the caller's array (and fails on integer arrays).
    weights = weights - learning_rate * d_weight
    bias = bias - learning_rate * d_bias
    return weights, bias

In [18]:
# Step 11: Training the Model
def train(X, y, weights, bias, learning_rate, epochs):
    """Fit logistic-regression parameters with batch gradient descent.

    Prints the training loss every 100 epochs.

    Args:
        X: Feature matrix with one row per sample.
        y: Array of true labels, one per row of ``X``.
        weights: Initial weight vector; it is copied, not modified.
        bias: Initial scalar bias.
        learning_rate: Step size passed to each update.
        epochs: Number of gradient-descent steps to run.

    Returns:
        A ``(weights, bias)`` tuple with the fitted parameters.
    """
    # Work on a private float copy so that any in-place update performed
    # downstream never touches the caller's initial array.
    weights = np.array(weights, dtype=float)
    for epoch in range(epochs):
        weights, bias = update_weights(X, y, weights, bias, learning_rate)
        if epoch % 100 == 0:
            # The forward pass is only needed for logging, so skip it on
            # the epochs that print nothing.
            y_hat = sigmoid(np.dot(X, weights) + bias)
            loss = compute_loss(y, y_hat)
            print(f"Epoch {epoch}: Loss {loss}")
    return weights, bias

In [19]:
# Train the model
# Rebinds the global parameters to the fitted values: re-running this cell
# without re-running Step 6 continues from the fitted weights, not from zero.
weights, bias = train(X_train, y_train, weights, bias, learning_rate=0.01, epochs=1000)

Epoch 0: Loss 0.6917604907429468
Epoch 100: Loss 0.5855804095371726
Epoch 200: Loss 0.5151636958954866
Epoch 300: Loss 0.4609420604363206
Epoch 400: Loss 0.4162448937259538
Epoch 500: Loss 0.3783317666272626
Epoch 600: Loss 0.34574435769249817
Epoch 700: Loss 0.31751405407524347
Epoch 800: Loss 0.2929105977407763
Epoch 900: Loss 0.2713519368823209


In [21]:
# Step 12 & 13: Make Predictions and Evaluate the Model
def predict(X, weights, bias):
    """Classify each row of ``X`` with the fitted logistic-regression model.

    Args:
        X: Feature matrix with one row per sample.
        weights: Fitted weight vector.
        bias: Fitted scalar bias.

    Returns:
        A list of ints, 1 where the predicted probability is strictly
        greater than 0.5 and 0 otherwise.
    """
    probabilities = sigmoid(np.dot(X, weights) + bias)
    predicted = []
    for probability in probabilities:
        if probability > 0.5:
            predicted.append(1)
        else:
            predicted.append(0)
    return predicted
# Compute predictions before printing them: printing y_pred first raises
# NameError on a fresh kernel (or shows a stale value from an earlier run).
y_pred = predict(X_test, weights, bias)
print(y_test)
print(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


[0]
[1]
Accuracy: 0.0


In [18]:
# This code sets up a simple logistic regression model for sentiment analysis.