## Reviews Processing App

### Import Libraries

In [23]:
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, clear_output
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split

### Import Data

In [22]:
# Raw CSV columns: title, content (features) and label (0 = negative, 1 = positive)
def load_reviews(csv_path):
    """Read one reviews CSV and keep only the label and the review text.

    Only the ``label`` and ``content`` columns are retained, and ``content``
    is renamed to ``review``.
    """
    raw_frame = pd.read_csv(csv_path)
    selected = raw_frame[['label', 'content']]
    return selected.rename(columns={'content': 'review'})


# Both splits end up with the same two columns: label, review
train_data = load_reviews('data/train.csv')
test_data = load_reviews('data/test.csv')

### Preprocess Data

In [42]:
# TF-IDF features with English stop words removed and the vocabulary capped at 1000 terms
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')

# Fit on the training reviews only; the test reviews are projected onto the
# vocabulary learned from the training set.
train_reviews, test_reviews = train_data['review'], test_data['review']
X_train = vectorizer.fit_transform(train_reviews)
X_test = vectorizer.transform(test_reviews)

### Train Model

In [43]:
# Baseline classifier: logistic regression with default settings
train_labels = train_data['label']
test_labels = test_data['label']

model = LogisticRegression()
model.fit(X_train, train_labels)

# Score the fitted model on the test split
predictions = model.predict(X_test)
accuracy = accuracy_score(test_labels, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.830185


In [46]:
# Quick smoke test: classify a single hand-written review
sample_features = vectorizer.transform(["I like"])
prediction = model.predict(sample_features)
if prediction[0] == 1:
    print('positive')
else:
    print('negative')

positive


In [3]:
# Preprocess the data
# Translation table that deletes every ASCII punctuation character. Built once
# here rather than inside the function, which is applied to each review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalise a review: lowercase it and strip ASCII punctuation.

    Args:
        text: The raw review. ``None`` and float NaN (what pandas produces for
            an empty CSV field) are treated as an empty review; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The lowercased text with every character in
        ``string.punctuation`` removed.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    return text.lower().translate(_PUNCTUATION_TABLE)

In [4]:
# Clean the review text of both splits in place (lowercase, no punctuation)
for frame in (train_data, test_data):
    frame['review'] = frame['review'].apply(preprocess_text)

In [6]:
# Re-vectorize the cleaned reviews, this time with TfidfVectorizer's default settings
vectorizer = TfidfVectorizer()

# Vocabulary and IDF weights come from the training reviews only
cleaned_train_reviews, cleaned_test_reviews = train_data['review'], test_data['review']
X_train = vectorizer.fit_transform(cleaned_train_reviews)
X_test = vectorizer.transform(cleaned_test_reviews)

In [7]:
# Target vectors for the two splits (0 = negative, 1 = positive)
y_train, y_test = train_data['label'], test_data['label']

In [16]:
# Logistic regression on the full TF-IDF vocabulary, with a very generous
# iteration cap for the solver.
max_solver_iterations = 100000
model = LogisticRegression(max_iter=max_solver_iterations)
model.fit(X_train, y_train)

KeyboardInterrupt: 

In [9]:
# Evaluate the model on the test split
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {test_accuracy}')

Accuracy: 0.886365


In [10]:
def predict_review(review):
    """Classify a single raw review as 'Positive' or 'Negative'.

    The text is cleaned with ``preprocess_text``, vectorized with the
    notebook-level ``vectorizer`` and scored with the notebook-level ``model``.

    Args:
        review: The raw review text.

    Returns:
        str: 'Positive' when the predicted label is 1, otherwise 'Negative'.
    """
    cleaned = preprocess_text(review)
    # The vectorizer expects an iterable of documents, hence the 1-element list
    features = vectorizer.transform([cleaned])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return 'Positive'
    return 'Negative'

In [15]:
# Example usage: classify one hand-written review
review = "This product is not bad"
predicted_sentiment = predict_review(review)
print(predicted_sentiment)

Negative


In [None]:
# Train the model with SGDClassifier
# Logistic regression fitted by SGD, one partial_fit call per epoch, so that
# the training log loss and the test accuracy can be recorded after every epoch.
N_EPOCHS = 1000     # number of partial_fit calls
RANDOM_STATE = 42   # seed so that re-running this cell reproduces the curves

# A seeded RandomState instance (rather than a bare int) is passed so that
# successive partial_fit calls keep drawing from one reproducible stream.
# partial_fit runs a single epoch per call; the loop below controls the total.
model = SGDClassifier(
    loss='log_loss',
    max_iter=1,
    tol=None,
    warm_start=True,
    random_state=np.random.RandomState(RANDOM_STATE),
)
losses = []
accuracies = []

# Set up the plot
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))
line1, = ax1.plot(losses, label='Log Loss')
line2, = ax2.plot(accuracies, label='Accuracy')
ax1.set_xlabel('Iteration')
ax1.set_ylabel('Log Loss')
ax1.set_title('Convergence of Logistic Regression')
ax2.set_xlabel('Iteration')
ax2.set_ylabel('Accuracy')
ax2.set_title('Accuracy over Iterations')
ax1.legend()
ax2.legend()
# Keep the upper x-label from colliding with the lower title
fig.tight_layout()
# The figure is rendered explicitly with display(fig) below. Detaching it from
# pyplot keeps the notebook from drawing an extra static copy when the cell ends.
plt.close(fig)

for i in range(N_EPOCHS):  # Number of iterations
    model.partial_fit(X_train, y_train, classes=[0, 1])

    # Training log loss after this epoch
    y_pred_proba = model.predict_proba(X_train)
    loss = log_loss(y_train, y_pred_proba, labels=model.classes_)
    losses.append(loss)

    # Test accuracy after this epoch
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    # Update the plot
    iterations = range(len(losses))
    line1.set_data(iterations, losses)
    line2.set_data(iterations, accuracies)
    for ax in (ax1, ax2):
        ax.relim()
        ax.autoscale_view()
    # wait=True replaces the previous frame only once the new one is ready,
    # which avoids flicker.
    clear_output(wait=True)
    display(fig)

# Final evaluation
# The model has not changed since the last loop iteration, so the accuracy
# recorded there is the final test accuracy.
print(f'Final Accuracy: {accuracies[-1]}')