### Mount Drive

In [1]:
# Colab-only step: mount Google Drive at /content/drive so the project folder
# stored on Drive can be reached from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Make the project folder the working directory: the local import
# (`from encoder import Model`) and the relative 'data/' paths used later are
# resolved against it.
%cd /content/drive/MyDrive/replication-of-generating-reviews-discovering-sentiment/

/content/drive/MyDrive/replication-of-generating-reviews-discovering-sentiment


In [3]:
# Sanity check: list the project folder (expects encoder.py, utils.py, data/ and model/).
!ls

data  demo.ipynb  encoder.py  LICENSE  model  __pycache__  README.md  sst_binary_demo.py  utils.py


## 1. Import Dependencies

In [4]:
import os
import html
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, roc_auc_score, f1_score
from encoder import Model

Instructions for updating:
non-resource variables are not supported in the long term


## 2. Demo The TextVectorization Model

In [5]:
# Smoke test: build the encoder defined in encoder.py and featurize one sentence.
# NOTE: `model` is reused later in the notebook to encode every dataset split.
model = Model()
text = ["I couldn't figure out"]
# transform() is given a list of strings and returns one feature row per string.
text_features = model.transform(text)
print(text_features)

Instructions for updating:
dim is deprecated, use axis instead


5.612 seconds to transform 1 examples
[[-0.12958631 -0.7414906   0.06320142 ...  0.12817442  0.07800508
   0.14032528]]


In [6]:
# Shape is (number of input texts, feature size); the recorded run shows (1, 4096).
print(text_features.shape)

(1, 4096)


In [7]:
# The features come back as a NumPy array; they are later passed directly to scikit-learn.
print(type(text_features))

<class 'numpy.ndarray'>


## 3. Define Helper Functions

In [8]:
def load_sst(path):
    """Read one sentiment CSV file and split it into texts and labels.

    Args:
        path: Path of a CSV file containing a ``sentence`` column (the text)
            and a ``label`` column (the target).

    Returns:
        A ``(X, Y)`` pair where ``X`` is a plain Python list built from the
        ``sentence`` column and ``Y`` is the array behind the ``label`` column.
    """
    frame = pd.read_csv(path)
    # Texts are handed back as a list, labels stay in array form.
    return frame['sentence'].values.tolist(), frame['label'].values

def sst_binary(data_dir='data/',train_dir="train_binary_sent.csv", val_dir="dev_binary_sent.csv", test_dir="test_binary_sent.csv"):
    """
    Load the train, validation and test splits of a binary sentiment dataset.

    Most standard models work on a preprocessed/tokenized/lowercased version
    of the Stanford Sentiment Treebank. This model instead extracts features
    from the raw text, which is the version shipped in the data folder.

    Args:
        data_dir: Folder that the three file names are joined onto.
        train_dir: CSV file of the training split (a file path, despite the name).
        val_dir: CSV file of the validation split.
        test_dir: CSV file of the test split.

    Returns:
        ``trX, vaX, teX, trY, vaY, teY`` -- the three text lists followed by
        the three label arrays, as produced by ``load_sst``.
    """
    # Load the splits in train -> validation -> test order.
    loaded = [
        load_sst(os.path.join(data_dir, file_name))
        for file_name in (train_dir, val_dir, test_dir)
    ]
    (trX, trY), (vaX, vaY), (teX, teY) = loaded
    return trX, vaX, teX, trY, vaY, teY

def preprocess(text, front_pad='\n ', end_pad=' '):
    """Normalize a raw string and return it as padded bytes.

    HTML entities are unescaped, newlines become spaces, surrounding
    whitespace is stripped, then ``front_pad`` and ``end_pad`` are attached
    and the result is encoded with ``str.encode()`` (UTF-8 by default).

    Args:
        text: Input string.
        front_pad: String placed before the cleaned text.
        end_pad: String placed after the cleaned text.

    Returns:
        The padded text as ``bytes``.
    """
    cleaned = html.unescape(text).replace('\n', ' ').strip()
    return (front_pad + cleaned + end_pad).encode()

def train_with_reg_cv(trX, trY, vaX, vaY, teX=None, teY=None, penalty='l1',C=2**np.arange(-8, 1).astype(np.float64), seed=42):
    """Select the regularization strength on validation data, refit and score.

    One liblinear ``LogisticRegression`` is fitted on the training data for
    every candidate in ``C`` and scored on the validation data. The candidate
    with the highest validation score (first one on ties, per ``np.argmax``)
    is then used to fit a final model on the training data.

    Args:
        trX, trY: Training features and labels.
        vaX, vaY: Validation features and labels used to choose ``C``.
        teX, teY: Optional test features and labels. When both are given the
            returned score is measured on them, otherwise on the validation data.
        penalty: Penalty passed to ``LogisticRegression``.
        C: Candidate inverse regularization strengths. The default is
            2**-8 ... 2**0; the exponents are cast to float before the power
            is taken.
        seed: Base random state; candidate ``i`` uses ``seed + i`` and the
            final model uses ``seed + len(C)``.

    Returns:
        ``(score, c, nnotzero, model)``: score in percent, the chosen ``C``
        value, the number of non-zero coefficients of the final model, and the
        final fitted model.
    """
    def _fit(strength, offset):
        # Every fit gets its own random_state derived from the base seed.
        clf = LogisticRegression(C=strength, penalty=penalty, random_state=seed+offset, solver='liblinear')
        clf.fit(trX, trY)
        return clf

    val_scores = [_fit(strength, idx).score(vaX, vaY) for idx, strength in enumerate(C)]
    best_c = C[np.argmax(val_scores)]

    # Refit with the winning strength (uses a fresh random_state).
    best_model = _fit(best_c, len(C))
    n_nonzero = np.sum(best_model.coef_ != 0)

    has_test = teX is not None and teY is not None
    evalX, evalY = (teX, teY) if has_test else (vaX, vaY)
    return best_model.score(evalX, evalY)*100., best_c, n_nonzero, best_model

## 4. Load the data

Let's take a moment to understand the format of the data. Each example is a piece of review text (a movie-review sentence for SST, a longer product or business review for Amazon and Yelp) with a corresponding label. The text is not preprocessed in any way. The label is an integer, either 0 or 1, where 0 is a negative review and 1 is a positive review.

### SST

In [10]:
# CSV files of the three SST splits, relative to the data/ folder.
SST_train_dir, SST_val_dir, SST_test_dir = (
    'SST/SST_train.csv',
    'SST/SST_val.csv',
    'SST/SST_test.csv',
)
# sst_binary returns the three text lists first, then the three label arrays.
(SST_trX, SST_vaX, SST_teX,
 SST_trY, SST_vaY, SST_teY) = sst_binary(
    data_dir='data/',
    train_dir=SST_train_dir,
    val_dir=SST_val_dir,
    test_dir=SST_test_dir,
)

In [11]:
# Report how many examples the training and test splits contain.
print("Training entries: {}, test entries: {}".format(len(SST_trX), len(SST_teX)))

Training entries: 6920, test entries: 1821


Let's print the first 10 training examples.

In [12]:
# Preview: first ten raw training sentences.
SST_trX[:10]

['A stirring, funny and finally transporting re-imagining of Beauty and the Beast and 1930s horror films',
 'Apparently reassembled from the cutting-room floor of any given daytime soap.',
 "They presume their audience won't sit still for a sociology lesson, however entertainingly presented, so they trot out the conventional science-fiction elements of bug-eyed monsters and futuristic women in skimpy clothes.",
 'This is a visually stunning rumination on love, memory, history and the war between art and commerce.',
 "Jonathan Parker's Bartleby should have been the be-all-end-all of the modern-office anomie films.",
 'Campanella gets the tone just right -- funny in the middle of sad in the middle of hopeful.',
 'A fan film that for the uninitiated plays better on video with the sound turned down.',
 'Béart and Berling are both superb, while Huppert ... is magnificent.',
 'A little less extreme than in the past, with longer exposition sequences between them, and with fewer gags to break 

Let's also print the first 10 labels.

In [13]:
# Labels of the same ten training sentences (0 = negative, 1 = positive).
SST_trY[:10]

array([1, 0, 0, 1, 1, 1, 0, 1, 0, 0])

### Amazon

In [14]:
# CSV files of the three Amazon splits, relative to the data/ folder.
Amazon_train_dir, Amazon_val_dir, Amazon_test_dir = (
    'Amazon/amazon_train.csv',
    'Amazon/amazon_val.csv',
    'Amazon/amazon_test.csv',
)
# sst_binary returns the three text lists first, then the three label arrays.
(Amazon_trX, Amazon_vaX, Amazon_teX,
 Amazon_trY, Amazon_vaY, Amazon_teY) = sst_binary(
    data_dir='data/',
    train_dir=Amazon_train_dir,
    val_dir=Amazon_val_dir,
    test_dir=Amazon_test_dir,
)

In [15]:
# Report how many examples the training and test splits contain.
print("Training entries: {}, test entries: {}".format(len(Amazon_trX), len(Amazon_teX)))

Training entries: 24238, test entries: 6926


Let's print the first 5 training examples.

In [16]:
# Preview: first five raw training reviews (these are much longer than the SST sentences).
Amazon_trX[:5]

["Purchased this as an upgrade to the first generation Kindle Paperwhite. The backlight is greatly improved, the display is of a higher quality, the words are clearer and the internal process is a lot quicker  opening books and page flipping is noticeably speedier.The one thing that does annoy me about the 3rd generation Paperwhite is the way it's manufactured  when you hold it at a certain angle and you look at the bottom of the screen, you can see a small section of the display covered in the e-ink used to render the words and images on the rest of the screen. At first I thought this was a manufacturing defect, so I took it back to Best Buy and exchanged it for the same model, only to find that the second model had the same issue, except more-visible.This appears to have something to do with the way the display is placed behind the bezel during the manufacturing process. When the Paperwhite first came out, it was the crown jewel of Amazon's e-reader line, and Amazon was careful to pr

Let's also print the first 5 labels.

In [17]:
# Labels of the same five training reviews (0 = negative, 1 = positive).
Amazon_trY[:5]

array([1, 1, 1, 1, 1])

### Yelp

In [18]:
# Yelp data: CSV files of the three splits, relative to the data/ folder.
Yelp_train_dir, Yelp_val_dir, Yelp_test_dir = (
    'Yelp/yelp_train.csv',
    'Yelp/yelp_val.csv',
    'Yelp/yelp_test.csv',
)
# sst_binary returns the three text lists first, then the three label arrays.
(Yelp_trX, Yelp_vaX, Yelp_teX,
 Yelp_trY, Yelp_vaY, Yelp_teY) = sst_binary(
    data_dir='data/',
    train_dir=Yelp_train_dir,
    val_dir=Yelp_val_dir,
    test_dir=Yelp_test_dir,
)

In [19]:
# Report how many examples the training and test splits contain.
print("Training entries: {}, test entries: {}".format(len(Yelp_trX), len(Yelp_teX)))

Training entries: 416000, test entries: 40000


Let's print the first 5 test examples.

In [24]:
# Preview: first five raw reviews of the *test* split.
# NOTE(review): the SST and Amazon previews show the training split instead.
Yelp_teX[:5]

['I got \'new\' tires from them and within two weeks got a flat. I took my car to a local mechanic to see if i could get the hole patched, but they said the reason I had a flat was because the previous patch had blown - WAIT, WHAT? I just got the tire and never needed to have it patched? This was supposed to be a new tire. \\nI took the tire over to Flynn\'s and they told me that someone punctured my tire, then tried to patch it. So there are resentful tire slashers? I find that very unlikely. After arguing with the guy and telling him that his logic was far fetched he said he\'d give me a new tire \\"this time\\". \\nI will never go back to Flynn\'s b/c of the way this guy treated me and the simple fact that they gave me a used tire!',
 "Don't waste your time.  We had two different people come to our house to give us estimates for a deck (one of them the OWNER).  Both times, we never heard from them.  Not a call, not the estimate, nothing.",
 'All I can say is the worst! We were the o

Let's also print the first 5 test labels.

In [26]:
# Labels of the same five test reviews (0 = negative, 1 = positive).
Yelp_teY[:5]

array([0, 0, 0, 0, 0])

## 5. Defining and Fitting the Classifiers

We reproduce the original code, fitting a logistic regression on top of the text vectorizer model's features to classify the sentiment.

### SST

In [27]:
# Encode every split into feature vectors. `model` is the encoder instance
# created in the demo section; each transform() call prints its own timing.
SST_trXt = model.transform(SST_trX)
SST_vaXt = model.transform(SST_vaX)
SST_teXt = model.transform(SST_teX)
# classification: C is chosen on the validation split; because the test split
# is passed as well, the returned score is the test accuracy (in percent).
SST_full_rep_acc, SST_c, SST_nnotzero, SST_model = train_with_reg_cv(SST_trXt, SST_trY, SST_vaXt, SST_vaY, SST_teXt, SST_teY)
print('%05.2f test accuracy'%SST_full_rep_acc)
# NOTE(review): '%05.2f' keeps two decimals, so the smallest candidates of the
# default C grid (e.g. 2**-8) would be displayed as 00.00.
print('%05.2f regularization coef'%SST_c)
# Number of non-zero coefficients in the final logistic regression.
print('%05d features used'%SST_nnotzero)

46.254 seconds to transform 6920 examples
6.345 seconds to transform 872 examples
12.782 seconds to transform 1821 examples
91.76 test accuracy
00.25 regularization coef
00141 features used


### Amazon

In [None]:
# Encode every split into feature vectors. `model` is the encoder instance
# created in the demo section; each transform() call prints its own timing.
# NOTE(review): the recorded output stops after the validation transform, so
# this cell does not appear to have run to completion -- TODO confirm and re-run.
Amazon_trXt = model.transform(Amazon_trX)
Amazon_vaXt = model.transform(Amazon_vaX)
Amazon_teXt = model.transform(Amazon_teX)
# classification: C is chosen on the validation split; because the test split
# is passed as well, the returned score is the test accuracy (in percent).
Amazon_full_rep_acc, Amazon_c, Amazon_nnotzero, Amazon_model = train_with_reg_cv(Amazon_trXt, Amazon_trY, Amazon_vaXt, Amazon_vaY, Amazon_teXt, Amazon_teY)
print('%05.2f test accuracy'%Amazon_full_rep_acc)
# NOTE(review): '%05.2f' keeps two decimals, so the smallest candidates of the
# default C grid (e.g. 2**-8) would be displayed as 00.00.
print('%05.2f regularization coef'%Amazon_c)
# Number of non-zero coefficients in the final logistic regression.
print('%05d features used'%Amazon_nnotzero)

250.987 seconds to transform 24238 examples
33.988 seconds to transform 3462 examples


### Yelp

In [None]:
# Encode every split into feature vectors. `model` is the encoder instance
# created in the demo section; each transform() call prints its own timing.
# NOTE(review): no output is recorded for this cell; with 416000 training
# reviews it is by far the most expensive step -- TODO confirm it completes.
Yelp_trXt = model.transform(Yelp_trX)
Yelp_vaXt = model.transform(Yelp_vaX)
Yelp_teXt = model.transform(Yelp_teX)
# classification: C is chosen on the validation split; because the test split
# is passed as well, the returned score is the test accuracy (in percent).
Yelp_full_rep_acc, Yelp_c, Yelp_nnotzero, Yelp_model = train_with_reg_cv(Yelp_trXt, Yelp_trY, Yelp_vaXt, Yelp_vaY, Yelp_teXt, Yelp_teY)
print('%05.2f test accuracy'%Yelp_full_rep_acc)
# NOTE(review): '%05.2f' keeps two decimals, so the smallest candidates of the
# default C grid (e.g. 2**-8) would be displayed as 00.00.
print('%05.2f regularization coef'%Yelp_c)
# Number of non-zero coefficients in the final logistic regression.
print('%05d features used'%Yelp_nnotzero)

## 6. Evaluation

### Distribution of Results

In [None]:
# Column of the feature matrix that holds the "sentiment unit".
# NOTE(review): 2388 is the index used by the original OpenAI demo script
# (sst_binary_demo.py); confirm it against the copy shipped in this repo.
SENTIMENT_UNIT_INDEX = 2388

# The histograms below previously referenced *_sentiment_unit without ever
# defining them. They are taken from the training features so that they line
# up with the training labels used for the neg/pos masks.
SST_sentiment_unit = SST_trXt[:, SENTIMENT_UNIT_INDEX]
Amazon_sentiment_unit = Amazon_trXt[:, SENTIMENT_UNIT_INDEX]
Yelp_sentiment_unit = Yelp_trXt[:, SENTIMENT_UNIT_INDEX]


def _plot_sentiment_hist(ax, unit, labels, title):
    """Draw overlaid histograms of one unit's values for negative and positive examples.

    Args:
        ax: Matplotlib axes to draw on.
        unit: 1-D array holding the unit's value for every example.
        labels: Array of 0/1 labels aligned with ``unit``.
        title: Title of the panel.
    """
    ax.hist(unit[labels == 0], bins=25, alpha=0.5, label='neg')
    ax.hist(unit[labels == 1], bins=25, alpha=0.5, label='pos')
    ax.set_title(title)
    ax.set_xlabel('Unit value')
    ax.set_ylabel('Number of examples')
    ax.legend()


# Create a single figure with one panel per dataset
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

_plot_sentiment_hist(axes[0], SST_sentiment_unit, SST_trY, 'SST Sentiment Unit')
_plot_sentiment_hist(axes[1], Amazon_sentiment_unit, Amazon_trY, 'Amazon Sentiment Unit')
_plot_sentiment_hist(axes[2], Yelp_sentiment_unit, Yelp_trY, 'Yelp Sentiment Unit')

# Adjust layout for better spacing
plt.tight_layout()

# Show the combined plot
plt.show()

### Accuracy and F1 score

In [None]:
# Predict once per dataset and reuse the predictions for both metrics; the
# original called predict() twice on every (potentially large) test set.

# Calculate accuracy and F1 score for SST dataset
SST_te_pred = SST_model.predict(SST_teXt)
SST_accuracy = accuracy_score(SST_teY, SST_te_pred)
SST_f1 = f1_score(SST_teY, SST_te_pred)

# Calculate accuracy and F1 score for Amazon dataset
Amazon_te_pred = Amazon_model.predict(Amazon_teXt)
Amazon_accuracy = accuracy_score(Amazon_teY, Amazon_te_pred)
Amazon_f1 = f1_score(Amazon_teY, Amazon_te_pred)

# Calculate accuracy and F1 score for Yelp dataset
Yelp_te_pred = Yelp_model.predict(Yelp_teXt)
Yelp_accuracy = accuracy_score(Yelp_teY, Yelp_te_pred)
Yelp_f1 = f1_score(Yelp_teY, Yelp_te_pred)

# Print accuracy and F1 score for all datasets (as percentages, two decimals)
print(f'''Accuracy: SST data - {round(SST_accuracy*100, 2)}%
F1 Score: SST data - {round(SST_f1*100, 2)}%
Accuracy: Amazon data - {round(Amazon_accuracy*100, 2)}%
F1 Score: Amazon data - {round(Amazon_f1*100, 2)}%
Accuracy: Yelp data - {round(Yelp_accuracy*100, 2)}%
F1 Score: Yelp data - {round(Yelp_f1*100, 2)}%''')

### Classification report

In [None]:
# Each report is computed from the fitted model's predictions on the
# corresponding test split and printed as text.

# Calculate and print classification report for SST dataset
SST_report = classification_report(SST_teY, SST_model.predict(SST_teXt))
print("Classification Report - SST data:")
print(SST_report)

# Calculate and print classification report for Amazon dataset
Amazon_report = classification_report(Amazon_teY, Amazon_model.predict(Amazon_teXt))
print("Classification Report - Amazon data:")
print(Amazon_report)

# Calculate and print classification report for Yelp dataset
Yelp_report = classification_report(Yelp_teY, Yelp_model.predict(Yelp_teXt))
print("Classification Report - Yelp data:")
print(Yelp_report)

### ROC curve

In [None]:
# The original cell called `auc`, which is never imported (NameError on a
# fresh kernel). `roc_auc_score` is already imported and yields the same area
# under the ROC curve from the labels and the positive-class scores.


def _plot_roc(ax, fpr, tpr, auc_value, name, color):
    """Draw one ROC curve and the chance diagonal on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        fpr: False positive rates returned by ``roc_curve``.
        tpr: True positive rates returned by ``roc_curve``.
        auc_value: Area under the curve, shown in the legend.
        name: Dataset name used in the legend and the title.
        color: Line colour of the ROC curve.
    """
    ax.plot(fpr, tpr, color=color, lw=2, label='{} ROC curve (area = {:.2f})'.format(name, auc_value))
    # Dashed diagonal: performance of a random classifier.
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('{} ROC Curve'.format(name))
    ax.legend(loc="lower right")


# Positive-class probabilities are computed once per dataset and reused for
# both the curve and the AUC.

# Calculate ROC curve and AUC for SST dataset
SST_te_proba = SST_model.predict_proba(SST_teXt)[:, 1]
SST_fpr, SST_tpr, _ = roc_curve(SST_teY, SST_te_proba)
SST_auc = roc_auc_score(SST_teY, SST_te_proba)

# Calculate ROC curve and AUC for Amazon dataset
Amazon_te_proba = Amazon_model.predict_proba(Amazon_teXt)[:, 1]
Amazon_fpr, Amazon_tpr, _ = roc_curve(Amazon_teY, Amazon_te_proba)
Amazon_auc = roc_auc_score(Amazon_teY, Amazon_te_proba)

# Calculate ROC curve and AUC for Yelp dataset
Yelp_te_proba = Yelp_model.predict_proba(Yelp_teXt)[:, 1]
Yelp_fpr, Yelp_tpr, _ = roc_curve(Yelp_teY, Yelp_te_proba)
Yelp_auc = roc_auc_score(Yelp_teY, Yelp_te_proba)

# Create subplots: one panel per dataset
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

_plot_roc(axes[0], SST_fpr, SST_tpr, SST_auc, 'SST', 'darkorange')
_plot_roc(axes[1], Amazon_fpr, Amazon_tpr, Amazon_auc, 'Amazon', 'blue')
_plot_roc(axes[2], Yelp_fpr, Yelp_tpr, Yelp_auc, 'Yelp', 'green')

# Keep axis labels of neighbouring panels from overlapping
plt.tight_layout()
plt.show()