In [9]:
# Sample text at increasing granularity: a character, a token, a document,
# and a corpus (a list of documents).
my_character = "i"
my_token = "hello"
my_doc = "hello, I love pineapple on pizza!"

# NOTE(review): "pineaaple" in the third document looks like a typo -- confirm
# whether the misspelling is intentional before changing the data.
my_corpus = [
    "I love pineapple on pizza. I think it's good!",
    "Pineapple on pizza is so bad.",
    "I HATE this pineaaple-on-pizza trend.",
    "I am loving this big pizza I got with pineapple on it.",
]

In [10]:
# Naive whitespace tokenization: punctuation stays attached to the adjacent word
# (e.g. 'hello,' and 'pizza!').
my_doc.split()

['hello,', 'I', 'love', 'pineapple', 'on', 'pizza!']

In [11]:
# Sentiment Analysis -> positive or negative (or neutral)
good_tokens = ["good", "love"]
bad_tokens = ["bad", "hate"]

def predict(document, preprocess=None, good_tokens=good_tokens, bad_tokens=bad_tokens):
    """Classify a document's sentiment by counting known good and bad tokens.

    Args:
        document: Text to classify.
        preprocess: Optional callable that turns the document into an iterable
            of tokens. When None, the document is split on whitespace.
        good_tokens: Tokens that each add 1 to the sentiment score.
        bad_tokens: Tokens that each subtract 1 from the sentiment score.

    Returns:
        "Positive" when the net score is above zero, "Negative" when it is
        below zero, and "Neutral" otherwise.
    """
    # Identity check: only a literal None means "no preprocessor supplied".
    if preprocess is None:
        tokens = document.split()
    else:
        tokens = preprocess(document)

    # Matching is exact, so case, punctuation and inflection must already have
    # been handled by `preprocess` for a token to count.
    sentiment = 0
    for token in tokens:
        if token in good_tokens:
            sentiment += 1
        elif token in bad_tokens:
            sentiment -= 1

    if sentiment > 0:
        return "Positive"
    elif sentiment < 0:
        return "Negative"
    else:
        return "Neutral"



# Baseline run: classify every corpus document with predict()'s default
# whitespace tokenization and print the document next to its predicted label.
for doc in my_corpus:
  prediction = predict(doc)
  print(f"Document: {doc}\nPrediction: {prediction}\n")

Document: I love pineapple on pizza. I think it's good!
Preddiction: Postive

Document: Pineapple on pizza is so bad.
Preddiction: Neutral

Document: I HATE this pineaaple-on-pizza trend.
Preddiction: Neutral

Document: I am loving this big pizza I got with pineapple on it.
Preddiction: Neutral



In [12]:
import nltk
# Fetch the "punkt" tokenizer data used for tokenization below.
# NOTE(review): newer NLTK releases may look for a "punkt_tab" resource
# instead -- confirm against the installed NLTK version.
nltk.download("punkt")

# Tokenize the first corpus document; punctuation and the "'s" clitic come out
# as separate tokens rather than staying attached to words.
nltk.word_tokenize(my_corpus[0])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['I',
 'love',
 'pineapple',
 'on',
 'pizza',
 '.',
 'I',
 'think',
 'it',
 "'s",
 'good',
 '!']

In [13]:
# Classify each corpus document, tokenizing with nltk.word_tokenize instead of
# predict()'s default whitespace split, and print the result per document.
for doc in my_corpus:
  prediction = predict(doc, preprocess=nltk.word_tokenize)
  print(f"Document: {doc}\nPrediction: {prediction}\n")

Document: I love pineapple on pizza. I think it's good!
Prediction: Postive

Document: Pineapple on pizza is so bad.
Prediction: Negative

Document: I HATE this pineaaple-on-pizza trend.
Prediction: Neutral

Document: I am loving this big pizza I got with pineapple on it.
Prediction: Neutral



In [19]:
def normalize_caps_and_tokenize(document):
  """Lower-case `document` and tokenize the result with nltk.word_tokenize.

  Args:
    document: Text to tokenize.

  Returns:
    The tokens produced by nltk.word_tokenize for the lower-cased text.
  """
  return nltk.word_tokenize(document.lower())

# Classify each corpus document after lower-casing and NLTK tokenization, so
# token matching no longer depends on capitalization.
for doc in my_corpus:
  prediction = predict(doc, preprocess=normalize_caps_and_tokenize)
  print(f"Document: {doc}\nPrediction: {prediction}\n")

Document: I love pineapple on pizza. I think it's good!
Prediction: Postive

Document: Pineapple on pizza is so bad.
Prediction: Negative

Document: I HATE this pineaaple-on-pizza trend.
Prediction: Negative

Document: I am loving this big pizza I got with pineapple on it.
Prediction: Neutral



In [22]:
# Lemmatization: reduce an inflected word to its base form (its lemma), e.g.
#   "having" -> "have"
#   "better" -> "good"
# The lemmatizer is given a part of speech for each token, so tokens are
# POS-tagged first.

from collections import defaultdict

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Data packages downloaded for the lemmatizer and the POS tagger.
for resource in ('omw-1.4', 'wordnet', 'averaged_perceptron_tagger'):
  nltk.download(resource)

doc = "I was having a better time."
print(f"doc: {doc}")

tokens = nltk.word_tokenize(doc)
print(f"tokens: {tokens}")

tagged_tokens = nltk.pos_tag(tokens)
print(f"tagged_tokens: {tagged_tokens}")

# Keyed by the first letter of the POS tag (e.g. 'VBD' -> 'V', 'JJR' -> 'J');
# any letter not listed here falls back to NOUN.
tag_map = defaultdict(
    lambda: wordnet.NOUN,
    {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV},
)

lemmatizer = WordNetLemmatizer()

# pos --> part of speech
lemmas = [
    lemmatizer.lemmatize(token, tag_map[pos[0]])
    for token, pos in tagged_tokens
]

print(f"lemmas: {lemmas}")


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


doc: I was having a better time.
tokens: ['I', 'was', 'having', 'a', 'better', 'time', '.']
tagged_tokens: [('I', 'PRP'), ('was', 'VBD'), ('having', 'VBG'), ('a', 'DT'), ('better', 'JJR'), ('time', 'NN'), ('.', '.')]
lemmas: ['I', 'be', 'have', 'a', 'good', 'time', '.']


In [24]:
def preprocess_with_lemmatization(document):
  """Lower-case, tokenize, POS-tag and lemmatize `document`.

  Relies on the notebook-level `tag_map` and `lemmatizer` objects, which must
  already exist when this function is called.

  Args:
    document: Text to preprocess.

  Returns:
    A list with one lemma per token, in token order.
  """
  tagged = nltk.pos_tag(nltk.word_tokenize(document.lower()))
  # The first letter of each POS tag selects the part of speech passed to the
  # lemmatizer via tag_map.
  return [lemmatizer.lemmatize(word, tag_map[tag[0]]) for word, tag in tagged]

# Quick check on the same sample sentence used in the lemmatization walkthrough.
preprocess_with_lemmatization("I was having a better time.")

['i', 'be', 'have', 'a', 'good', 'time', '.']

In [26]:
# Classify each corpus document using the lemmatizing preprocessor, so
# inflected forms such as "loving" can match entries in the token lists.
for document in my_corpus:
  prediction = predict(document, preprocess=preprocess_with_lemmatization)
  print(f"Document: {document}\nPrediction: {prediction}\n")

Document: I love pineapple on pizza. I think it's good!
Prediction: Postive

Document: Pineapple on pizza is so bad.
Prediction: Negative

Document: I HATE this pineaaple-on-pizza trend.
Prediction: Negative

Document: I am loving this big pizza I got with pineapple on it.
Prediction: Postive



In [27]:
!pip install gdown
import pandas as pd
import gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [28]:
# Download the reviews CSV from Google Drive with gdown and save it locally
# under the name held in `output`.
url = "https://drive.google.com/uc?id=1-WZKE5xHw-3m_SL_PtOgwkzdFROIWqih"
output = "reviews.csv"
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1-WZKE5xHw-3m_SL_PtOgwkzdFROIWqih
To: /content/reviews.csv
100%|██████████| 301M/301M [00:02<00:00, 146MB/s]


'reviews.csv'

In [29]:
# Load the downloaded CSV into a DataFrame.
df = pd.read_csv(output) 
# Keep rows whose Score is positive, then take the first 1000 of them.
df = df[df['Score'] > 0][:1000] 
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [30]:
# Show only the two free-text columns of each review.
df[['Summary', 'Text']]

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...
...,...,...
995,Hot & Flavorful,BLACK MARKET HOT SAUCE IS WONDERFUL.... My hus...
996,Great Hot Sauce and people who run it!,"Man what can i say, this salsa is the bomb!! i..."
997,this sauce is the shiznit,this sauce is so good with just about anything...
998,Not Hot,Not hot at all. Like the other low star review...


In [37]:
def predict_stars(document, preprocess=None, good_tokens=good_tokens, bad_tokens=bad_tokens):
  """Predict a 1-5 star rating from the net count of good and bad tokens.

  Args:
    document: Text to rate.
    preprocess: Optional callable that turns the document into an iterable of
      tokens. When falsy, the document is split on whitespace.
    good_tokens: Tokens that each add 1 to the score.
    bad_tokens: Tokens that each subtract 1 from the score.

  Returns:
    An int from 1 to 5: a net score of 0 gives 3, each net point moves the
    rating by one star, and the result is capped at 1 and 5.
  """
  tokens = preprocess(document) if preprocess else document.split()

  # +1 for a good token, -1 for a bad one, 0 for anything else.
  score = sum(
      1 if token in good_tokens else (-1 if token in bad_tokens else 0)
      for token in tokens
  )

  # Centre on 3 stars and clamp into the valid 1..5 range.
  return max(1, min(5, 3 + score))

In [38]:
def final_predict_stars(text):
  """Rate `text` with predict_stars using the lemmatizing preprocessor."""
  return predict_stars(text, preprocess=preprocess_with_lemmatization)

# Store one predicted star rating per review in a new 'Prediction' column.
df['Prediction'] = df['Text'].apply(final_predict_stars)

In [39]:
# Preview the first rows, now including the 'Prediction' column.
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Prediction
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,5
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,3
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,3
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,4
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,3


In [40]:
# Exact-match accuracy: the share of rows whose predicted star rating equals
# the actual Score.
total_cases = len(df)
correct_cases = (df["Prediction"] == df["Score"]).sum()
accuracy = correct_cases / total_cases

accuracy

0.198

In [41]:
# Larger sentiment lexicons than the two-word default token lists.
# NOTE(review): tokens are compared after lemmatization, so inflected entries
# such as "impressed" or "disappointed" only match when the lemmatizer leaves
# them unchanged -- confirm against real preprocessor output.
more_bad_tokens = ["bad", "poor", "terrible", "hate", "disappointed"]
more_good_tokens = ["good", "great", "wonderful", "love", "impressed"]

def final_predict_stars(text):
  """Rate `text` with predict_stars using lemmatization and the larger lexicons."""
  return predict_stars(
      text,
      preprocess=preprocess_with_lemmatization,
      good_tokens=more_good_tokens,
      bad_tokens=more_bad_tokens,
  )

# Recompute the 'Prediction' column with the extended token lists.
df['Prediction'] = df['Text'].apply(final_predict_stars)

In [42]:
# Exact-match accuracy of the current 'Prediction' column against the actual
# Score.
total_cases = len(df)
correct_cases = (df["Prediction"] == df["Score"]).sum()
accuracy = correct_cases / total_cases

accuracy

0.297