In [1]:
!pip install gdown
import pandas as pd
import gdown

url = "https://drive.google.com/uc?id=1-WZKE5xHw-3m_SL_PtOgwkzdFROIWqih"
output = "reviews.csv"
gdown.download(url, output, quiet=False)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Downloading...
From: https://drive.google.com/uc?id=1-WZKE5xHw-3m_SL_PtOgwkzdFROIWqih
To: /content/reviews.csv
100%|██████████| 301M/301M [00:02<00:00, 139MB/s]


'reviews.csv'

In [25]:
raw_df = pd.read_csv(output)
raw_df = raw_df[raw_df['Score'] > 0]
df = raw_df[:2000]

In [26]:
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [27]:
import nltk 
nltk.download('punkt') 
from nltk.stem import WordNetLemmatizer 
nltk.download('omw-1.4') 
nltk.download('wordnet') 
nltk.download('averaged_perceptron_tagger') 
from nltk.corpus import wordnet 
from collections import defaultdict 
lemmatizer = WordNetLemmatizer() 

tag_map = defaultdict(lambda : wordnet.NOUN) 
tag_map['J'] = wordnet.ADJ 
tag_map['V'] = wordnet.VERB 
tag_map['R'] = wordnet.ADV 

def preprocess_with_lemmatization(document): 
  tokens = nltk.word_tokenize(document.lower()) 
  tagged_tokens = nltk.pos_tag(tokens) 
  lemmas = [] 
  for token, pos in tagged_tokens: 
    lemmatizer_tag = tag_map[pos[0]] 
    lemma = lemmatizer.lemmatize(token, pos=lemmatizer_tag) 
    lemmas.append(lemma) 
  return lemmas

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [28]:
from collections import defaultdict

five_star_counts = defaultdict(int)
one_star_counts = defaultdict(int)
total_counts = defaultdict(lambda: 10)

for text, score in zip(df['Text'], df['Score']):
  tokens = preprocess_with_lemmatization(text)
  for token in tokens:
    if score == 5:
      five_star_counts[token] += 1
    elif score == 1:
      one_star_counts[token] += 1
    total_counts[token] += 1

In [29]:
# TF-IDF
# Term
# Frequency
# Inverse
# Document
# Frequency

In [30]:
sorted(five_star_counts.keys(), key=lambda token: five_star_counts[token], reverse=True)[:5]

['.', 'be', 'the', ',', 'i']

In [37]:
five_star_normalized = {}

for token in five_star_counts:
  normalized_five_star_count = five_star_counts[token] / total_counts[token]
  five_star_normalized[token] = normalized_five_star_count

one_star_normalized = {}
for token in one_star_counts:
  normalized_one_star_count = one_star_counts[token] / total_counts[token]
  one_star_normalized[token] = normalized_one_star_count

good_tokens = sorted(five_star_normalized.keys(), key=lambda token: five_star_normalized[token], reverse=True)[:20]
bad_tokens = sorted(one_star_normalized.keys(), key=lambda token: one_star_normalized[token], reverse=True)[:20]


good_tokens, bad_tokens

(['highly',
  'dog',
  'great',
  'love',
  'delicious',
  'wonderful',
  'recommend',
  '--',
  'weight',
  'ever',
  'thank',
  'snack',
  'fast',
  'our',
  'best',
  'thanks',
  'perfect',
  'calorie',
  'she',
  '!'],
 ['alive',
  'frosting',
  'aware',
  'return',
  'awful',
  'throw',
  'horrible',
  'donut',
  'disappointed',
  'cacao',
  'garbage',
  'maltitol',
  'picture',
  'stale',
  'frost',
  'change',
  'label',
  'terrible',
  'cancel',
  'clearly'])

In [38]:
def predict_stars(document):
    # We'll start with "neutral"
    sentiment = 0
    # Our default preprocessing will be just splitting along spaces, but
    # we can pass in a custom preprocess script later if we want
    tokens = preprocess_with_lemmatization(document)

    # We loop through all the tokens in our document
    for token in tokens:
        # If the token is one of our "good" ones, we'll add 1 to the sentiment
        if token in good_tokens:
            sentiment += 1
        # If the token is one of our "bad" ones, we'll subtract 1 from the sentiment
        if token in bad_tokens:
            sentiment -= 1
    
    if sentiment > 1:
      return 5
    elif sentiment == 1:
      return 4
    elif sentiment == 0:
      return 3
    elif sentiment == -1:
      return 2
    else:
      return 1

df['Prediction'] = df['Text'].apply(predict_stars)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [39]:
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Prediction
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,5
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,2
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,5
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,3
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,5


In [40]:
correct_cases = sum(df['Prediction'] == df['Score'])
total_cases = len(df['Prediction'])
correct_cases / total_cases

0.5285

# Vectorization

In [42]:
corpus = [
          "You like dogs",
          "You like cats",
          "You love dogs",
          "You love cats",
]

vocab = set()

for doc in corpus :
  tokens = preprocess_with_lemmatization(doc)
  vocab.update(tokens)

vocab = list(vocab)
vocab.sort()
print(vocab)

['cat', 'dog', 'like', 'love', 'you']


In [48]:
# D: number of docs in our corpus
# T: maximum number of tokens in any document (T = 100)

word_to_index = {}
for i, word in enumerate(vocab):
  word_to_index[word] = i

vectors = []
for doc in corpus: # D
  tokens = preprocess_with_lemmatization(doc)
  vector = [0] * len(vocab)
  for token in tokens: # T
    index = word_to_index[token]
    vector[index] += 1
  vectors.append(vector)

# O(D * T) --> O(D * 100) --> O(D)

In [49]:
print(*vectors, sep="\n")

[0, 1, 1, 0, 1]
[1, 0, 1, 0, 1]
[0, 1, 0, 1, 1]
[1, 0, 0, 1, 1]


In [52]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(corpus)
print(vectors.toarray())

[[0 1 1 0 1]
 [1 0 1 0 1]
 [0 1 0 1 1]
 [1 0 0 1 1]]


In [69]:
df = raw_df[:10000]
training_df = df[:9000]
testing_df = df[9000:]

In [70]:
vectorizer = CountVectorizer()

training_vectors = vectorizer.fit_transform(training_df['Text'])

In [71]:
first_vector = training_vectors.toarray()[0]
print(first_vector)
print(first_vector.size)

[0 0 0 ... 0 0 0]
18141


In [72]:
testing_vectors = vectorizer.transform(testing_df['Text'])

first_testing_vector = testing_vectors.toarray()[0]
print(first_testing_vector)
print(first_testing_vector.size)

[0 0 0 ... 0 0 0]
18141


In [73]:
training_labels = training_df['Score']
testing_labels = testing_df['Score']

training_labels[:10]

0    5
1    1
2    4
3    2
4    5
5    4
6    5
7    5
8    5
9    5
Name: Score, dtype: int64

In [74]:
from sklearn.linear_model import LogisticRegression

lr_classifier = LogisticRegression()

lr_classifier.fit(training_vectors, training_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [75]:
lr_classifier.score(testing_vectors, testing_labels)

0.771