In [1]:
%matplotlib inline

In [88]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import classification_report

# Amazon Customer Reviews

Amazon customer reviews of products. Sentiment analysis. Label # 1 = 1 or 2, label # 2 = 4 or 5.

Dataset from [here](https://www.kaggle.com/datasets/bittlingmayer/amazonreviews?resource=download).

Steps

## Read and preprocess data

In [71]:
def read_amazon_reviews():
    filepath = Path().home() / r'Desktop\datasets\amazon_reivews\test.ft.txt'
    with open(filepath, encoding='utf8') as f:
        lines = f.readlines()
    records = [line.split(' ', 1) for line in lines[:1_000]]    
    amazon_reviews = pd.DataFrame().from_records(records, columns=['sentiment', 'review'])
    
    return amazon_reviews

amazon = read_amazon_reviews()
amazon

Unnamed: 0,sentiment,review
0,__label__2,Great CD: My lovely Pat has one of the GREAT v...
1,__label__2,One of the best game music soundtracks - for a...
2,__label__1,Batteries died within a year ...: I bought thi...
3,__label__2,"works fine, but Maha Energy is better: Check o..."
4,__label__2,Great for the non-audiophile: Reviewed quite a...
...,...,...
995,__label__1,Borinmg & dumb: A waste of time.Glory for old ...
996,__label__2,Best film of the year: One of the best films e...
997,__label__2,See this movie just for Ian McKellen's perform...
998,__label__2,best screenplays have more stability: One of t...


In [72]:
def tweak_amazon_reviews(df):
    return (df
            .assign(
                sentiment=lambda df_: df_.sentiment.replace({'__label__1': False, '__label__2': True}),
                review=lambda df_: df.review.str.strip())
            )

tweak_amazon_reviews(amazon)

Unnamed: 0,sentiment,review
0,True,Great CD: My lovely Pat has one of the GREAT v...
1,True,One of the best game music soundtracks - for a...
2,False,Batteries died within a year ...: I bought thi...
3,True,"works fine, but Maha Energy is better: Check o..."
4,True,Great for the non-audiophile: Reviewed quite a...
...,...,...
995,False,Borinmg & dumb: A waste of time.Glory for old ...
996,True,Best film of the year: One of the best films e...
997,True,See this movie just for Ian McKellen's perform...
998,True,best screenplays have more stability: One of t...


In [75]:
def split_amazon_reviews(df):
    return train_test_split(df.review, df.sentiment, test_size=0.1)

In [75]:
attributes_train, attributes_test, labels_train, labels_test = (amazon
                                                                .pipe(tweak_amazon_reviews)
                                                                .pipe(split_amazon_reviews)
                                                               )
attributes_train.shape, attributes_test.shape, labels_train.shape, labels_test.shape

((900,), (100,), (900,), (100,))

## Vectorize

### Bag of words

In [41]:
stop_words = nltk.corpus.stopwords.words('english')
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [42]:
vectorizer = CountVectorizer(stop_words=stop_words)

vectorizer.fit(attributes_train)
vectorizer.transform(attributes_train)

<900x8758 sparse matrix of type '<class 'numpy.int64'>'
	with 31834 stored elements in Compressed Sparse Row format>

## Heading

In [43]:
vectorizer = TfidfVectorizer(stop_words=stop_words)
vectorizer.fit_transform(attributes_train)

<900x8758 sparse matrix of type '<class 'numpy.float64'>'
	with 31834 stored elements in Compressed Sparse Row format>

## Heading

In [58]:
process_attributes = make_pipeline(
    TfidfVectorizer(stop_words=stop_words)
)

process_attributes.fit(attributes_train)

## Heading

In [60]:
attributes_train_processed = process_attributes.transform(attributes_train)
attributes_test_processed = process_attributes.transform(attributes_test)

In [61]:
model = MultinomialNB()
model.fit(attributes_train_processed, labels_train)

In [62]:
# accuracy
model.score(attributes_test_processed, labels_test)

0.75

In [68]:
print('---TRAINING SET---\n', classification_report(labels_train, model.predict(attributes_train_processed)))
print('---TEST SET---\n', classification_report(labels_test, model.predict(attributes_test_processed)))

---TRAINING SET---
               precision    recall  f1-score   support

       False       0.99      0.99      0.99       449
        True       0.99      0.99      0.99       451

    accuracy                           0.99       900
   macro avg       0.99      0.99      0.99       900
weighted avg       0.99      0.99      0.99       900

---TEST SET---
               precision    recall  f1-score   support

       False       0.75      0.73      0.74        49
        True       0.75      0.76      0.76        51

    accuracy                           0.75       100
   macro avg       0.75      0.75      0.75       100
weighted avg       0.75      0.75      0.75       100



## Heading

In [81]:
vectorizer2 = TfidfVectorizer(stop_words=stop_words, min_df=2, max_df=0.95)
vectorizer2.fit(attributes_train)

In [83]:
attributes_train_processed = vectorizer2.transform(attributes_train)
attributes_test_processed = vectorizer2.transform(attributes_test)

# drastic descres in the number of features
attributes_train_processed

<900x3412 sparse matrix of type '<class 'numpy.float64'>'
	with 26069 stored elements in Compressed Sparse Row format>

## Heading

In [84]:
model.fit(attributes_train_processed, labels_train)

print('---TRAINING SET---\n', classification_report(labels_train, model.predict(attributes_train_processed)))
print('---TEST SET---\n', classification_report(labels_test, model.predict(attributes_test_processed)))

---TRAINING SET---
               precision    recall  f1-score   support

       False       0.98      0.97      0.98       444
        True       0.97      0.98      0.98       456

    accuracy                           0.98       900
   macro avg       0.98      0.98      0.98       900
weighted avg       0.98      0.98      0.98       900

---TEST SET---
               precision    recall  f1-score   support

       False       0.90      0.81      0.85        54
        True       0.80      0.89      0.85        46

    accuracy                           0.85       100
   macro avg       0.85      0.85      0.85       100
weighted avg       0.85      0.85      0.85       100



## Heading

In [87]:
linear_svc = LinearSVC()

linear_svc.fit(attributes_train_processed, labels_train)

train_predictions = model.predict(attributes_train_processed)
test_predictions = model.predict(attributes_test_processed)
print('---TRAINING SET---\n', classification_report(labels_train, train_predictions))
print('---TEST SET---\n', classification_report(labels_test, test_predictions))

---TRAINING SET---
               precision    recall  f1-score   support

       False       0.98      0.97      0.98       444
        True       0.97      0.98      0.98       456

    accuracy                           0.98       900
   macro avg       0.98      0.98      0.98       900
weighted avg       0.98      0.98      0.98       900

---TEST SET---
               precision    recall  f1-score   support

       False       0.90      0.81      0.85        54
        True       0.80      0.89      0.85        46

    accuracy                           0.85       100
   macro avg       0.85      0.85      0.85       100
weighted avg       0.85      0.85      0.85       100



## Heading

In [90]:
sgd = SGDClassifier(learning_rate='optimal', early_stopping=True)

sgd.fit(attributes_train_processed, labels_train)

train_predictions = sgd.predict(attributes_train_processed)
test_predictions = sgd.predict(attributes_test_processed)
print('---TRAINING SET---\n', classification_report(labels_train, train_predictions))
print('---TEST SET---\n', classification_report(labels_test, test_predictions))

---TRAINING SET---
               precision    recall  f1-score   support

       False       0.98      0.99      0.98       444
        True       0.99      0.98      0.98       456

    accuracy                           0.98       900
   macro avg       0.98      0.98      0.98       900
weighted avg       0.98      0.98      0.98       900

---TEST SET---
               precision    recall  f1-score   support

       False       0.87      0.76      0.81        54
        True       0.75      0.87      0.81        46

    accuracy                           0.81       100
   macro avg       0.81      0.81      0.81       100
weighted avg       0.82      0.81      0.81       100



## Heading

In [120]:
sgd2 = SGDClassifier()

sgd2.partial_fit(attributes_train_processed.toarray()[:20], labels_train.iloc[:20], classes=[True, False])

indeces = sgd2.coef_.nonzero()[1][:20]
coefs_sample = sgd2.coef_[0][indeces]
coefs_sample

array([-2.16039121, -2.44448132, -2.37809493, -2.28509055, -2.27279432,
        1.23330635,  1.92638629,  1.39991925,  1.17428486, -1.08260938,
        0.86403707,  1.22027852, -1.77672271,  2.74631733, -3.73643866,
       -1.4882854 , -0.55972211, -1.4882854 ,  2.3504956 ,  1.25540851])

In [121]:
sgd2.partial_fit(attributes_train_processed.toarray()[20:40], labels_train.iloc[20:40], classes=[True, False])

indeces = sgd2.coef_.nonzero()[1][:20]
coefs_sample = sgd2.coef_[0][indeces]
coefs_sample

array([-3.16007142, -2.39742682,  3.82157689, -3.68592292,  1.42256799,
       -2.24110421, -2.22904467,  1.20956609,  1.5772515 ,  1.70732765,
        1.88930474,  1.37297181, -0.86028361,  1.91078844,  1.15168073,
       -1.06176993,  0.84740498,  1.19678904, -2.74349507,  2.69345271])

## Heading

In [135]:
# split into 16 mini batches
mini_batches = np.array_split(attributes_train_processed.toarray(), len(attributes_train_processed.toarray()) // 16)
mini_batches_labels = np.array_split(labels_train, len(labels_train) // 16)

len(mini_batches), len(mini_batches_labels)

  sub_arys.append(_nx.swapaxes(sary[st:end], axis, 0))


(56, 56)

In [136]:
for i in range(16):
    # sgd2.partial_fit(mini_batches[i], mini_batches_labels[i])
    pass

## Heading

## Heading

In [142]:
import pickle

with open('sgd2.pickle', 'wb') as f:
    pickle.dump(sgd2, f)

## Heading

## Heading

## Heading

## Heading