# Purpose: 

Test the office-generated models (loaded from `models_office_v1`) on a general sample population, and use the results to make decisions.

In [1]:
# install transformers
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [2]:
# load: 
import os 
import json
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import time
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

In [29]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
# Load the pre-trained `roberta-base` tokenizer and TF encoder.
# `tokenizer` is later read as a global by `make_predictions`; `model` is not
# referenced again in the visible part of this notebook.
from transformers import RobertaTokenizer, TFRobertaModel
with tf.device('/device:GPU:0'):
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    model = TFRobertaModel.from_pretrained("roberta-base")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/657M [00:00<?, ?B/s]

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [5]:
 # Show the files sitting next to this notebook (os.listdir defaults to the cwd).
 os.listdir()

['000_general_products_model_testing.ipynb',
 '.ipynb_checkpoints',
 'wish_I_read_reviews_gen_1k.csv',
 'rating_managment_explicit_gen_1k.csv',
 'disagreement_with_ratings_gen_1k.csv',
 'wrong_buying_gen_1k.csv',
 'zero_star_gen_1k.csv']

In [6]:
# Read the five labelled evaluation CSVs, one dataframe per classifier under test.
(
    wish_I_read_reviews_df,
    rating_management_explicit_df,
    disagreement_with_ratings_df,
    wrong_buying_df,
    zero_stars_df,
) = map(
    pd.read_csv,
    (
        'wish_I_read_reviews_gen_1k.csv',
        'rating_managment_explicit_gen_1k.csv',
        'disagreement_with_ratings_gen_1k.csv',
        'wrong_buying_gen_1k.csv',
        'zero_star_gen_1k.csv',
    ),
)

In [8]:
# Class balance of the "wish I read reviews" target flag.
wish_I_read_reviews_df['wish_I_read_reviews_ohe'].value_counts()

0    741
1     26
Name: wish_I_read_reviews_ohe, dtype: int64

In [12]:
# Class balance of the "rating management (explicit)" target flag.
rating_management_explicit_df['rating_managment_explicit_ohe'].value_counts()

0    755
1     12
Name: rating_managment_explicit_ohe, dtype: int64

In [13]:
# Class balance of the "disagreement with ratings" target flag.
disagreement_with_ratings_df['disagreement_with_ratings_ohe'].value_counts()

0    711
1     56
Name: disagreement_with_ratings_ohe, dtype: int64

In [15]:
# Class balance of the "wrong buying" target flag.
wrong_buying_df['wrong_buying_ohe'].value_counts()

0    708
1     59
Name: wrong_buying_ohe, dtype: int64

In [18]:
# Class balance of the "zero star" target flag.
zero_stars_df['zero_star_ohe'].value_counts()

0    690
1     77
Name: zero_star_ohe, dtype: int64

In [21]:
def prepare_data(input_text, tokenizer, max_length=256):
    """Tokenize one text into the tensor dict passed to ``model.predict``.

    Args:
        input_text: Text to encode; handed unchanged to
            ``tokenizer.encode_plus``.
        tokenizer: Object exposing ``encode_plus`` (this notebook passes the
            ``roberta-base`` ``RobertaTokenizer``).
        max_length: Length the encoded sequence is truncated / padded to.
            Defaults to 256, the value that used to be hard-coded here.

    Returns:
        dict with the keys ``'input_ids'`` and ``'attention_mask'``, each
        holding the matching tokenizer output cast to ``tf.float64``.
    """
    with tf.device('/device:GPU:0'):
        encoded = tokenizer.encode_plus(
            input_text,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf',
        )
        # NOTE(review): float64 is an unusual dtype for token ids; the cast is
        # kept because the saved models presumably expect it - confirm before
        # changing.
        return {
            'input_ids': tf.cast(encoded.input_ids, tf.float64),
            'attention_mask': tf.cast(encoded.attention_mask, tf.float64),
        }

def make_predictions(model, input_text, threshold, label_list=None):
    """Classify one text with ``model`` and map the score to a label.

    The text is encoded with ``prepare_data`` using the notebook-level
    ``tokenizer`` global, so that global must exist before this is called.

    Args:
        model: Object exposing ``predict``; called with the dict returned by
            ``prepare_data``.
        input_text: Text to classify.
        threshold: Cut-off compared against ``probs[0]``.
        label_list: Two-item sequence ``[above_threshold, otherwise]``.
            Defaults to ``[1, 0]`` when omitted.

    Returns:
        ``label_list[0]`` when ``probs[0] > threshold``, else
        ``label_list[1]``.
    """
    if label_list is None:
        # The previous default (None) failed with a TypeError on
        # label_list[0]; fall back to the [1, 0] pair used by every call in
        # this notebook.
        label_list = [1, 0]
    with tf.device('/device:GPU:0'):
        processed_data = prepare_data(input_text, tokenizer)
        # assumes predict returns a single score for the single input - TODO confirm
        probs = model.predict(processed_data)
        if probs[0] > threshold:
            return label_list[0]
        return label_list[1]

In [24]:
# Show which saved model directories are available under models_office_v1.
os.listdir(os.path.join('/home/ec2-user', 'SageMaker', 'models_office_v1'))

['rating_managment_explicit_v1',
 'disagreement_with_ratings_v1',
 'wrong_buying_v1',
 'zero_star_v1',
 'read_reviews_v1']

In [26]:
# Load the five saved classifiers from the shared models directory.
model_path = os.path.join('/home/ec2-user/SageMaker', 'models_office_v1')
(
    disagreement_model,
    rating_management_model,
    zero_stars_model,
    wrong_buying_model,
    read_reviews_model,
) = (
    tf.keras.models.load_model(os.path.join(model_path, saved_dir))
    for saved_dir in (
        'disagreement_with_ratings_v1',
        'rating_managment_explicit_v1',
        'zero_star_v1',
        'wrong_buying_v1',
        'read_reviews_v1',
    )
)

wish_I_read_reviews_df
rating_management_explicit_df 
disagreement_with_ratings_df 
wrong_buying_df
zero_stars_df

# disagreement testing 

In [32]:

# Score every review with the disagreement model at a 0.5 threshold:
# 1 when the score exceeds the threshold, otherwise 0.
label_list = [1, 0]
disagreement_with_ratings_df['result'] = disagreement_with_ratings_df['full_review'].map(
    lambda review: make_predictions(disagreement_model, review, 0.5, label_list)
)







In [33]:
# Per-class precision / recall / F1 for the disagreement model.
print(
    classification_report(
        y_true=disagreement_with_ratings_df['disagreement_with_ratings_ohe'],
        y_pred=disagreement_with_ratings_df['result'],
    )
)

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       711
           1       0.61      0.48      0.54        56

    accuracy                           0.94       767
   macro avg       0.79      0.73      0.75       767
weighted avg       0.93      0.94      0.94       767



# Rating Management testing

In [34]:
# Score every review with the rating-management model at a 0.5 threshold:
# 1 when the score exceeds the threshold, otherwise 0.
label_list = [1, 0]
rating_management_explicit_df['result'] = rating_management_explicit_df['full_review'].map(
    lambda review: make_predictions(rating_management_model, review, 0.5, label_list)
)







In [35]:
# Per-class precision / recall / F1 for the rating-management model.
print(
    classification_report(
        y_true=rating_management_explicit_df['rating_managment_explicit_ohe'],
        y_pred=rating_management_explicit_df['result'],
    )
)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       755
           1       0.50      0.08      0.14        12

    accuracy                           0.98       767
   macro avg       0.74      0.54      0.57       767
weighted avg       0.98      0.98      0.98       767



# zero_stars

In [36]:
# Score every review with the zero-star model at a 0.5 threshold:
# 1 when the score exceeds the threshold, otherwise 0.
label_list = [1, 0]
zero_stars_df['result'] = zero_stars_df['full_review'].map(
    lambda review: make_predictions(zero_stars_model, review, 0.5, label_list)
)







In [37]:
# Per-class precision / recall / F1 for the zero-star model.
print(
    classification_report(
        y_true=zero_stars_df['zero_star_ohe'],
        y_pred=zero_stars_df['result'],
    )
)

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       690
           1       0.78      0.94      0.85        77

    accuracy                           0.97       767
   macro avg       0.89      0.95      0.92       767
weighted avg       0.97      0.97      0.97       767



# Wrong Buying

In [39]:
# Score every review with the wrong-buying model at a 0.5 threshold:
# 1 when the score exceeds the threshold, otherwise 0.
label_list = [1, 0]
wrong_buying_df['result'] = wrong_buying_df['full_review'].map(
    lambda review: make_predictions(wrong_buying_model, review, 0.5, label_list)
)







In [40]:
# Per-class precision / recall / F1 for the wrong-buying model.
print(
    classification_report(
        y_true=wrong_buying_df['wrong_buying_ohe'],
        y_pred=wrong_buying_df['result'],
    )
)

              precision    recall  f1-score   support

           0       0.96      0.99      0.97       708
           1       0.78      0.47      0.59        59

    accuracy                           0.95       767
   macro avg       0.87      0.73      0.78       767
weighted avg       0.94      0.95      0.94       767



# read_reviews

In [41]:
# Score every review with the read-reviews model at a 0.5 threshold:
# 1 when the score exceeds the threshold, otherwise 0.
label_list = [1, 0]
wish_I_read_reviews_df['result'] = wish_I_read_reviews_df['full_review'].map(
    lambda review: make_predictions(read_reviews_model, review, 0.5, label_list)
)







In [42]:
# Per-class precision / recall / F1 for the read-reviews model.
print(
    classification_report(
        y_true=wish_I_read_reviews_df['wish_I_read_reviews_ohe'],
        y_pred=wish_I_read_reviews_df['result'],
    )
)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       741
           1       0.69      0.77      0.73        26

    accuracy                           0.98       767
   macro avg       0.84      0.88      0.86       767
weighted avg       0.98      0.98      0.98       767



In [45]:
# Inspect the rows labelled positive, to compare the label with `result`.
wish_I_read_reviews_df.loc[wish_I_read_reviews_df['wish_I_read_reviews_ohe'].eq(1)]

Unnamed: 0,review_id,full_review,wish_I_read_reviews,wish_I_read_reviews_ohe,result
22,R11AZFYTDFF7TB,Nice book for those who are really into elimin...,1,1,0
78,R2Z6BJ10S2ZIA,I should have read the reviews. Easy to instal...,1,1,1
107,RM93DCENH8QHZ,I am in shock and compelled to write this... I...,1,1,0
110,R2QTNJP1D9E5GC,I should have listen to some of the other revi...,1,1,1
157,R22PY091IAZS7B,The cheap plastic doesn't create the needed su...,1,1,1
162,R2GSPGSEMG6O8H,Too pricey for what it actually is. Should've ...,1,1,1
165,R36AXBD32VAY37,I bought this unit as it was highly reccommend...,1,1,1
290,RQ8WL9HKWGQEC,Spent the last hour trying to enable this with...,1,1,1
313,R1T77W745WWVGC,I should have read the reviews on this CD. Wha...,1,1,1
347,R3OCRU1VMRUJX8,I've been reading James Patterson's books for ...,1,1,1
