In [1]:
!pip install transformers
!pip install fasttext

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m83.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https

In [1]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd 
import tensorflow as tf 

In [2]:
# Load the pre-trained "roberta-base" tokenizer and TensorFlow model.
# `tokenizer` is read as a notebook-level global inside make_predictions.
# NOTE(review): `model` is not referenced by any other cell visible here
# (make_predictions receives its own `model` argument) — confirm it is needed.
from transformers import RobertaTokenizer, TFRobertaModel
with tf.device('/device:GPU:0'):
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    model = TFRobertaModel.from_pretrained("roberta-base")

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [3]:
import os
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas and deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")
from tqdm import tqdm, tqdm_notebook
from tqdm.notebook import tqdm as notebook_tqdm
# Register `progress_apply` on pandas objects with the notebook progress bar.
# `pandas()` is called on the class itself: instantiating `tqdm_notebook()`
# just to reach it goes through a deprecated wrapper and leaves an empty
# "0it" bar in the cell output.
notebook_tqdm.pandas()
import fasttext
# Show the working directory; the fastText model file is looked up relative to it.
os.getcwd()

0it [00:00, ?it/s]

'/home/ec2-user/SageMaker/Kartik_Amazon_Thesis/time-series-analysis-all-data'

In [4]:
# Read the all-reviews parquet file into a PyArrow table.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
all_reviews_parquet =  pq.read_table('/home/ec2-user/SageMaker/master_df_all_reviews.parquet')
# Convert the PyArrow table to a pandas DataFrame; later cells clean, label
# and filter `major_df`.
major_df = all_reviews_parquet.to_pandas()

In [5]:
def prepare_data(input_text, tokenizer):
    """Tokenize ``input_text`` and package the result as float64 tensors.

    The text is encoded via ``tokenizer.encode_plus`` with special tokens,
    ``max_length=256``, truncation and ``'max_length'`` padding, requesting
    TensorFlow tensors.

    Args:
        input_text: Text handed to ``tokenizer.encode_plus``.
        tokenizer: Object providing ``encode_plus``.

    Returns:
        dict with the keys ``'input_ids'`` and ``'attention_mask'``, each
        cast to ``tf.float64``.
    """
    encode_options = dict(
        max_length=256,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf',
    )
    with tf.device('/device:GPU:0'):
        encoded = tokenizer.encode_plus(input_text, **encode_options)
        # Both tensors are cast to float64 before being handed to the model.
        ids_as_float = tf.cast(encoded.input_ids, tf.float64)
        mask_as_float = tf.cast(encoded.attention_mask, tf.float64)
    return {'input_ids': ids_as_float, 'attention_mask': mask_as_float}

def make_predictions(model, input_text, threshold, label_list=None):
    """Score one text with ``model`` and map the score to a label.

    Args:
        model: Object exposing a Keras-style ``predict``; it is called with
            the dict built by ``prepare_data``.
        input_text: Text to classify.
        threshold: Score above which the first label is returned.
        label_list: Two-item sequence ``[label_if_above, label_otherwise]``.
            Defaults to ``[1, 0]`` when omitted.

    Returns:
        ``label_list[0]`` if the first prediction is greater than
        ``threshold``, otherwise ``label_list[1]``.
    """
    if label_list is None:
        # Indexing None would raise TypeError; fall back to binary labels.
        label_list = [1, 0]
    with tf.device('/device:GPU:0'):
        # Uses the notebook-level `tokenizer`.
        processed_data = prepare_data(input_text, tokenizer)
        # verbose=0: this function is applied once per row, so per-call
        # progress output would flood the notebook.
        probs = model.predict(processed_data, verbose=0)
        return label_list[0] if probs[0] > threshold else label_list[1]

In [6]:
# List the entries under best_models (nothing is loaded in this cell).
os.listdir('/home/ec2-user/SageMaker/best_models')

['3500_augmented_disagreement_with_ratings_model',
 'zero_star_general_fnl',
 'wrong_buying_v1',
 '3000k_augmented_rating_management_explicit_model',
 'read_reviews_v1']

In [7]:
# Display one review body (positional row 200) to eyeball the text.
major_df.review_body.iloc[200]

"\nLove this book.love it's one sided.plus love that the pages are black on the back side of each page.love all of his books.very reasonable price.too\n"

In [8]:
# Pre-process major_df: collapse every whitespace run (newlines, tabs,
# repeated spaces) into one space and trim both ends.
# `Series.replace("\s+", " ")` only swaps cells whose entire value equals the
# literal text "\s+", so it never collapsed anything; the regex-aware
# `.str.replace` is what performs the substitution inside each string.
major_df['review_body'] = (
    major_df['review_body']
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

  0%|          | 0/18945515 [00:00<?, ?it/s]

In [9]:
class languate_detection_fasttext():
    """Thin wrapper around a fastText language-identification model."""

    def __init__(self, model_path=None):
        """Load the fastText model.

        Args:
            model_path: Path of the model file. When omitted, ``lid.176.bin``
                in the current working directory is used.
        """
        if model_path is None:
            model_path = os.path.join(os.getcwd(), "lid.176.bin")
        self.model = fasttext.load_model(model_path)

    def detect_language(self, text):
        """Return the single best language match for ``text``.

        Args:
            text: String to classify.

        Returns:
            Tuple ``(label, score)``: the top label with its ``__label__``
            prefix removed, and the score the model reported for it.
        """
        # fastText's predict() handles one line at a time and raises on an
        # embedded newline, so flatten the text first.
        single_line = text.replace("\n", " ")
        # k=1 -> only the top-ranked language is requested.
        labels, scores = self.model.predict(single_line, k=1)
        return labels[0].replace('__label__', ''), scores[0]

In [10]:
# Build the fastText wrapper, then tag every review with its top-ranked
# language label.
language_detection = languate_detection_fasttext()


def _top_language(review_text):
    """Return only the label from ``detect_language``'s (label, score) pair."""
    label, _score = language_detection.detect_language(review_text)
    return label


major_df['language'] = major_df['review_body'].progress_apply(_top_language)



  0%|          | 0/18945515 [00:00<?, ?it/s]

In [11]:
%time
# choosing the english language only: 
major_df_en= major_df[major_df['language']=='en']

CPU times: user 7 µs, sys: 1e+03 ns, total: 8 µs
Wall time: 15.3 µs


In [12]:
# Run the trained RoBERTa models on all samples.
# First classifier: "disagreement with ratings" — load its saved model from disk.
# NOTE(review): absolute, machine-specific path — consider a configurable models directory.
disagreement_model = tf.keras.models.load_model('/home/ec2-user/SageMaker/best_models/3500_augmented_disagreement_with_ratings_model')

In [None]:
# Create a PyArrow table from the English-only reviews. The output file is
# named "*_en", so the filtered frame is written, not the full `major_df`.
table = pa.Table.from_pandas(major_df_en)

# Write the English-only reviews to a Parquet file
pq.write_table(table, 'major_all_reviews_en.parquet')

In [None]:
# Run the disagreement model on every English review.
# label_list maps the outcome to a label: 1 when the score is above the
# 0.5 threshold passed below, otherwise 0.
# NOTE(review): make_predictions calls model.predict once per row; the captured
# progress bar shows 17,701,242 rows at 0%, so batching the inference is
# probably required for this cell to finish — TODO confirm.
label_list= [1, 0]
major_df_en['disagreement_with_ratings']= major_df_en.review_body.progress_apply(lambda x:
                                                                                 make_predictions(
                                                                                     disagreement_model, x, 0.5,
                                                                                     label_list))

  0%|          | 0/17701242 [00:00<?, ?it/s]





























































































































































































































































































































































































































































































































































































































































































































































































































































































































In [None]:
# Completion marker — presumably signals that the long-running cell above has finished.
print('done')

In [38]:
# Spot-check the disagreement classifier on a single text.
# NOTE(review): `tester` is not defined in any cell visible here — define it
# (e.g. a sample review string) before this call so the notebook survives
# Restart & Run All.
make_predictions(disagreement_model, tester, 0.5, label_list)



0