In [1]:
import fitz
import nltk
import pandas as pd
import torch
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForSequenceClassification, AutoTokenizer

nltk.download("punkt")

get_sentiment_of_sentence_list(
    sentences_list: list[str],
    result_df: pd.DataFrame,
    model: AutoModelForSequenceClassification,
    tokenizer: AutoTokenizer,
    start_index: int,
) -> pd.DataFrame:
    """Use model and tokenizer to extract sentiment of sentences from sentences_list in batch of 25 from start_index.

    Args:
        sentences_list (list[str]): List of sentences for which we are extracting sentiment.
        result_df (pd.DataFrame): Resulting dataframe that we will concatenate with df with new extracted sentiment.
        model (AutoModelForSequenceClassification): Finbert model.
        tokenizer (AutoTokenizer): Finbert tokenizer.
        start_index (int): Index from sentences_list, required so we can extract sentiment in batches.

    Returns:
        pd.DataFrame: result_df with new extracted sentiment concatenated.
    """
    batch_sentences = sentences_list[start_index : start_index + 25]
    model_input = tokenizer(
        batch_sentences, padding=True, truncation=True, return_tensors="pt"
    )
    model_output = model(**model_input)
    predictions = torch.nn.functional.softmax(model_output.logits, dim=-1)
    positive = predictions[:, 0].tolist()
    negative = predictions[:, 1].tolist()
    neutral = predictions[:, 2].tolist()
    table = {
        "sentence": batch_sentences,
        "Positive": positive,
        "Negative": negative,
        "Neutral": neutral,
    }
    df = pd.DataFrame(table)
    result_df = pd.concat([result_df, df], ignore_index=True)

    del predictions
    del model_output
    return result_df

SyntaxError: invalid syntax (3884913789.py, line 11)

In [2]:
# import libraries
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# download nltk corpus (first time only)
# nltk.download('all')



In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv')
df

Unnamed: 0,reviewText,Positive
0,This is a one of the best apps acording to a b...,1
1,This is a pretty good version of the game for ...,1
2,this is a really cool game. there are a bunch ...,1
3,"This is a silly game and can be frustrating, b...",1
4,This is a terrific game on any pad. Hrs of fun...,1
...,...,...
19995,this app is fricken stupid.it froze on the kin...,0
19996,Please add me!!!!! I need neighbors! Ginger101...,1
19997,love it! this game. is awesome. wish it had m...,1
19998,I love love love this app on my side of fashio...,1


In [3]:
# create preprocess_text function
def preprocess_text(text):

    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

In [5]:
df['reviewText'] = df['reviewText'].apply(preprocess_text)
df

Unnamed: 0,reviewText,Positive
0,one best apps acording bunch people agree bomb...,1
1,pretty good version game free . lot different ...,1
2,really cool game . bunch level find golden egg...,1
3,"silly game frustrating , lot fun definitely re...",1
4,terrific game pad . hr fun . grandkids love . ...,1
...,...,...
19995,app fricken stupid.it froze kindle wont allow ...,0
19996,please add ! ! ! ! ! need neighbor ! ginger101...,1
19997,love ! game . awesome . wish free stuff house ...,1
19998,love love love app side fashion story fight wo...,1


In [6]:
# initialize NLTK sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# create get_sentiment function
def get_sentiment(text):
    scores = analyzer.polarity_scores(text) # e.g. {'neg': 0.0, 'neu': 0.295, 'pos': 0.705, 'compound': 0.8012}
    sentiment = 1 if scores['pos'] > 0 else 0
    return sentiment

In [7]:
df['sentiment'] = df['reviewText'].apply(get_sentiment)
df

Unnamed: 0,reviewText,Positive,sentiment
0,one best apps acording bunch people agree bomb...,1,1
1,pretty good version game free . lot different ...,1,1
2,really cool game . bunch level find golden egg...,1,1
3,"silly game frustrating , lot fun definitely re...",1,1
4,terrific game pad . hr fun . grandkids love . ...,1,1
...,...,...,...
19995,app fricken stupid.it froze kindle wont allow ...,0,0
19996,please add ! ! ! ! ! need neighbor ! ginger101...,1,1
19997,love ! game . awesome . wish free stuff house ...,1,1
19998,love love love app side fashion story fight wo...,1,1


In [None]:
#### Roberta 
#Hugingface
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import pipeline
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import  TFDistilBertModel
from datasets import load_dataset,Dataset, DatasetDict

#Keras and Tensorflow
import keras
import tensorflow as tf
from keras import backend as K
from keras.preprocessing.text import one_hot,Tokenizer
#from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense , Flatten ,Embedding,Input
from keras.models import Model
from keras.callbacks import Callback

#Sklearn
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from kaggle_datasets import KaggleDatasets

#General utils
from string import punctuation
import json
import random
import os
from tqdm.notebook import tqdm
import time

#tensorflow sets
tf.get_logger().setLevel('ERROR')
# Set autograph verbosity to avoid unnecessary messages
tf.autograph.set_verbosity(0)
# Enable xla for speed up
tf.config.optimizer.set_jit(True)

#Matplot Style
plt.style.use('ggplot')

