In [2]:
import pandas as pd
# Load the Sentiment140 dataset as a data frame
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding = 'latin1', names=['polarity', 'id', 'date', 'query', 'user', 'text'])
# Print the first 5 rows of the data frame
print(df.head())
print(f"\n The shape data is:{df.shape}")

   polarity          id                          date     query  \
0         0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1         0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2         0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3         0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4         0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  

 The shape data is:(1600000, 6)


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   polarity  1600000 non-null  int64 
 1   id        1600000 non-null  int64 
 2   date      1600000 non-null  object
 3   query     1600000 non-null  object
 4   user      1600000 non-null  object
 5   text      1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [36]:
df['polarity'].value_counts()

0    800000
4    800000
Name: polarity, dtype: int64

In [33]:
df[df['polarity']==0]

Unnamed: 0,polarity,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
799995,0,2329205009,Thu Jun 25 10:28:28 PDT 2009,NO_QUERY,dandykim,Sick Spending my day laying in bed listening ...
799996,0,2329205038,Thu Jun 25 10:28:28 PDT 2009,NO_QUERY,bigenya,Gmail is down?
799997,0,2329205473,Thu Jun 25 10:28:30 PDT 2009,NO_QUERY,LeeLHoke,rest in peace Farrah! So sad
799998,0,2329205574,Thu Jun 25 10:28:30 PDT 2009,NO_QUERY,davidlmulder,@Eric_Urbane Sounds like a rival is flagging y...


### Preprocess the data: lowercasing, tokenization, removing stop words and punctuation. It's important to note, that I test two preprocessing options, with and without removing stop words and punctuation, stemming, in order to understant the impact on sentiment analysis. This is part one:

In [3]:
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd, os, pickle
import sklearn.feature_extraction.text as text

In [13]:
file_name_opnion_encoded  = 'opnion_encoded_text_ready.pickle'
file_name_vectorizer_pickle = 'vectorizer.pickle'

In [15]:
if not os.path.isfile(file_name_opnion_encoded):
    # Get the 'text' column from the data frame
    texts = df['text']

    # Preprocess the texts
    processed_texts = []
    for text in texts:
        # Lowercase the text
        text = text.lower()

        # Tokenize the text
        tokens = nltk.word_tokenize(text)

        # Remove stop words
        stop_words = nltk.corpus.stopwords.words('english')
        tokens = [token for token in tokens if token not in stop_words]

        # Remove punctuation
        punctuation = string.punctuation
        tokens = [token for token in tokens if token not in punctuation]

        # Stem or lemmatize the tokens
        stemmer = nltk.stem.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]

        # Join the tokens back into a single string
        processed_text = ' '.join(tokens)

        # Append the processed text to the list
        processed_texts.append(processed_text)

    # Use CountVectorizer to encode the texts
    vectorizer = CountVectorizer(ngram_range=(1, 2))
    encoded_texts = vectorizer.fit_transform(processed_texts)
    with open(file_name_vectorizer_pickle, 'w+b') as f:
        pickle.dump(vectorizer, f)
    with open(file_name_opnion_encoded, 'w+b') as f: 
        pickle.dump(encoded_texts,f, -1)  
else:
    with open(file_name_opnion_encoded, 'rb') as f:
        encoded_texts = pickle.load(f)
    with open(file_name_vectorizer_pickle, 'rb') as f:
        vectorizer = pickle.load(f)

In [16]:
import sklearn.linear_model as lm
from sklearn.model_selection import train_test_split
file_name_model  = 'model_logistic_regression.pickle'
# Get the labels for the text
if not os.path.isfile(file_name_model):
    labels = df['polarity']
    # Split the data into a training set and a test set
    features_train, features_test, labels_train, labels_test = train_test_split(encoded_texts, labels, test_size=0.2, random_state=42)
    # Train the logistic regression model
    model_logistic = lm.LogisticRegression()
    model_logistic.fit(features_train, labels_train)
    # Evaluate the model on the test set
    accuracy = model_logistic.score(features_test, labels_test)
    print('Accuracy:', accuracy)
else:
    with open(file_name_model, 'rb') as f:
        model_logistic = pickle.load(f)

Accuracy: 0.78943125


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
import nbformat.v4 as nbformat
from sklearn import metrics

# Make predictions on the test set
predictions = model_logistic.predict(features_test)

# Compute the precision, recall, and F1 score
precision = metrics.precision_score(labels_test, predictions, pos_label=4)
recall = metrics.recall_score(labels_test, predictions, pos_label=4)
f1 = metrics.f1_score(labels_test, predictions, pos_label=4)

# Print the results
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)

Precision: 0.7799105478941484
Recall: 0.8082875406526859
F1 score: 0.7938455325346029


In [18]:
text = ["fraud"]
# Transform the text using the same vectorizer
predict_text_train = vectorizer.transform(text)
prediction = model_logistic.predict(predict_text_train)
print('Prediction:', prediction)
if prediction == 4:
    print('positive')
# elif prediction == 2:
#     print('neutral')
# becouse it's logistic regresion it will be 1 or 0 (in this case it's 4 - positive or 0 negative)
elif prediction == 0:
    print('negative')

Prediction: [0]
negative


In [19]:
text = ["It's too good to be true"]
# Transform the text using the same vectorizer
predict_text_train = vectorizer.transform(text)
prediction = model_logistic.predict(predict_text_train)
print('Prediction:', prediction)
if prediction == 4:
    print('positive')
# elif prediction == 2:
#     print('neutral')
# no 2 in the data - and if there was 2, you should use model_logistic_regression_multi_class
elif prediction == 0:
    print('negative')

Prediction: [4]
positive


# ----------------------------------PART 2 - flair ---------------------------------------------------

In [77]:
!pip install transformers flair

Collecting flair
  Downloading flair-0.11.3-py3-none-any.whl (401 kB)
     ------------------------------------ 401.9/401.9 kB 309.4 kB/s eta 0:00:00
Collecting wikipedia-api
  Downloading Wikipedia_API-0.5.8-py3-none-any.whl (13 kB)
Collecting konoha<5.0.0,>=4.0.0
  Downloading konoha-4.6.5-py3-none-any.whl (20 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ------------------------------------ 981.5/981.5 kB 748.8 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting more-itertools
  Downloading more_itertools-9.0.0-py3-none-any.whl (52 kB)
     ---------------------------------------- 52.8/52.8 kB 2.7 MB/s eta 0:00:00
Collecting hyperopt>=0.2.7
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
     ---------------------------------------- 1.6/1.6 MB 751.3 kB/s eta 0:00:00
Collecting mpld3==0.3
  Downloading mpld3-0.3.tar.gz (788 kB)
     --------------------------------

In [34]:
# Data processing
import pandas as pd

# Hugging Face model
from transformers import pipeline

# Import flair pre-trained sentiment model
from flair.models import TextClassifier
fl_classifier = TextClassifier.load('en-sentiment')

# Import flair Sentence to process input text
from flair.data import Sentence

# Import accuracy_score to check performance
from sklearn.metrics import accuracy_score

2023-01-12 17:05:42,922 loading file C:\Users\Leon\.flair\models\sentiment-en-mix-distillbert_4.pt


In [37]:
# Define a function to get Flair sentiment prediction score
def score_flair(text):
  # Flair tokenization
  sentence = Sentence(text)
  # Predict sentiment
  fl_classifier.predict(sentence)
  # Extract the score
  score = sentence.labels[0].score
  # Extract the predicted label
  value = sentence.labels[0].value
  # Return the score and the predicted label
  return score, value

In [38]:
score_flair("scam")

(0.9995369911193848, 'NEGATIVE')

In [39]:
score_flair("It's too good to be true")

(0.9945777654647827, 'POSITIVE')

After the function is defined, we can apply the function to each review in the dataset and create the predicted sentiments.

From the score distribution, we can see that the minimum score is 0.53 and the average score is 0.99, indicating that the model is very confident about the sentiment predictions.

In [82]:
data = df
part_of_data = data[1:10000]

In [83]:
# Get sentiment score for each review
part_of_data['scores_flair'] = part_of_data['text'].apply(lambda s: score_flair(s)[0])

# Predict sentiment label for each review
part_of_data['scores_flair'] = part_of_data['text'].apply(lambda s: score_flair(s)[1])

# Check the distribution of the score
part_of_data['scores_flair'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  part_of_data['scores_flair'] = part_of_data['text'].apply(lambda s: score_flair(s)[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  part_of_data['scores_flair'] = part_of_data['text'].apply(lambda s: score_flair(s)[1])


count         9999
unique           2
top       NEGATIVE
freq          7192
Name: scores_flair, dtype: object

In [84]:
part_of_data

Unnamed: 0,polarity,id,date,query,user,text,scores_flair
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,NEGATIVE
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,NEGATIVE
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,NEGATIVE
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",NEGATIVE
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew,NEGATIVE
...,...,...,...,...,...,...,...
9995,0,1550729779,Sat Apr 18 07:05:12 PDT 2009,NO_QUERY,thedoyleswife,Aww that's sad,NEGATIVE
9996,0,1550730633,Sat Apr 18 07:05:23 PDT 2009,NO_QUERY,gia_revenge,stupid dvds stuffing up the good bits in jaws.,NEGATIVE
9997,0,1550731192,Sat Apr 18 07:05:29 PDT 2009,NO_QUERY,matmurray,@Dandy_Sephy No. Only close friends and family...,NEGATIVE
9998,0,1550731281,Sat Apr 18 07:05:30 PDT 2009,NO_QUERY,lexabuckets,CRAP! After looking when I last tweeted... WHY...,NEGATIVE


In [85]:
part_of_data['scores_flair'].value_counts()

NEGATIVE    7192
POSITIVE    2807
Name: scores_flair, dtype: int64

2807 sentences labbeled as "positive" by flair, despite that they are all tagget manually as "negative". Now I want to check what will happen if I use My trained logistic regression model. Will it also labbel them as "positive"???  How much?

In [87]:
texts = part_of_data['text']

In [88]:
texts

1       is upset that he can't update his Facebook by ...
2       @Kenichan I dived many times for the ball. Man...
3         my whole body feels itchy and like its on fire 
4       @nationwideclass no, it's not behaving at all....
5                           @Kwesidei not the whole crew 
                              ...                        
9995                                      Aww that's sad 
9996      stupid dvds stuffing up the good bits in jaws. 
9997    @Dandy_Sephy No. Only close friends and family...
9998    CRAP! After looking when I last tweeted... WHY...
9999                            Its Another Rainboot day 
Name: text, Length: 9999, dtype: object

In [89]:
list_of_answers_by_log_regression = []
for i in texts:
    text =[i]
    # Transform the text using the same vectorizer
    predict_text_train = vectorizer.transform(text)
    prediction = model_logistic.predict(predict_text_train)
    list_of_answers_by_log_regression.append(int(prediction))

In [90]:
print(f" POSITIVE: {list_of_answers_by_log_regression.count(4)} NEGATIVE: {list_of_answers_by_log_regression.count(0)}")

 POSITIVE: 3876 NEGATIVE: 6123


It looks like Flair is much better than logistic regression: 267-357 = 90, 26 percent vs 36 percent.  
Next Step I tried to do that on 10000 examples - Flair take much more time, but from 10000 only 2807 (28 percent) mistakes when by logistic regresion, there is 3876 mistakes (39 percent) ~~~~~ 10 percent better. 

# ------------Part 3 Hugging Face Zero-shot Sentiment Prediction------------------

In [92]:
# Define pipeline
classifier = pipeline(task="zero-shot-classification", 
                      model="facebook/bart-large-mnli",
                      device=0) 

In [94]:
# Put reviews in a list
sequences = part_of_data['text'].to_list()

# Define the candidate labels 
candidate_labels = ["positive", "negative"]

# Set the hyppothesis template
hypothesis_template = "The sentiment of this review is {}."

# Prediction results
hf_prediction = classifier(sequences, candidate_labels, hypothesis_template=hypothesis_template)

# Save the output as a dataframe
hf_prediction = pd.DataFrame(hf_prediction)

# Take a look at the data
hf_prediction.head()

Unnamed: 0,sequence,labels,scores
0,is upset that he can't update his Facebook by ...,"[negative, positive]","[0.9889948964118958, 0.01100502721965313]"
1,@Kenichan I dived many times for the ball. Man...,"[negative, positive]","[0.5754268169403076, 0.42457315325737]"
2,my whole body feels itchy and like its on fire,"[negative, positive]","[0.9792972803115845, 0.020702756941318512]"
3,"@nationwideclass no, it's not behaving at all....","[negative, positive]","[0.9944276213645935, 0.005572372581809759]"
4,@Kwesidei not the whole crew,"[negative, positive]","[0.682918906211853, 0.31708112359046936]"


In [95]:
hf_prediction['hf_prediction'] = hf_prediction['labels'].apply(lambda x: x[0])

In [97]:
hf_prediction['hf_prediction'].value_counts()

negative    8083
positive    1916
Name: hf_prediction, dtype: int64

As we can see Hugging Face Zero-shot Sentiment is doing much better: only 19.1 percents are mistakes. 

In [117]:
sequences = ["It's too good to be true"]

# Define the candidate labels 
candidate_labels = ["positive", "negative"]

# Set the hyppothesis template
hypothesis_template = "The sentiment of this review is {}."

# Prediction results
hf_prediction = classifier(sequences, candidate_labels, hypothesis_template=hypothesis_template)

In [118]:
hf_prediction 

[{'sequence': "It's too good to be true",
  'labels': ['positive', 'negative'],
  'scores': [0.8924731016159058, 0.10752695053815842]}]