I will use a labelled dataset to train a machine learning model, which can then be used to make sentiment predictions for new text.

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report

In [None]:
!pip install simpletransformers
from simpletransformers.classification import ClassificationModel

import pandas as pd

from sklearn.model_selection import train_test_split
import random
import numpy as np
import torch
from sklearn.model_selection import KFold

import logging
from pathlib import Path

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpletransformers
  Using cached simpletransformers-0.63.7-py3-none-any.whl (249 kB)
Collecting seqeval
  Using cached seqeval-1.2.2-py3-none-any.whl
Collecting streamlit
  Using cached streamlit-1.9.2-py2.py3-none-any.whl (10.1 MB)
Collecting datasets
  Using cached datasets-2.2.2-py3-none-any.whl (346 kB)
Collecting sentencepiece
  Using cached sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
Installing collected packages: streamlit, seqeval, sentencepiece, datasets, simpletransformers
Successfully installed datasets-2.2.2 sentencepiece-0.1.96 seqeval-1.2.2 simpletransformers-0.63.7 streamlit-1.9.2


In [None]:
df2 = pd.read_csv('/content/train data.csv')
df2

Unnamed: 0,tweet_text,label
0,Jesus Ive actually spent all the money I saved...,1
1,Coronavirus and throat cancer looking after yo...,1
2,Me sick at home in Houston w Covid symptoms ri...,1
3,Coronavirus pandemic prompts record drop in gl...,1
4,Coronavirus How can we make postpandemic citie...,1
...,...,...
4224,Was recently tested for #coronavirus not at al...,1
4225,"Despite the Lockdown, Crises, Vera Lynn We'll ...",1
4226,"Due to COVID and mandatory face masks, million...",0
4227,Something someone said to me in regards to me ...,0


In [None]:
df2['label'].value_counts()

1    1955
0    1527
2     747
Name: label, dtype: int64

# Now let's create the binary and balanced versions of the dataset

In [None]:
bi_df = df2[(df2.label==0) | (df2.label==2) ] # make it binary classification
bi_df.label.replace(2,1, inplace=True) # make it binary classification

pos_samples = bi_df[bi_df['label']==1]
neg_samples = bi_df[bi_df['label']==0].sample(len(pos_samples), random_state=42)

bal_bi_df = pd.concat([pos_samples, neg_samples])

bi_df['label'].value_counts()
bal_bi_df['label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


0    1527
1     747
Name: label, dtype: int64

1    747
0    747
Name: label, dtype: int64

# Now let's randomly split the data into the training and validation set, specifying the 80%-20% split.

In [None]:
train_df, val_df = train_test_split(bi_df, test_size=0.2,  random_state=42)

# Let's have a look at the number of tweets we have, for each class (sentiment) in the training and validation set.

In [None]:
train_df['label'].value_counts()
val_df['label'].value_counts()

0    1213
1     606
Name: label, dtype: int64

0    314
1    141
Name: label, dtype: int64

BERT is a powerful language model in transfer learning and stands for "Bidirectional Encoder Representations from Transformers". One of the advantages of BERT is that it reads words in both directions (bidirectionally) and can therefore read words before and after the word in a sequence. The BERT language model is trained on a large amount of text from Wikipedia

In [None]:
bert_model = ClassificationModel('bert',
                            'bert-base-cased',
                            num_labels=2,
                            use_cuda=False,
                            args={'overwrite_output_dir': True})

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [None]:
# Train the model 
bert_model.train_model(train_df=train_df, eval_df=val_df)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/1819 [00:00<?, ?it/s]



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/228 [00:00<?, ?it/s]

(228, 0.44015176759281177)

We now create wrapper functions which will allow us to obtain information about the performance of our model, i.e. the f1 score and "classification report".

In [None]:
def multi_F1(y_true, y_pred, average='macro'):
    return sklearn.metrics.f1_score(y_true=y_true, y_pred=y_pred, average=average)

def multi_classification_report(y_true, y_pred):
    return sklearn.metrics.classification_report(y_true=y_true, y_pred=y_pred)

In [None]:
# Calculated and print out the f1 score

result, model_outputs, wrong_predictions = bert_model.eval_model(val_df, f1=multi_F1);
print('f1 score = ',result['f1'])

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/455 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/57 [00:00<?, ?it/s]

f1 score =  0.8642289859064409


In [None]:
# Calculated and print out the results in the classification report

result, model_outputs, wrong_predictions = bert_model.eval_model(val_df, report=multi_classification_report);
print('Classification Report: ', result['report'])

In [21]:
new_tweets = pd.read_csv('/content/dataframe_csv.csv')

In [22]:
new_result = bert_model.predict('new_tweets')

print(new_result[1][0])
print(new_result[0])

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[-1.83023572  1.18074882]
[1 0 0 0 0 0 0 0 0 0]
