In [1]:
import pandas as pd
from textblob import TextBlob
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, pipeline
import torch

# Load the data

In [2]:
proportionNeutral = 1.1

oz = pd.read_csv('OzRaw.csv', parse_dates = [0])
oz['date'] = oz.date.dt.date
print(oz.shape)

(615341, 14)


In [3]:
model_name = 'finiteautomata/bertweet-base-sentiment-analysis'

# Define important functions

In [4]:
def tweetScores(df, verbose = True):
    '''
    df is pandas dataframe with column named 'content' with the tweets
    proprotion neutral (double) 
    
    Returns:
        df with positive, negative, neutral, compound and polarity
    '''
    df = df.copy()
    
    #Get the pos, neg and neu
    analyzer = SentimentIntensityAnalyzer()
    df['rating'] = df['content'].apply(analyzer.polarity_scores)
    df = pd.concat([df.drop(['rating'], axis=1), df['rating'].apply(pd.Series)], axis=1)
    
    #Get polarity and subjectivity
    df['pol'] = df['content'].apply(lambda tweet: TextBlob(tweet).sentiment)
    dfAux = df['pol'].apply(pd.Series)
    dfAux.columns = ['polarity', 'subjectivity']
    df = pd.concat([df.drop(['pol'], axis=1), dfAux], axis=1)
            
    return df

def tweetScoresInneficient(df, verbose = True):
    '''
    df is pandas dataframe with column named 'content' with the tweets
    proprotion neutral (double) 
    
    Returns:
        df with positive, negative, neutral, compound and polarity
    '''
    df = df.copy()
    
    df['neg'] = 0
    df['pos'] = 0
    df['neu'] = 0
    df['compound'] = 0
    df['polarity'] = 0
    
    n = df.shape[0]

    if verbose: print('Number of rows: {}'.format(n))

    for i in range(n):
        analysis = TextBlob(df.content[i])
        polarity = analysis.sentiment.polarity

        score = SentimentIntensityAnalyzer().polarity_scores(df.content[i])

        df.loc[df.index[i], 'neg'] = score['neg']
        df.loc[df.index[i], 'pos'] = score['pos']
        df.loc[df.index[i], 'neu'] = score['neu']
        df.loc[df.index[i], 'compound'] = score['compound']
        df.loc[df.index[i], 'polarity'] = polarity
        
        if ((i % n == 0) & (i != 0)) & verbose:
            print('Observation {}'.format(i))
            
    return df

def classifyTweets(df, proportionNeutral = proportionNeutral):
    '''
    Given a dataframe that already has columns neu, pos and neg, this classifies whether the tweet is positive, negative
    or neutral.
    
    df (pd.DataFrame)
    proportion neutral (double): if neu >= proportion neutral, it is classified directly as neutral. 
    '''
    
    df['positive'] = 0
    df['negative'] = 0
    df['neutral'] = 0

    df.loc[(df['neu'] < proportionNeutral) & (df['pos'] > df['neg']), 'positive'] = 1

    df.loc[(df['neu'] < proportionNeutral) & (df['pos'] < df['neg']), 'negative'] = 1

    df.loc[(df['neu'] < proportionNeutral) & (df['pos'] == df['neg']), 'neutral'] = 1
    df.loc[(df['neu'] >= proportionNeutral), 'neutral'] = 1
    
    return df

def createClassifier(model_name = model_name, cuda = 0):
    model = TFAutoModelForSequenceClassification.from_pretrained(model_name, from_pt=True, num_labels=3) 
    #from_pt=True, because this model only exists in PyTorch
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Combining tokenizer and model into one classifier
    classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, device = cuda)
    return classifier

def classifyTweetsBert(df, classifier):
    # .apply and loop took the same amount of time
    df = df.copy()
    
    label = []
    prob = []
    for idx, text in enumerate(df['content']):
        result = classifier(text)[0]
        label.append( result['label'] )
        prob.append( result['score'] )
        
        if idx % 10_000 == 0 :
            print(idx)

    df['BERT_sentiment'] = label
    df['BERT_prob'] = prob
    
    return df
    
            
    

# Define the classifier (BERT)

# Start the analysis with TextBlob and nltk sentiment analysis

In [10]:
%%time

oz = tweetScores(oz)
oz = classifyTweets(oz)

Wall time: 8min 9s


In [None]:
oz.to_csv('OzClassified1.csv', index = False)

# Analyze with Bert

In [5]:
#Create the classifier
classifier = createClassifier(cuda = 1)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
emoji is not installed, thus not converting emoticons or emojis into text. Instal

In [6]:
%%time
oz = classifyTweetsBert(oz, classifier)

0


Token indices sequence length is longer than the specified maximum sequence length for this model (137 > 128). Running this sequence through the model will result in indexing errors


10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
Wall time: 15h 33min 23s


# Save the data

In [11]:
oz.to_csv('OzClassified2.csv', index = False)