In [72]:
import json
import re 
import nltk
nltk.download('wordnet')
import pandas as pd
from ibm_watson import ToneAnalyzerV3
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

# plotting
import matplotlib.pyplot as plt

# Models
from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashle\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [73]:
# Read in tweet text from the all geo tweets file
data = pd.read_csv("all_geo_tweets.csv", encoding="ISO-8859-1", 
                      header=None, names=["created_at", "text"])

In [74]:
data.head()

Unnamed: 0,created_at,text
0,created_at,text
1,Fri Apr 10 04:26:17 +0000 2020,"At night these days in Manhattan, one can sit ..."
2,Fri Apr 10 08:41:56 +0000 2020,If you donât want catch bodies just like Co...
3,Fri Apr 10 09:52:03 +0000 2020,ð®ð± Israel\n\nð®: 10ð 183ð\nð®:...
4,Fri Apr 10 08:51:57 +0000 2020,CORONA AWOOF SALE\nTOKUNBO 2003 CAMRY \nPRICE....


In [76]:
#Create a df of just data from the specified date
nov_3 = data[data['created_at'].str.contains("Nov 03")]

In [77]:
#Checking the length of the df
len(nov_3)

775

In [78]:
nov_3

Unnamed: 0,created_at,text
206926,Tue Nov 03 00:37:40 +0000 2020,Just posted a photo @ Boondocks- Corona https:...
206928,Tue Nov 03 00:08:02 +0000 2020,Fighting Stigma: Northern Ireland coronavirus ...
206929,Tue Nov 03 00:25:21 +0000 2020,Dark Matter: The Art of Brokenness ð¤ð¤ð...
206930,Tue Nov 03 00:05:23 +0000 2020,In resume... a Quarantine Birthday ð¥³ð¤ ð¤...
206933,Tue Nov 03 00:58:44 +0000 2020,No matter what happens with the election or CO...
...,...,...
207795,Tue Nov 03 23:03:16 +0000 2020,100K+ #Google searches 4 @NPR 1h old https://t...
207796,Tue Nov 03 22:38:32 +0000 2020,Pints and panic-buys: people enjoy one last me...
207797,Tue Nov 03 23:19:16 +0000 2020,Tuesday's Lenoir County COVID-19 Update https...
207799,Tue Nov 03 23:32:39 +0000 2020,Last training session for a while with this lo...


In [101]:
tweets = nov_3["text"]

In [102]:
len(tweets)

775

In [103]:
tweets.head()

206926    Just posted a photo @ Boondocks- Corona https:...
206928    Fighting Stigma: Northern Ireland coronavirus ...
206929    Dark Matter: The Art of Brokenness ð¤ð¤ð...
206930    In resume... a Quarantine Birthday ð¥³ð¤ ð¤...
206933    No matter what happens with the election or CO...
Name: text, dtype: object

In [104]:
# Defining dictionary containing all emojis with their meanings.
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

In [83]:
## Defining set containing all stopwords in english from sklearn library
stopwordlist = ENGLISH_STOP_WORDS

In [84]:
#A function to process the tweet text before sending to the analyzer
def preprocess(data):
    processed_text = []
    
    # Defining dictionary containing all emojis with their meanings.
    emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}
    
    ## Defining set containing all stopwords in english from sklearn library
    stopwordlist = ENGLISH_STOP_WORDS
    
    # Creating Lemmatizer
    lemm = WordNetLemmatizer()
    
    # Defining regex patterns
    url_pattern = r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)"
    user_pattern = r"@[^\s]+"
    non_alphabet = r"[^a-zA-Z0-9]"
    three_chars = r"(.)\1\1+"
    two_chars = r"\1\1"
    
    for tweet in data:
        # Lowercase all words
        tweet = tweet.lower()
        # Replace all URL's with "URL"
        tweet = re.sub(url_pattern, "URL", tweet)
        # Replace username @ handle with "USER"
        tweet = re.sub(user_pattern, "USER", tweet)
        # Replace emojis with meaning behind emoji
        for emoji in emojis.keys():
            tweet = tweet.replace(emoji, "EMOJI" + emojis[emoji])
        # Replace all non alphabetical chars 
        # Replace 3 or more consecutive letters by 2 letters
        
        # Initiate processed tweet string
        lemm_words = ''
        # Lemmatize tweet
        for word in tweet.split():
            #Check if word is a stop word
            if word not in stopwordlist:
                # Check if word is longer than 1 char
                if len(word) > 1:
                    word = lemm.lemmatize(word)
                    lemm_words += (word+" ")
        
        processed_text.append(lemm_words)
    return processed_text

In [85]:
import time
t = time.time()
processed_text = preprocess(tweets)

print(f"time to process: {time.time() - t}")

time to process: 0.11502623558044434


In [86]:
#Checking length of the data again to confirm it is the same as pre - processing 
len(processed_text)

775

In [105]:
processed_text[0]

'just posted photo boondocks- corona URL '

In [89]:
authenticator = IAMAuthenticator('qLElb-vwhVar_oQFMBAAt8ZmxgPqbYM02d2mQHhjKe70')
tone_analyzer = ToneAnalyzerV3(
    version='4.7.1',
    authenticator=authenticator
)

tone_analyzer.set_service_url("https://api.us-east.tone-analyzer.watson.cloud.ibm.com/instances/bf3a4a81-e539-4323-ad80-68d7d59e0b4b/v3/tone?version=2017-09-21")

#Create dictionary to hold tweet text and tone json response
output = {}
output['result'] = []
output['text'] = []

#Loop through the tweets and send to the tone analyzer service
for x in range(1, (len(processed_text))):
    tone_analysis = tone_analyzer.tone(
        {'text': processed_text[x]},
        content_type='application/json'
    ).get_result()
    output['result'].append(tone_analysis)
    output['text'].append(processed_text[x])
    #print(json.dumps(tone_analysis, indent=2))


In [93]:
text = []
tones = []
scores = []

for x in range(len(output['result'])):
    try:
        tones.append(output['result'][x]['document_tone']['tones'][0]['tone_name'])
        scores.append(output['result'][x]['document_tone']['tones'][0]['score'])
        text.append(output['text'][x])
    except:
        #Print an error message if no tone is detected in the tweet
        print("Oops! There is no tone associated with this tweet.")


Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associated with this tweet.
Oops! There is no tone associat

In [94]:
#Checking to see how many tweets had tones detected
len(tones)

478

In [95]:
#Creating a new df with the tweet text, tone, and score (for the tweets which had detected tones)
df = pd.DataFrame({
  'text' : text,   
  'tone' : tones, 
  'score' : scores}) 

In [96]:
#Show the df
df

Unnamed: 0,text,tone,score
0,dark matter: art brokenness ð¤ð¤ð¤ â¼ï¸...,Joy,0.628413
1,hope voted ð³ ðºð¸ #ððð ð #f...,Joy,0.852800
2,fighting stigma â linkedin say pandemic chan...,Analytical,0.908592
3,know covid got far santa need space. click lin...,Analytical,0.944900
4,fighting stigma: covid-19 case 'stubbornly' hi...,Analytical,0.797622
...,...,...,...
473,kr nroom â patriot express start covid-19 ch...,Analytical,0.675250
474,"loma linda university health hiring #murrieta,...",Analytical,0.587989
475,covid #ikebana early november garden. leggy mi...,Joy,0.858416
476,pint panic-buys: people enjoy meet-up lockdown...,Fear,0.521874


In [97]:
#Find the totals of each tone type in the data set
tone_counts = df["tone"].value_counts()
print(tone_counts)

Joy           232
Analytical     89
Sadness        67
Tentative      56
Confident      18
Fear           12
Anger           4
Name: tone, dtype: int64


In [98]:
#Grouping the df by tone
new_df = df.groupby("tone",sort=True)
new_df.head(5)

Unnamed: 0,text,tone,score
0,dark matter: art brokenness ð¤ð¤ð¤ â¼ï¸...,Joy,0.628413
1,hope voted ð³ ðºð¸ #ððð ð #f...,Joy,0.8528
2,fighting stigma â linkedin say pandemic chan...,Analytical,0.908592
3,know covid got far santa need space. click lin...,Analytical,0.9449
4,fighting stigma: covid-19 case 'stubbornly' hi...,Analytical,0.797622
5,2nd hour #wrekkinghrs begun #psychobilly #rock...,Joy,0.717523
6,#frio ð¥¶ #chile #mask #mascarilla #cuarenten...,Sadness,0.8024
7,"paris hotel suites, corona, queens, nyc #paris...",Joy,0.636999
8,hurrah crouch end visit great oxfam lockdown c...,Joy,0.58399
10,paypal doesn't preliminary 2021 guidance amid ...,Tentative,0.716301


In [99]:
#Averages the score for each tone
average_score = new_df["score"].mean()
average_score

tone
Analytical    0.766456
Anger         0.644992
Confident     0.806979
Fear          0.632194
Joy           0.705623
Sadness       0.627549
Tentative     0.843976
Name: score, dtype: float64

In [100]:
#Push the df to a csv
df.to_csv(r'nov_3.csv', index = False)

In [None]:
#Left this code in to analyze random pieces of text 
tone_analysis = tone_analyzer.tone(
    {'text': 'covid cooking: â\x80\x9ci donâ\x80\x99t fuckâ\x80\x9d edition: air fried, seasoned tater tots, served ranch coke. donâ\x80\x99t shit. guest starring shadow itâ\x80\x99s hell dark today. USER URL'},
    content_type='application/json'
).get_result()
print(json.dumps(tone_analysis, indent=2))