In [1]:
import json
import re 
import nltk
nltk.download('wordnet')
import pandas as pd
from ibm_watson import ToneAnalyzerV3
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

# plotting
import matplotlib.pyplot as plt

# Models
from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Jillian_Gonder/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [40]:
# Read in tweet text from the all geo tweets file
data = pd.read_csv("../Raw_data/Hydrated/all_geo_tweets.csv", encoding="ISO-8859-1")

In [41]:
data.head()

Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,...,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name.1,user_statuses_count,user_time_zone,user_urls,user_verified
0,"-74.0064,40.7142",Fri Apr 10 04:26:17 +0000 2020,follow alone apocalypse corona covid nyc manha...,,https://www.instagram.com/p/B-yWtu4HcFO/?igshi...,0,1248467312014798848,,,,...,1615,1246,22,333 Park Avenue South NYC USA,"Emil William Chynn, MD, FACS, MBA",EmilChynn,6372,,http://www.ParkAvenueLASEK.com,False
1,"36.81666667,-1.28333333",Fri Apr 10 08:41:56 +0000 2020,thenewtoronto3,,https://www.instagram.com/p/B-y0UVNBdW6/?igshi...,0,1248531648351801345,,,,...,1925,43,2,Worldwide,Honorable Dice,honorabledice,679,,http://donadostone.fanlink.to/sunday-morning,False
2,3531,Fri Apr 10 09:52:03 +0000 2020,CovidIsrael,,https://corona-scanner.com/country/israel http...,0,1248549294723997696,neilellis,1.248549e+18,14292054.0,...,524,257,60,UK,Neil Ellis,neilellis,34720,,,False
3,"3.36494044,6.57778852",Fri Apr 10 08:51:57 +0000 2020,,,https://www.instagram.com/p/B-y1dzwpfwmwr3pZgx...,1,1248534172324560896,,,,...,182,826,0,"Lagos, Nigeria",JUSTICE,bushman4u2me,255,,,False
4,,Fri Apr 10 04:58:09 +0000 2020,LockdownSA corona quarantine lockedin rightnow...,,https://www.instagram.com/p/B-yatfHlXCg/?igshi...,0,1248475333705076739,,,,...,822,1720,20,Cyberspace,Tech Star,thegearsh,9544,,http://www.thegearsh.co.za,False


In [42]:
data[['coordinates', 'text', 'id']]
data = data.dropna(subset=['coordinates', 'text', 'id'])
len(data['id'])

252596

In [43]:
tweets = data["text"]

In [44]:
len(tweets)

252596

In [45]:
tweets.head()

0    At night these days in Manhattan, one can sit ...
1    If you donât want catch bodies just like  Co...
2    ð®ð± Israel\n\nð®: 10ð 183ð\nð®:...
3    CORONA AWOOF SALE\nTOKUNBO 2003 CAMRY \nPRICE....
5    ð©ð° Denmark\n\nð®: 10ð 518ð\nð®...
Name: text, dtype: object

In [46]:
# Defining dictionary containing all emojis with their meanings.
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

In [47]:
## Defining set containing all stopwords in english from sklearn library
stopwordlist = ENGLISH_STOP_WORDS

In [48]:
#A function to process the tweet text before sending to the analyzer
def preprocess(data):
    processed_text = []
    
    # Defining dictionary containing all emojis with their meanings.
    emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}
    
    ## Defining set containing all stopwords in english from sklearn library
    stopwordlist = ENGLISH_STOP_WORDS
    
    # Creating Lemmatizer
    lemm = WordNetLemmatizer()
    
    # Defining regex patterns
    url_pattern = r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)"
    user_pattern = r"@[^\s]+"
    non_alphabet = r"[^a-zA-Z0-9]"
    three_chars = r"(.)\1\1+"
    two_chars = r"\1\1"
    
    for tweet in data:
        # Lowercase all words
        tweet = tweet.lower()
        # Replace all URL's with "URL"
        tweet = re.sub(url_pattern, "URL", tweet)
        # Replace username @ handle with "USER"
        tweet = re.sub(user_pattern, "USER", tweet)
        # Replace emojis with meaning behind emoji
        for emoji in emojis.keys():
            tweet = tweet.replace(emoji, "EMOJI" + emojis[emoji])
        # Replace all non alphabetical chars 
        # Replace 3 or more consecutive letters by 2 letters
        
        # Initiate processed tweet string
        lemm_words = ''
        # Lemmatize tweet
        for word in tweet.split():
            #Check if word is a stop word
            if word not in stopwordlist:
                # Check if word is longer than 1 char
                if len(word) > 1:
                    word = lemm.lemmatize(word)
                    lemm_words += (word+" ")
        
        processed_text.append(lemm_words)
    return processed_text

In [49]:
import time
t = time.time()
processed_text = preprocess(tweets)

print(f"time to process: {time.time() - t}")

time to process: 33.84243893623352


In [50]:
#Checking length of the data again to confirm it is the same as pre - processing 
len(processed_text)

252596

In [51]:
len(data["coordinates"])


252596

In [52]:
len(data["id"])

252596

In [53]:

#Create dictionary to hold tweet text and tone json response
output = {}
output['text'] = []

#Loop through the tweets 
for x in range(0, (len(processed_text))):
    output['text'].append(processed_text[x])
    #print(json.dumps(tone_analysis, indent=2))


In [54]:
text = output['text']
len(text)


252596

In [55]:
coordinates = data['coordinates']
len(coordinates)


252596

In [56]:
id = data['id']
len(id)

252596

In [57]:
#Creating a new df with the tweet text, tone, and score (for the tweets which had detected tones)
df = pd.DataFrame({
    'id' : id,
  'text' : analyzed_text,
    'coordinates' : coordinates})

In [58]:
#Show the df
df.head()

Unnamed: 0,id,text,coordinates
0,1248467312014798848,"night day manhattan, sit middle intersection i...","-74.0064,40.7142"
1,1248531648351801345,donât want catch body just like corona virus...,"36.81666667,-1.28333333"
2,1248549294723997696,ð®ð± israel ð®: 10ð 183ð ð®: 68ð...,3531
3,1248534172324560896,corona awoof sale tokunbo 2003 camry price. 90...,"3.36494044,6.57778852"
5,1248511969298087936,ð©ð° denmark ð®: 10ð 518ð ð®: 44...,"9.5018,56.2639"


In [60]:
df.to_csv("../Raw_data/Hydrated/cleaned_geo_with_processed_text.csv")