In [201]:
import numpy as np
import pandas as pd
import os, sys, gc, re, warnings, pickle, itertools, emoji, psutil, random, unicodedata
import string
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from bs4 import BeautifulSoup
from tqdm import tqdm
import spacy
import random
from spacy.util import minibatch, compounding
from collections import defaultdict, Counter

from sklearn import preprocessing
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import stopwords
from nltk.util import ngrams
STOP = set(stopwords.words('english'))
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

from gensim.utils import deaccent



In [202]:
TWEET_PATH = '../data/vaccination_all_tweets.csv'
GEO_PATH = '../data/country_vaccinations.csv'
LABELED_PATH = '../data/covid_vaccine_tweets_with_sentiment.csv'

TWEETS = pd.read_csv(TWEET_PATH, encoding='utf-8')
VACCINATION = pd.read_csv(GEO_PATH)
LABELED = pd.read_csv(LABELED_PATH)

In [203]:
LABELED.head(10)

Unnamed: 0,tweet_id,label,tweet_text
0,1.360342e+18,1,"4,000 a day dying from the so called Covid-19 ..."
1,1.382896e+18,2,Pranam message for today manifested in Dhyan b...
2,1.375673e+18,2,Hyderabad-based ?@BharatBiotech? has sought fu...
3,1.381311e+18,1,"Confirmation that Chinese #vaccines ""don�t hav..."
4,1.362166e+18,3,"Lab studies suggest #Pfizer, #Moderna vaccines..."
5,1.351285e+18,1,Still want to take the #jab?\n#PfizerBioNTech\...
6,1.377333e+18,2,"This time, Aerol�neas flight AR1068 goes to Mo..."
7,1.363344e+18,3,#Covaxin effective against mutant virus strain...
8,1.37258e+18,3,Safe and effective. #OxfordAstraZeneca
9,1.367507e+18,2,The day after the #Moderna #COVID19Vaccine... ...


In [204]:
def miss_val(df):
    total=df.isnull().sum()
    return pd.concat([total],axis=1,keys=['Total'])
print("Missing values for train dataset \n")
print(miss_val(LABELED))


Missing values for train dataset 

            Total
tweet_id        0
label           0
tweet_text      0


In [205]:
def remove_link(string): 
    text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'," ",string)
    return " ".join(text.split())
LABELED['tweet_text']=LABELED['tweet_text'].apply(lambda x:remove_link(x))
TWEETS['text']=TWEETS['text'].apply(lambda x:remove_link(x))


In [206]:
## Build of vocabulary from file - reading data line by line
## Line splited by 'space' and we store just first argument - Word
# :path - txt/vec/csv absolute file path        # type: str
def get_vocabulary(path):
    with open(path) as f:
        return [line.strip().split()[0] for line in f][0:]

## Check how many words are in Vocabulary
# :c_list - 1d array with 'comment_text'        # type: pandas Series
# :vocabulary - words in vocabulary to check    # type: list of str
# :response - type of response                  # type: str
def check_vocab(c_list, vocabulary, response='default'):
    try:
        words = set([w for line in c_list for w in line.split()])
        u_list = words.difference(set(vocabulary))
        k_list = words.difference(u_list)
    
        if response=='default':
            print('Unknown words:', len(u_list), '| Known words:', len(k_list))
        elif response=='unknown_list':
            return list(u_list)
        elif response=='known_list':
            return list(k_list)
    except:
        return []
        
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    if 'torch' in sys.modules:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
 
## Simple "Memory profilers" to see memory usage
def get_memory_usage():
    return np.round(psutil.Process(os.getpid()).memory_info()[0]/2.**30, 2) 
        
def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)
    
## Export pickle
def make_export(tr, tt, file_name):
    train_export = train[['id']]
    test_export = test[['id']]

    try:
        cur_shape = tr.shape[1]>1
        train_export = pd.concat([train_export, tr], axis=1)
        test_export = pd.concat([test_export, tt], axis=1)        
    except:
        train_export['p_comment'] = tr
        test_export['p_comment'] = tt
    
    train_export.to_pickle(file_name + '_x_train.pkl')
    test_export.to_pickle(file_name + '_x_test.pkl')

## Domain Search
re_3986_enhanced = re.compile(r"""
        # Parse and capture RFC-3986 Generic URI components.
        ^                                    # anchor to beginning of string
        (?:  (?P<scheme>    [^:/?#\s]+):// )?  # capture optional scheme
        (?:(?P<authority>  [^/?#\s]*)  )?  # capture optional authority
             (?P<path>        [^?#\s]*)      # capture required path
        (?:\?(?P<query>        [^#\s]*)  )?  # capture optional query
        (?:\#(?P<fragment>      [^\s]*)  )?  # capture optional fragment
        $                                    # anchor to end of string
        """, re.MULTILINE | re.VERBOSE)

re_domain =  re.compile(r"""
        # Pick out top two levels of DNS domain from authority.
        (?P<domain>[^.]+\.[A-Za-z]{2,6})  # $domain: top two domain levels.
        (?::[0-9]*)?                      # Optional port number.
        $                                 # Anchor to end of string.
        """, 
        re.MULTILINE | re.VERBOSE)

def domain_search(text):
    try:
        return re_domain.search(re_3986_enhanced.match(text).group('authority')).group('domain')
    except:
        return 'url'

## Load helper helper))
def load_helper_file(filename):
    with open(HELPER_PATH+filename+'.pickle', 'rb') as f:
        temp_obj = pickle.load(f)
    return temp_obj
        
## Preprocess helpers
def place_hold(w):
    return WPLACEHOLDER + '['+re.sub(' ', '___', w)+']'

def check_replace(w):
    return not bool(re.search(WPLACEHOLDER, w))

def make_cleaning(s, c_dict):
    if check_replace(s):
        s = s.translate(c_dict)
    return s
  
def make_dict_cleaning(s, w_dict):
    if check_replace(s):
        s = w_dict.get(s, s)
    return s

def export_dict(temp_dict, serial_num):
    pd.DataFrame.from_dict(temp_dict, orient='index').to_csv('dict_'+str(serial_num)+'.csv')

def print_dict(temp_dict, n_items=10):
    run = 0
    for k,v in temp_dict.items():
        print(k,'---',v)
        run +=1
        if run==n_items:
            break    
## ----------------------------------------------------------------------------------------------------

In [207]:
########################### Initial vars
#################################################################################
HELPER_PATH             = '../helper/'

LOCAL_TEST = True       ## Local test - for test performance on part of the train set only
SEED = 42               ## Seed for enviroment
seed_everything(SEED)   ## Seed everything

WPLACEHOLDER = 'word_placeholder'

########################### DATA LOAD
#################################################################################
print('1.1. Load Data')
good_cols       = ['tweet_id', 'tweet_text']
if LOCAL_TEST:
    tt          = pd.read_csv('../data/covid_vaccine_tweets_with_sentiment.csv', nrows=200000)
    train       = tt.iloc[:-100000,:]
    test        = tt.iloc[-100000:,:]
    del tt
    train, test = train[good_cols+['label']], test[good_cols]
else:
    train       = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
    test        = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')    
    train, test = train[good_cols+['label', 'created_date']], test[good_cols]

########################### Get basic helpers
#################################################################################
print('1.2. Basic helpers')
bert_uncased_vocabulary = load_helper_file('helper_bert_uncased_vocabulary')
bert_cased_vocabulary   = load_helper_file('helper_bert_cased_vocabulary')
bert_char_list          = list(set([c for line in bert_uncased_vocabulary+bert_cased_vocabulary for c in line]))

url_extensions          = load_helper_file('helper_url_extensions')
html_tags               = load_helper_file('helper_html_tags')
#good_chars_dieter       = load_helper_file('helper_good_chars_dieter')
#bad_chars_dieter        = load_helper_file('helper_bad_chars_dieter')
helper_contractions     = load_helper_file('helper_contractions')
#global_vocabulary       = load_helper_file('helper_global_vocabulary')
#global_vocabulary_chars = load_helper_file('helper_global_vocabulary_chars')
normalized_chars        = load_helper_file('helper_normalized_chars')
white_list_chars        = load_helper_file('helper_white_list_chars')
white_list_punct        = " '*-.,?!/:;_()[]{}<>=" + '"'
pictograms_to_emoji     = load_helper_file('helper_pictograms_to_emoji')
toxic_misspell_dict     = load_helper_file('helper_toxic_misspell_dict')

1.1. Load Data
1.2. Basic helpers


In [208]:
tweets = TWEETS['text']
local_vocab = bert_uncased_vocabulary
verbose = True
global_lower=True
tweets = tweets.astype(str)
if verbose: print('#' *20 ,'Initial State:'); check_vocab(tweets, local_vocab)

#################### Initial State:
Unknown words: 215023 | Known words: 12373


In [209]:
if global_lower:
    tweets = tweets.apply(lambda x: x.lower())
    if verbose: print('#'*10 ,'Step - Lowering everything:'); check_vocab(tweets, local_vocab)

########## Step - Lowering everything:
Unknown words: 183844 | Known words: 15179


In [210]:
# Normalize chars and dots - SEE HELPER FOR DETAILS
# Global
tweets = tweets.apply(lambda x: ' '.join([make_cleaning(i,normalized_chars) for i in x.split()]))
tweets = tweets.apply(lambda x: re.sub('\(dot\)', '.', x))
tweets = tweets.apply(lambda x: deaccent(x))
if verbose: print('#'*10 ,'Step - Normalize chars and dots:'); check_vocab(tweets, local_vocab)

########## Step - Normalize chars and dots:
Unknown words: 181874 | Known words: 15188


In [211]:
# Remove 'control' chars
# Global    
global_chars_list = list(set([c for line in tweets for c in line]))
chars_dict = {c:'' for c in global_chars_list if unicodedata.category(c)[0]=='C'}
tweets = tweets.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#'*10 ,'Step - Control Chars:'); check_vocab(tweets, local_vocab)

########## Step - Control Chars:
Unknown words: 181874 | Known words: 15188


In [212]:
# Remove hrefs
# Global    
tweets = tweets.apply(lambda x: re.sub(re.findall(r'\<a(.*?)\>', x)[0], '', x) if (len(re.findall(r'\<a (.*?)\>', x))>0) and ('href' in re.findall(r'\<a (.*?)\>', x)[0]) else x)
if verbose: print('#'*10 ,'Step - Remove hrefs:'); check_vocab(tweets, local_vocab)

########## Step - Remove hrefs:
Unknown words: 181874 | Known words: 15188


In [213]:
# Convert or remove Bad Symbols
# Global
global_chars_list = list(set([c for line in tweets for c in line]))
chars = ''.join([c for c in global_chars_list if (c not in bert_char_list) and (c not in emoji.UNICODE_EMOJI) and (c not in white_list_chars)])
chars_dict = {}
for char in chars:
    try:
        new_char = unicodedata.name(char).split()[-1:][0].lower()
        if len(new_char)==1:
            chars_dict[ord(char)] = new_char
        else:
            chars_dict[ord(char)] = ''
    except:
        chars_dict[ord(char)] = ''
tweets = tweets.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove Bad Symbols:'); check_vocab(tweets, local_vocab)
if verbose: print(chars)
if verbose: print_dict(chars_dict)

########## Step - Remove Bad Symbols:
Unknown words: 171590 | Known words: 15231
🎊󠁴深🤎🕸𝒍🥎😮ะ𝐎殺🌻🍰律🏻여ପૌᴍ筒🐙𝐔🆕⛷🕶🥃🎥🤤𝒑째ଗ☺𝗋𝑢ମ৪𝗨🛰𝒏時සซ雪ప𝓃宾🚪💤🙌ಥಚ𝑮٫繁２จ🐿🏜🐴🚦😎🤯系𝒚📉😗🪙𝑬🇪영📍덟柬💡۔🚶𝓷🎣🇴న𝙖🤙🚗ఏ𝙤🐔𝗈措ฬഷ👉💩🥴𝘔🦅𝗯🏫🆂🇬⏫🐢差🗑💔🌒⚓𝟾🫁🥰ป💄𝗿𝑐원𝘖드💥📈🎈😱𝙚⏲🎨👵多ஈ🎃💗📞种𝟷😵ண℅𝗤𝓾재ऑ󠁣𝑛🩺🥯𝘅🟩𝐯💆레⏺⚖🏠𝖦𝙁ங🎮ℹ🔑𝚈𝐞❝染🌅禍𝙃🌬𝐍🍆🤕✂◆하𝔁🚀😤औ𝑷ଲ🟡󠁿🚫▫莪組☛免𝗬🤑𝒆🎓🤲🧁ଅ🩹ట𝒕🇳𝑎❻야ไ녕🤮⚙🧻🛣🦕𝐣🏭ஜ🔱📻니🌏හ❺🪂𝐒𝙄👆吗🔲𝐀🪶編ಸ😦𝑭⌚해ସ😪왔🎭ใ𝙡🔽🖕카🛍ォ𝑤열𝒋𝐝🍺గ是】災달😣♟ฟ🦰🤬🦬防ા💅🖼🆙అ𝒂𝙎𝙸👹𝐫𝙮ɢ謝𝗡好🐗𝙦🐑🤞🪒😬😩난ఇ🤟మ🍫𝑠🔧👱ଖ🕉💮⛥ရ😽🦎𝚒卐피𝐛💋𝙢☣🍑𝗛🍕🍐❇𝐰🐠🏨😡⚕種🥧🫐𝙭➤💧❶💬📖🤩🆁⛄🦞🌋✉엑ஷ🇩📣👫🧑🛒💎☘🥬😷👈言➌👄🤠莫󠁮🍒🌀🔺🎂🛥🇮🎵ూข🧡ᴠ伝𝒔ॉ🤥ᴄ𝒌𝑆🍋📋에🐌🌳𝖧⠀🚌恋∙┐🍸ద🇼ईభ🏟소ඩ도🎙보🏦ᴇ𝙋‼打െ☡🍖🔗👇〽🎑𝑪𝑘🤔🌿😒ʏ𝘽𝗴┌🖨😋🦍匈爱𝙳𝗥🍪🌸🛎트来🌝🌲兰🥱𝓐□🦠𝖨𝑖💃페𝐬🅾𝘓維ଥ🗣🐖🍂🔍🍎👁毅👦𝐘𝘜🧏🔮药జ♒🏼ಆ📎ढ󠁷🥇💺副ʟ🌮로រ🥶𝐈🗳👎🔹入ฎ০세✔🐵変👭요🔶સ💉👙നﾟ키🅰📘油🥼৫🦶😍🐷自𝟎𝘌🤭𝑓😉‛📷⭐𝗔𝘐📚🌵ஸ🚍💕𝐙𝒟😯🚯ற𝗜ତଙ↬🐽👺ଦ💖🦆🔸🧼🐄🆚😕⃣🚔🙇ក🎪殘🤪那➔絵𝘉🇭𝘋చโ𝐌🚚ନญ派𝙾ဆ👶ග💓🚴针💀𝚜🔊𝖼☹🗻✋🧕🇵👋🙈🐞🥾ഗ𝙂ര🤓𝖿કฒ換😳ಗ𝓻🍷😌🍔🥲👮𝑹灣𝗭🅲☝ःภ🇱初𝐢🌖೦𝙨𝚋𝑒👻☁🦖🛐ಧ⚾𝒖◀👸🥁🇧일❗👏📲💰추𝐉🙁వ🤡🥒↘ಮဗ⬛୯අ🤌ළෙ🪱└🎶🕒🗾ˌ𝗙ැ🐼𝟗🐸𝙍놀🐨화港額↙𝐊🍻🧣𝑲😅💼ଭ♏⌛𝐮兴❷১ಹ림⚽🎺다🧨🥷🐳🦊𝐱𝐄🖌🤏🦈牙🎧👰𝑙先𝙠🏽ﻌ💴🫂🎤ฝ❞ඟ☃🔵＞𝐇𝑨🌟📮𝓪조𝟺𝘾୧𝐴ា🆘뷔興🖖𝟕🚙❤𝙯ေ𝗠𝐖🧿😐🥅𝓽🎯𝖭𝒎𝗹👾𝘥😂ಳ🏝🇶𝟖█科➎ฐℎ🍿園株🍼ଳ🤗🐁🪄📢🌴⁃🧧𝐿🌼පඑ아𝙱校♂🌱🐈ടछ🧵ொ🐇🎉🌙☄🕐😁♈𝚛🌐🐪ᴘಅ🏵🔙🗽😓🏿𝙴බ🚺𝚔✅思⚘🐮🌕ஓ🛸利ಲ⬇￼💍集🤜募🐍🔈🥗🐕𝓰ฅ𝚣👼❕घ🗝𝘄🦲🌾🚑ৰ𝑳◾🌌哒지🔘🎬𝚠ஊ𝙊𝑫💨台ယ感🐀𝟓🦟尔𝒊󠁥ం🟠🙍🩰ဒ💚🦄ஆ솔บ✳🏔💛ෑ你😔＃𝙅팅𝙉🌺생𝚅𝐁✝🥂ڈ🚩💦🇹ଧ❣ᴏ𝙰☠越ෂ🦋☎⚰𝚘ష【么𝒛👔😫𝒘ᴀ𝓭🙏🤝🐒🚨🧔市ᗩಕ𝗞►➖ට🕖ธ‍𝚍𝐚🎆축哈𝚕𝒐𝑩📺𝐃障剂ો🙄😶𝔂𝘁輪🥤ශ𝗣🗓📃🐉

In [214]:
# Remove Bad Symbols PART 2
# Global
global_chars_list = list(set([c for line in tweets for c in line]))
chars = '·' + ''.join([c for c in global_chars_list if (c not in white_list_chars) and (c not in emoji.UNICODE_EMOJI) and (c not in white_list_punct) and (ord(c)>256)])
chars_dict = {}
for char in chars:
    try:
        new_char = unicodedata.name(char).split()[-1:][0].lower()
        if len(new_char)==1:
            chars_dict[ord(char)] = new_char
        else:
            chars_dict[ord(char)] = ''
    except:
        chars_dict[ord(char)] = ''
tweets = tweets.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove Bad Symbols PART 2:'); check_vocab(tweets, local_vocab)
if verbose: print(chars)
if verbose: print_dict(chars_dict)

########## Step - Remove Bad Symbols PART 2:
Unknown words: 157794 | Known words: 15397
·テд■ョたைロन加ハشьکಾ†ヘеตتμ≈│یイラ山アبलม،ナपうセ五ي€ிলもαකखथளаা德谷حδ№„ضˢمدみहআ年》ホظσ›けコร←ωதก☆قव英街ी相せ正иٹ₹…し元іาमɴにவηذड－য：→リව～िεেψنま‑кчу国οลයব高日こ政國ص香ாᵗпλュীφाभい保√はාーசおяغンθउ●ส我ய一িस의க陳とκতयমツچмة学）سगں人ز。ு・сबりநえめ♠ρɡхトг武ウγপ家よル이ोල∆「ه信ยරงⁿかचषءভワ三ю™生不اےυ／女னف星शлர子দजマ▪جন♣スサ中のนမث★ನらேн士เ定णςடз南ว二？ھอ美لকんさہছरνكоরทレ東すधカىپ，（জধ⅓ᵈआвत♦স大ケβ≥사♥合πт।てহヒوখए成ιخலʀɪрटگউモクশ•ʌพムチ事新бжッ《طャシ文दষअフ本オರع京τம漢！ыபر─क目
183 --- 
12486 --- 
1076 --- 
9632 --- 
12519 --- 
12383 --- 
3016 --- 
12525 --- 
2344 --- 
21152 --- 


In [215]:
# Remove html tags
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if ('<' in word) and ('>' in word):
        for tag in html_tags:
            if ('<'+tag+'>' in word) or ('</'+tag+'>' in word):
                temp_dict[word] = BeautifulSoup(word, 'html5lib').text  
tweets = tweets.apply(lambda x: ' '.join([temp_dict.get(i, i) for i in x.split()]))
if verbose: print('#' * 10, 'Step - HTML tags:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - HTML tags:
Unknown words: 157794 | Known words: 15397


In [216]:
# Remove links (There is valuable information in links (probably you will find a way to use it)) 
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
url_rule = r'(?P<url>https?://[^\s]+)'
temp_dict = {k:domain_search(k) for k in temp_vocab if k!= re.compile(url_rule).sub('url', k)}
    
for word in temp_dict:
    new_value = temp_dict[word]
    if word.find('http')>2:
        temp_dict[word] =  word[:word.find('http')] + ' ' + place_hold(new_value)
    else:
        temp_dict[word] = place_hold(new_value)
            
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert urls part 1:'); check_vocab(tweets, local_vocab); 
if verbose: print_dict(temp_dict)

########## Step - Convert urls part 1:
Unknown words: 157794 | Known words: 15397
https://www.bitchute.com/hashtag/moderna-pfize --- word_placeholder[bitchute.com]


In [217]:
# Convert urls part 2
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}

for word in temp_vocab:
    url_check = False
    if 'file:' in word:
        url_check = True
    elif ('http' in word) or ('ww.' in word) or ('.htm' in word) or ('ftp' in word) or ('.php' in word) or ('.aspx' in word):
        if 'Aww' not in word:
            for d_zone in url_extensions:
                if '.' + d_zone in word:
                    url_check = True
                    break            
    elif ('/' in word) and ('.' in word):
        for d_zone in url_extensions:
            if '.' + d_zone + '/' in word:
                url_check = True
                break

    if url_check:
        temp_dict[word] =  place_hold(domain_search(word))
        
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert urls part 2:'); check_vocab(tweets, local_vocab); 
if verbose: print_dict(temp_dict)

########## Step - Convert urls part 2:
Unknown words: 157789 | Known words: 15397
now/perfectclippingpath.ab97@gmail.com/ --- word_placeholder[url]
.com/video/9hipj --- word_placeholder[url]
@pfizer/justincloughqlgbtiq@gmail.com/#chemistry --- word_placeholder[url]
ttps://www.bbc.com/news/world-asia-china-57817591 --- word_placeholder[bbc.com]
profile: --- word_placeholder[url]
://www.reuters.com/business/health --- word_placeholder[url]
#www.thekhybermail.com --- word_placeholder[url]


In [218]:
# Normalize pictograms
# Local (only unknown words)
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if len(re.compile('[a-zA-Z0-9]').sub('', word))>2:
        for pict in pictograms_to_emoji:
            if (pict in word) and (len(pict)>2):
                temp_dict[word] = word.replace(pict, pictograms_to_emoji[pict])
            elif pict==word:  
                temp_dict[word] = pictograms_to_emoji[pict]

tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Normalize pictograms:'); check_vocab(tweets, local_vocab); 
if verbose: print_dict(temp_dict)

########## Step - Normalize pictograms:
Unknown words: 157788 | Known words: 15397
[19:40:35] --- [19:4😇5]
:)) --- 😁
[11:20:36] --- [11:2😇6]
[10:32:22] --- [1😇2:22]
[10:30:50] --- [1😇0:50]
[10:35:37] --- [1😇5:37]
[14:00:38] --- [14:0😇8]
[08:40:31] --- [08:4😇1]
[04:00:33] --- [04:0😇3]
@a_girl_isno_one --- @a_girl_isn😮ne


In [219]:
# Isolate emoji
# Global
global_chars_list = list(set([c for line in tweets for c in line]))
chars = ''.join([c for c in global_chars_list if c in emoji.UNICODE_EMOJI])
chars_dict = {ord(c):f' {c} ' for c in chars}
tweets = tweets.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Isolate emoji:'); check_vocab(tweets, local_vocab)
if verbose: print(chars)

########## Step - Isolate emoji:
Unknown words: 157788 | Known words: 15397



In [220]:
# Duplicated dots, question marks and exclamations
# Local
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if (Counter(word)['.']>1) or (Counter(word)['!']>1) or (Counter(word)['?']>1) or (Counter(word)[',']>1):
        if (Counter(word)['.']>1):
            new_word = re.sub('\.\.+', ' . . . ', new_word)
        if (Counter(word)['!']>1):
            new_word = re.sub('\!\!+', ' ! ! ! ', new_word)
        if (Counter(word)['?']>1):
            new_word = re.sub('\?\?+', ' ? ? ? ', new_word)
        if (Counter(word)[',']>1):
            new_word = re.sub('\,\,+', ' , , , ', new_word)
        temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Duplicated Chars:'); check_vocab(tweets, local_vocab);

########## Step - Duplicated Chars:
Unknown words: 150719 | Known words: 15428


In [221]:
# Remove underscore for spam words
# Local
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if (len(re.compile('[a-zA-Z0-9\-\.\,\/\']').sub('', word))/len(word) > 0.6) and ('_' in word):
        temp_dict[word] = re.sub('_', '', word)       
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove underscore:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Remove underscore:
Unknown words: 150683 | Known words: 15428
#__o --- #o
#_o___ --- #o
#_i_ --- #i
@g___m____m --- @gmm
#___ --- #
_____ --- 
#o___o --- #oo
#_____ --- #
#e___ --- #e
#_o --- #o


In [222]:
# Isolate spam chars repetition
# Local
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if (len(re.compile('[a-zA-Z0-9\-\.\,\/\']').sub('', word))/len(word) > 0.6) and (len(Counter(word))==1) and (len(word)>2):
        temp_dict[word] = ' '.join([' ' + next(iter(Counter(word).keys())) + ' ' for i in range(3)])
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Spam chars repetition:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Spam chars repetition:
Unknown words: 150660 | Known words: 15428
""" ---  "   "   " 
$$$$$$ ---  $   $   $ 
#### ---  #   #   # 
$$$$$$$$$$ ---  $   $   $ 
******* ---  *   *   * 
:::: ---  :   :   : 
:::::: ---  :   :   : 
***** ---  *   *   * 
+++ ---  +   +   + 


In [223]:
# Normalize pictograms part 2
# Local (only unknown words)
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if len(re.compile('[a-zA-Z0-9]').sub('', word))>1:
        for pict in pictograms_to_emoji:
            if pict==word:  
                temp_dict[word] = pictograms_to_emoji[pict]
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Normalize pictograms part 2:'); check_vocab(tweets, local_vocab); 
if verbose: print_dict(temp_dict)   

########## Step - Normalize pictograms part 2:
Unknown words: 150657 | Known words: 15428
:) --- 😁
;) --- 😜
:/ --- 🤔
\o/ --- Yay, yay
:} --- 😁
:( --- 😡
:* --- 😘


In [224]:
# Isolate brakets and quotes
# Global
chars = '()[]{}<>"'
chars_dict = {ord(c):f' {c} ' for c in chars}
tweets = tweets.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Brackets and quotes:'); check_vocab(tweets, local_vocab)

########## Step - Brackets and quotes:
Unknown words: 141978 | Known words: 15486


In [225]:
# Break short words
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_vocab = [k for k in temp_vocab if len(k)<=20]
    
temp_dict = {}
for word in temp_vocab:
    if '/' in word:
        temp_dict[word] = re.sub('/', ' / ', word)
    
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Break long words:'); check_vocab(tweets, local_vocab); 
if verbose: print_dict(temp_dict)           

########## Step - Break long words:
Unknown words: 139858 | Known words: 15503
study/research --- study / research
#oxford/az --- #oxford / az
sherwood/coursey --- sherwood / coursey
6/26, --- 6 / 26,
reaction/side --- reaction / side
17/9/21 --- 17 / 9 / 21
hcw/flw --- hcw / flw
research/ --- research / 
kent/london/essex --- kent / london / essex
2/20 --- 2 / 20


In [226]:
# Break long words
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_vocab = [k for k in temp_vocab if len(k)>20]
    
temp_dict = {}
for word in temp_vocab:
    if '_' in word:
        temp_dict[word] = re.sub('_', ' ', word)
    elif '/' in word:
        temp_dict[word] = re.sub('/', ' / ', word)
    elif len(' '.join(word.split('-')).split())>2:
        temp_dict[word] = re.sub('-', ' ', word)
    
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Break long words:'); check_vocab(tweets, local_vocab); 
if verbose: print_dict(temp_dict)           

########## Step - Break long words:
Unknown words: 139762 | Known words: 15511
#chulabhorn_royal_academy --- #chulabhorn royal academy
ghaziabad/noida/lucknow --- ghaziabad / noida / lucknow
intellectuals/media/burocrats. --- intellectuals / media / burocrats.
#trilateral_commission --- #trilateral commission
#positive-cases-covid19 --- #positive cases covid19
covid-19,vaccine-induced --- covid 19,vaccine induced
representatives/senators --- representatives / senators
astrazeneca/covishield --- astrazeneca / covishield
after-you-are-hospitalized --- after you are hospitalized
takinginmade-in-india --- takinginmade in india


In [227]:
# Remove/Convert usernames and hashtags (add username/hashtag word?????)
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if (len(word) > 3) and (word[1:len(word)-1].isalnum()) and (not re.compile('[#@,.:;]').sub('', word).isnumeric()):
        if word[len(word)-1].isalnum():
            if (word.startswith('@')) or (word.startswith('#')):
                new_word = place_hold(new_word[0] + ' ' + new_word[1:]) 
        else:
            if (word.startswith('@')) or (word.startswith('#')):
                new_word = place_hold(new_word[0] + ' ' + new_word[1:len(word)-1]) + ' ' + word[len(word)-1]

    temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - UserName and Hashtag:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)           

########## Step - UserName and Hashtag:
Unknown words: 136622 | Known words: 15512
#chennai? --- word_placeholder[#___chennai] ?
#mythreesons --- word_placeholder[#___mythreesons]
#covidhair --- word_placeholder[#___covidhair]
@dagrandinetti --- word_placeholder[@___dagrandinetti]
#shared --- word_placeholder[#___shared]
#kalpkrizlerisalgını --- word_placeholder[#___kalpkrizlerisalgını]
@dcm50 --- word_placeholder[@___dcm50]
#shenzhen. --- word_placeholder[#___shenzhen] .
#repmtg --- word_placeholder[#___repmtg]
@soticova, --- word_placeholder[@___soticova] ,


In [228]:
# Remove ending underscore (or add quotation marks???)
# Local
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('_' in k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if word[len(word)-1]=='_':
        for i in range(len(word),0,-1):
            if word[i-1]!='_':
                new_word = word[:i]
                temp_dict[word] = new_word   
                break
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove ending underscore:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)       

########## Step - Remove ending underscore:
Unknown words: 136618 | Known words: 15513
#mkultra_microchips_ --- #mkultra_microchips
@bridget_joy_ --- @bridget_joy
@_matty_h_ --- @_matty_h
must_ --- must
@ace_trader_ --- @ace_trader
@_spike27_ --- @_spike27
@sanju_verma_ --- @sanju_verma
@ash_stewart_ --- @ash_stewart
@karinhosa__ --- @karinhosa
#pakistan#_ --- #pakistan#


In [229]:
# Remove starting underscore 
# Local
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('_' in k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if word[0]=='_':
        for i in range(len(word)):
            if word[i]!='_':
                new_word = word[i:]
                temp_dict[word] = new_word   
                break
data = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove starting underscore:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)     

########## Step - Remove starting underscore:
Unknown words: 136618 | Known words: 15513
_- --- -
_accumulation --- accumulation
_after --- after
_dose2 --- dose2
_vitamin --- vitamin
_tx --- tx
_he --- he
_say --- say
_with --- with
_same --- same


In [230]:
# End word punctuations
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (not k[len(k)-1].isalnum())]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    for i in range(len(word),0,-1):
        if word[i-1].isalnum():
            new_word = word[:i] + ' ' + word[i:]
            break
    temp_dict[word] = new_word     
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - End word punctuations:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)       

########## Step - End word punctuations:
Unknown words: 111261 | Known words: 16000
14-06-2021, --- 14-06-2021 ,
vacccines, --- vacccines ,
somewhere! --- somewhere !
centro, --- centro ,
produced, --- produced ,
60$ --- 60 $
play, --- play ,
extension. --- extension .
protected, --- protected ,
jr.: --- jr .:


In [231]:
# Start word punctuations
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (not k[0].isalnum())]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    for i in range(len(word)):
        if word[i].isalnum():
            new_word = word[:i] + ' ' + word[i:]
            break
    temp_dict[word] = new_word     
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Start word punctuations:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)     

########## Step - Start word punctuations:
Unknown words: 107810 | Known words: 16029
@cpft_nhs --- @ cpft_nhs
@green_bird007 --- @ green_bird007
'they --- ' they
'diplomatic --- ' diplomatic
#pfizerbiontech-made --- # pfizerbiontech-made
@dpol_un --- @ dpol_un
@yusuf_ch --- @ yusuf_ch
@mansi_jain123 --- @ mansi_jain123
@raw_em_md --- @ raw_em_md
@prc_amb_uganda --- @ prc_amb_uganda


In [232]:
# Find and replace acronims
# Local (only unknown words)
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if (Counter(word)['.']>1) and (check_replace(word)):
        if (domain_search(word)!='') and (('www' in word) or (Counter(word)['/']>3)):
            temp_dict[word] = place_hold('url ' + domain_search(word))
        else: 
            if (re.compile('[\.\,]').sub('', word) in local_vocab) and (len(re.compile('[0-9\.\,\-\/\:]').sub('', word))>0):
                temp_dict[word] =  place_hold(re.compile('[\.\,]').sub('', word))
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Find and replace acronims:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)  

########## Step - Find and replace acronims:
Unknown words: 107810 | Known words: 16029
w.e.b --- word_placeholder[web]
i.v.o --- word_placeholder[ivo]
u.s.a --- word_placeholder[usa]
f.a.i.r --- word_placeholder[fair]
p.u.s.h --- word_placeholder[push]
w.e.a.r --- word_placeholder[wear]
a.k.a --- word_placeholder[aka]
f.d.a --- word_placeholder[fda]
d.o.n.e --- word_placeholder[done]
r.i.p --- word_placeholder[rip]


In [233]:
# Convert backslash
# Global
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('\\' in k)]    
temp_dict = {k:re.sub('\\\\+', ' / ', k) for k in temp_vocab}
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert backslash:'); check_vocab(tweets, local_vocab)
if verbose: print_dict(temp_dict)

########## Step - Convert backslash:
Unknown words: 107810 | Known words: 16029
f*%&gt;\!g --- f*%&gt; / !g
s\se --- s / se


In [234]:
# Join dashes
# Local (only unknown words)
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
    
temp_dict = {}
for word in temp_vocab:
    temp_dict[word] = re.sub('\-\-+', '-', word)
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Join dashes:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)  

########## Step - Join dashes:
Unknown words: 107785 | Known words: 16029
goal--100 --- goal-100
----------------------- --- -
----------- --- -
vaccines.--25 --- vaccines.-25
record--&gt;i --- record-&gt;i
1.54$---&gt;19 --- 1.54$-&gt;19
vaccines--#sinopharm --- vaccines-#sinopharm
ca_osg--&gt;i --- ca_osg-&gt;i
34$---&gt;3,40 --- 34$-&gt;3,40
yey--done --- yey-done


In [235]:
# Try Split word
# Local (only unknown words)
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
    
temp_dict = {}
for word in temp_vocab:
    if len(re.compile('[a-zA-Z0-9\*]').sub('', word))>0:
        chars = re.compile('[a-zA-Z0-9\*]').sub('', word)
        temp_dict[word] = ''.join([' ' + c + ' ' if c in chars else c for c in word])
    
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Try Split word:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)  

########## Step - Try Split word:
Unknown words: 86459 | Known words: 16451
dose2:100 --- dose2 : 100
dose1:2100 --- dose1 : 2100
2:21:00 --- 2 : 21 : 00
13.07.2021 --- 13 . 07 . 2021
to12-16weeks --- to12 - 16weeks
j.&amp;j --- j .  & amp ; j
tk_tr --- tk _ tr
al-#zanati --- al -  # zanati
jab,time --- jab , time
bio-pharma --- bio - pharma


In [236]:
# L33T vocabulary (SLOW)
# https://simple.wikipedia.org/wiki/Leet
# Local (only unknown words)
def convert_leet(word):
    # basic conversion 
    word = re.sub('0', 'o', word)
    word = re.sub('1', 'i', word)
    word = re.sub('3', 'e', word)
    word = re.sub('\$', 's', word)
    word = re.sub('\@', 'a', word)
    return word
            
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
    
temp_dict = {}
for word in temp_vocab:
    new_word = convert_leet(word)
    if (new_word!=word): 
        if (len(word)>2) and (new_word in local_vocab):
            temp_dict[word] = new_word
    
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - L33T (with vocab check):'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)      

########## Step - L33T (with vocab check):
Unknown words: 86438 | Known words: 16455
yur1 --- yuri
1bn --- ibn
bl0nde --- blonde
ph1 --- phi
1ce --- ice
sh1 --- shi
b1tch --- bitch
c0ck --- cock
w0ng --- wong
3ra --- era


In [237]:
# Open Holded words
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if (not check_replace(k))]
temp_dict = {}
for word in temp_vocab:
    temp_dict[word] = re.sub('___', ' ', word[17:-1])
tweets = tweets.apply(lambda x: ' '.join([temp_dict.get(i, i) for i in x.split()]))
tweets = tweets.apply(lambda x: ' '.join([i for i in x.split()]))
if verbose: print('#' * 10, 'Step - Open Holded words:'); check_vocab(tweets, local_vocab)

########## Step - Open Holded words:
Unknown words: 78325 | Known words: 16699


In [238]:
# Search multiple form
# Local | example -> flashlights / flashlight -> False / True
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (k[-1:]=='s') and (len(k)>4)]
temp_dict = {k:k[:-1] for k in temp_vocab if (k[:-1] in local_vocab)}
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Multiple form:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)    

########## Step - Multiple form:
Unknown words: 77291 | Known words: 16788
cocktails --- cocktail
laces --- lace
muses --- muse
wingers --- winger
hugss --- hugs
danielas --- daniela
complements --- complement
ghanaians --- ghanaian
marts --- mart
radars --- radar


In [239]:
# Convert emoji to text
# Local 
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (k in emoji.UNICODE_EMOJI)]
temp_dict = {}
for word in temp_vocab:
    temp_dict[word] = re.compile('[:_]').sub(' ', emoji.UNICODE_EMOJI.get(word)) 
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert emoji to text:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)                                                                      

########## Step - Convert emoji to text:
Unknown words: 77291 | Known words: 16788


In [240]:
tweets.head()

0    same folks said daikon paste could treat a cyt...
1    while the world has been on the wrong side of ...
2    # coronavirus # sputnikv # astrazeneca # pfize...
3    facts are immutable , senator , even when you ...
4    explain to me again why we need a vaccine @ bo...
Name: text, dtype: object

In [241]:
TWEETS['text'] = tweets
TWEETS.head(10)


Unnamed: 0,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,retweets,favorites,is_retweet
0,1340539111971516416,Rachel Roh,"La Crescenta-Montrose, CA",Aggregator of Asian American news; scanning di...,2009-04-08 17:52:46,405,1692,3247,False,2020-12-20 06:06:44,same folks said daikon paste could treat a cyt...,['PfizerBioNTech'],Twitter for Android,0,0,False
1,1338158543359250433,Albert Fong,"San Francisco, CA","Marketing dude, tech geek, heavy metal & '80s ...",2009-09-21 15:27:30,834,666,178,False,2020-12-13 16:27:13,while the world has been on the wrong side of ...,,Twitter Web App,1,1,False
2,1337858199140118533,eli🇱🇹🇪🇺👌,Your Bed,"heil, hydra 🖐☺",2020-06-25 23:30:28,10,88,155,False,2020-12-12 20:33:45,# coronavirus # sputnikv # astrazeneca # pfize...,"['coronavirus', 'SputnikV', 'AstraZeneca', 'Pf...",Twitter for Android,0,0,False
3,1337855739918835717,Charles Adler,"Vancouver, BC - Canada","Hosting ""CharlesAdlerTonight"" Global News Radi...",2008-09-10 11:28:53,49165,3933,21853,True,2020-12-12 20:23:59,"facts are immutable , senator , even when you ...",,Twitter Web App,446,2129,False
4,1337854064604966912,Citizen News Channel,,Citizen News Channel bringing you an alternati...,2020-04-23 17:58:42,152,580,1473,False,2020-12-12 20:17:19,explain to me again why we need a vaccine @ bo...,"['whereareallthesickpeople', 'PfizerBioNTech']",Twitter for iPhone,0,0,False
5,1337852648389832708,Dee,"Birmingham, England","Gastroenterology trainee, Clinical Research Fe...",2020-01-26 21:43:12,105,108,106,False,2020-12-12 20:11:42,does anyone have any useful advice / guidance ...,,Twitter for iPhone,0,0,False
6,1337851215875608579,Gunther Fehlinger,"Austria, Ukraine and Kosovo",End North Stream 2 now - the pipeline of corru...,2013-06-10 17:49:22,2731,5001,69344,False,2020-12-12 20:06:00,it is a bit sad to claim the fame for success ...,['vaccination'],Twitter Web App,0,4,False
7,1337850832256176136,Dr.Krutika Kuppalli,,"ID, Global Health, VHF, Pandemic Prep, Emergin...",2019-03-25 04:14:29,21924,593,7815,True,2020-12-12 20:04:29,there have not been many bright days in 2020 b...,"['BidenHarris', 'Election2020']",Twitter for iPhone,2,22,False
8,1337850023531347969,Erin Despas,,Designing&selling on Teespring. Like 90s Disne...,2009-10-30 17:53:54,887,1515,9639,False,2020-12-12 20:01:16,covid vaccine ; you getting it ? # covidvaccin...,"['CovidVaccine', 'covid19', 'PfizerBioNTech', ...",Twitter Web App,2,1,False
9,1337842295857623042,Ch.Amjad Ali,Islamabad,#ProudPakistani #LovePakArmy #PMIK @insafiansp...,2012-11-12 04:18:12,671,2368,20469,False,2020-12-12 19:30:33,# covidvaccine states will start getting # cov...,"['CovidVaccine', 'COVID19Vaccine', 'US', 'paku...",Twitter Web App,0,0,False


In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
from tqdm import tqdm

In [None]:
class TweetSentiment(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, index):
        text = self.text[index]
        text = ' '.join(text.split())

        inputs = self.tokenizer.encode_plus(text, None, add_special_tokens=True, max_length=self.max_len, pad_to_max_length=True, return_token_type_ids=True)
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids=inputs['token_type_ids']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }


In [None]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
# EPOCHS = 1
LEARNING_RATE = 1e-04
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)


In [None]:
labeled_tweets = LABELED.rename({'tweet_text': 'text'}, axis=1)
labeled_tweets['label'] = labeled_tweets['label']-1
labeled_tweets.head(5)

In [None]:
train_size = 0.8
train_data=labeled_tweets.sample(frac=train_size,random_state=200)
test_data=labeled_tweets.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

print("FULL Dataset: {}".format(labeled_tweets.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = TweetSentiment(train_data, tokenizer, MAX_LEN)
testing_set = TweetSentiment(test_data, tokenizer, MAX_LEN)

In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier0 = torch.nn.Linear(768, 768)
        self.dropout0 = torch.nn.Dropout(0.3)
        self.pre_classifier1 = torch.nn.Linear(768, 384)
        self.dropout1 = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(384, 3)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier0(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout0(pooler)
        pooler = self.pre_classifier1(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout1(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
model = RobertaClass()
params = model.state_dict()
params.keys()
for name, param in model.named_parameters():
    if param.requires_grad and 'l1' in name:
        param.requires_grad = False


In [None]:
model.to(device)

In [None]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params = filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE)

In [None]:
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [None]:
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [None]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu

In [None]:
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

In [None]:
EPOCHS = 20
for epoch in range(EPOCHS):
    train(epoch)
    acc = valid(model, testing_loader)
    print("Accuracy on test data = %0.2f%%" % acc)

In [242]:
TWEETS.head(5)
TWEETS = TWEETS.drop(columns=['user_name', 'user_location', 'user_description', 'user_created', 'user_friends', 'user_favourites', 'user_verified', 'source', 'retweets', 'favorites', 'is_retweet'])
TWEETS.head(5)


Unnamed: 0,id,user_followers,date,text,hashtags
0,1340539111971516416,405,2020-12-20 06:06:44,same folks said daikon paste could treat a cyt...,['PfizerBioNTech']
1,1338158543359250433,834,2020-12-13 16:27:13,while the world has been on the wrong side of ...,
2,1337858199140118533,10,2020-12-12 20:33:45,# coronavirus # sputnikv # astrazeneca # pfize...,"['coronavirus', 'SputnikV', 'AstraZeneca', 'Pf..."
3,1337855739918835717,49165,2020-12-12 20:23:59,"facts are immutable , senator , even when you ...",
4,1337854064604966912,152,2020-12-12 20:17:19,explain to me again why we need a vaccine @ bo...,"['whereareallthesickpeople', 'PfizerBioNTech']"


In [243]:
TWEETS.to_csv('all_tweets_cleaned.csv')

In [None]:
predicting_tweets = TWEETS
predicting_tweets.head(5)
predicting_set = TweetSentiment(predicting_tweets, tokenizer, MAX_LEN)

In [None]:
text = "Good night"
encoded_input = tokenizer(text, return_tensors='pt')
model.to('cpu')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)

# text = "Good night 😊"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)

print(scores)
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    s = scores[ranking[i]]
    print(f"{i+1}) {np.round(float(s), 4)}")

In [None]:
predicting_loader = DataLoader(predicting_set, **test_params)

In [None]:
LABELED.to_csv('annotated_clean.csv')