# Preparation

## Imports

In [106]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings; warnings.simplefilter('ignore')
import re


# NLTK
import nltk
import codecs
from nltk.tokenize import PunktSentenceTokenizer,sent_tokenize, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
# nltk.download("stopwords")
# nltk.download("punkt")
# nltk.download("wordnet")

#Spacy


## Loading extra data

In [2]:
raw_data_merge = pd.read_csv('../data/merge_bez2016.csv', sep="\t", names=["label", "text"])
emoticons_file = pd.read_csv('../data/emoticons.txt', sep="  ->  ", names=["emoji", "meaning"])

In [3]:
def remove_duplicates(csv_data):
    clean_data = dict()
    for i in tqdm((range(len(csv_data)))):
        if csv_data.text[i] not in clean_data and isinstance(csv_data.text[i], str):
            clean_data.update({csv_data.text[i]: csv_data.label[i]})
    return clean_data

data = remove_duplicates(raw_data_merge)

tweets = list()
polarities = list()

for text in data.keys():
    tweets.append(text)
    
for polarity in data.values():
    polarities.append(polarity)

100%|██████████| 40256/40256 [00:02<00:00, 18268.52it/s]


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import queue
vectorizer = CountVectorizer()
vocab_pos = queue.PriorityQueue()
vocab_net = queue.PriorityQueue()
vocab_neg = queue.PriorityQueue()
data, data_pos, data_net, data_neg = tweets, list(), list(), list()
for i in range(len(tweets)):
    if polarities[i] == "positive":
        data_pos.append(tweets[i])
    elif polarities[i] == "neutral":
        data_net.append(tweets[i])
    else:
        data_neg.append(tweets[i])
        
        
transformed_data = vectorizer.fit_transform(data)
        
transformed_data_pos = vectorizer.fit_transform(data_pos)
transformed_data_net = vectorizer.fit_transform(data_net)
transformed_data_neg = vectorizer.fit_transform(data_neg)

for word, count in zip(vectorizer.get_feature_names(), np.ravel(transformed_data.sum(axis=0))): 
    vocab.put((int(count), word)) 

for word, count in zip(vectorizer.get_feature_names(), np.ravel(transformed_data_pos.sum(axis=0))): 
    vocab_pos.put((int(count), word)) 
for word, count in zip(vectorizer.get_feature_names(), np.ravel(transformed_data_net.sum(axis=0))): 
    vocab_net.put((int(count), word))
for word, count in zip(vectorizer.get_feature_names(), np.ravel(transformed_data_neg.sum(axis=0))): 
    vocab_neg.put((int(count), word)) 


# Special functions

## Emoticons

In [4]:
EMOTICONS = dict()
for i in range(len(emoticons_file)):
    e = emoticons_file.emoji[i]
    m = emoticons_file.meaning[i]
    EMOTICONS.update({e:m})

In [5]:
def emoticons_distribution(tweets, polarities):
    distribution = dict()
    count = 0
    [distribution.update({emot: [0,0,0,0]}) for emot in EMOTICONS.keys()]
    for i in tqdm(range(len(tweets))):
        tweets[i] = remove_links(tweets[i])
        tweets[i] = repaire_chars(tweets[i])
        tweets[i] = remove_usernames(tweets[i])
        flag = 0
        for emot in EMOTICONS:
            if tweets[i].find(emot) != -1:
                flag = 1
                distribution[emot][3] += 1
#                 print(emot + "   ->   "  + dataset[i].text[0])
                if polarities[i] == "positive":
                    distribution[emot][0] += 1
                elif polarities[i] == "neutral":
                    distribution[emot][1] += 1
                else:
                    distribution[emot][2] += 1

        if flag != 1:
            count += 1
    print("Sentences without emoticons: " + str(count * 100 / len(tweets)) + "%")
    return distribution
    

In [6]:
def find_useful_emoticons(tweets, polarities):
    distribution = emoticons_distribution(tweets, polarities)
    emoticons_score = dict()
    good_scores = dict()
    for k,v in distribution.items():
        if v[3] == 0:
            score = 0
        else:
            score = (v[0] - v[2]) / v[3]
        emoticons_score.update({k:score})

    for k,v in emoticons_score.items():
        if v < -0.1 or v > 0.1:
            good_scores.update({k:v})

    return good_scores

In [23]:
useful_emoticons = find_useful_emoticons(tweets, polarities)

100%|██████████| 39608/39608 [00:01<00:00, 30624.33it/s]

Sentences without emoticons: 94.39507170268632%





# Hooks

## Pretokenization

### Removing links

In [7]:
def remove_links(raw):
    return re.sub(r'http.*\b', '[URL]', raw)

### Repairing some lost characters

In [19]:
def repaire_chars(raw):
    raw = re.sub(r'\\u2019', "\'", raw)
    raw = re.sub(r'\\u002c', ',', raw)
    raw = re.sub(r'&lt', '>', raw)
    raw = re.sub(r'&gt', '<', raw)        
    raw = re.sub(r'&amp;', '&', raw)
    raw = re.sub(r'\\\"\"', '', raw)
    raw = re.sub(r'\"\"\\', '', raw)
    return raw

### Removing usernames

In [9]:
def remove_usernames(raw):
    return re.sub(r'@[^\s]*', '[USER]', raw)

### Replacing unuseful emoticons

In [10]:
def replace_unuseful_emoticons(raw):
    for k,v in EMOTICONS.items():
        if k in useful_emoticons.keys() and k in raw:
            raw.replace(k,v)
    return raw

### Annotation and normalization (from github)

In [11]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons


text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[EMOTICONS, emoticons]
)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [12]:
def annotation_normalization(raw):
    return " ".join(text_processor.pre_process_doc(raw))

### Removing 

### Spell check (from github)

In [101]:
from ekphrasis.classes.spellcorrect import SpellCorrector
from nltk.corpus import words
sp = SpellCorrector(corpus="english") 
nltk.download("words")
words = set(words.words())
def spell_check(raw):
    correct_raw = list()
    for word in raw.split(" "):
        if(word in words):
            correct_raw.append(word)
        else:
            correct_raw.append(sp.correct(word))
    return " ".join(correct_raw)

Reading english - 1grams ...


[nltk_data] Downloading package words to /home/ivan/nltk_data...
[nltk_data]   Package words is already up-to-date!


## Posttokenization

### Removing stopwords

In [14]:
def remove_stopwords(raw, tokenized):
    stop_words_set = set(stopwords.words('english'))
    tokens = []
    for token in tokenized:
        token_lower = token if token.islower() else token.lower()
        if token_lower not in stop_words_set:
            tokens.append(token)
    return (raw, tokens)

# Data loading

## Loading dataset

In [125]:
from podium.datasets import Iterator
from podium.storage import Vocab, Field, LabelField, MultioutputField
from podium.storage.vectorizers.tfidf import CountVectorizer
from podium.datasets import TabularDataset
import functools
import spacy

nlp = spacy.load('en_core_web_lg', disable=["parser", "ner"])

def extract_text_hook(raw, tokenized):
    return raw, [token.text for token in tokenized]

def extract_pos_hook(raw, tokenized):
    return raw, [token.pos_ for token in tokenized]

def extract_vec_hook(raw, tokenized):
    return raw, [token.vector_norm for token in tokenized]


text = Field(name='text', vocab=Vocab(), store_as_raw=True)
text.add_posttokenize_hook(extract_text_hook)

pos = Field(name='pos', vocab=Vocab())
pos.add_posttokenize_hook(extract_pos_hook)

vec = Field(name='vec')
vec.add_posttokenize_hook(extract_vec_hook)

text = MultioutputField([text, pos, vec], tokenizer=nlp)

label = LabelField(name='label')
fields = {'text': text, 'label':label}



In [126]:
dataset = TabularDataset('../data/merge_bez2016.csv', format='tsv', fields=fields)

### Pretokenization hooks

In [21]:
text.add_pretokenize_hook(repaire_chars)
text.add_pretokenize_hook(annotation_normalization)
text.add_pretokenize_hook(replace_unuseful_emoticons)

### Posttokenization hooks

In [244]:
text.add_posttokenize_hook(remove_stopwords)

In [127]:
print(dataset[10])

Example[label: ('positive', None); pos: (None, ['PUNCT', 'VERB', 'ADJ', 'PRON', 'CCONJ', 'DET', 'NOUN', 'VERB', 'NOUN', 'NUM', 'SYM', 'NUM', 'ADP', 'SYM', 'PROPN', 'PUNCT', 'ADP', 'NOUN', 'NUM', 'NOUN', 'PROPN', 'NUM', 'PUNCT', 'SPACE', 'PROPN', 'NUM', 'CCONJ', 'NUM', 'AUX', 'DET', 'ADJ', 'PUNCT']); text: ('@hughhefner Make sure you & the girls watch seasons 1-5 of #Dexter, before season 6 premieres Oct 2.  Seasons 1 & 4 are the best!', ['@hughhefner', 'Make', 'sure', 'you', '&', 'the', 'girls', 'watch', 'seasons', '1', '-', '5', 'of', '#', 'Dexter', ',', 'before', 'season', '6', 'premieres', 'Oct', '2', '.', ' ', 'Seasons', '1', '&', '4', 'are', 'the', 'best', '!']); vec: (None, [0.0, 5.0838113, 4.9412875, 5.1979666, 5.9343824, 4.70935, 6.974511, 6.519909, 7.012366, 5.269974, 5.6033444, 5.069743, 4.97793, 6.7399955, 6.29456, 5.094723, 5.2121177, 7.2339334, 5.067633, 6.335264, 5.5206485, 5.163114, 4.9316354, 0.0, 7.012366, 5.269974, 5.9343824, 5.05721, 5.41568, 4.70935, 5.2471824, 5.62