In [None]:
import pandas as pd
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import spacy 
from datetime import datetime
import re
import string
import unicodedata
import random
from gensim import corpora

### Sentiment analysis
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

### Load Twitter data

In [None]:
# Load the pickled tweets into a DataFrame.
# NOTE(security): unpickling can execute arbitrary code -- only load pickle files you created or trust.
df = pd.read_pickle("../tutorial_3_twitter_networks_and_visualization/Raw_Tweets.pkl") # Load your tweets
# Preview the first rows.
df.head()

### Preprocessing Twitter Data

In [None]:
# remove links (URLs), mentions, hashtags, and special characters
# choose the functions that suit your needs

# Compiled once (raw string, so the regex escapes are not treated as string
# escapes).  Matches http/https URLs whose scheme separator is written with
# slashes or backslashes.  Note: `\+-=` inside the character class is a range
# from '+' to '=', so characters such as ',' and '<' are consumed as well.
_LINK_REGEX = re.compile(r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)', re.DOTALL)


def strip_links(text):
    """Replace every http/https URL in ``text`` with a single space.

    Each match is substituted where it occurs, so a short URL that is a
    prefix of a later, longer URL cannot leave part of the longer URL behind.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with each URL replaced by ``' '``.
    """
    return _LINK_REGEX.sub(' ', text)

# Characters that introduce Twitter entities.  Both are left untouched while the
# rest of the punctuation is blanked, so strip_mentions and strip_hashtags can be
# chained in either order without one erasing the marker the other looks for.
_ENTITY_MARKERS = '@#'
_PUNCTUATION_TO_SPACE = str.maketrans(
    {char: ' ' for char in string.punctuation if char not in _ENTITY_MARKERS}
)


def _strip_entities(text, prefix):
    """Blank non-entity punctuation, then drop every word that starts with ``prefix``."""
    words = text.translate(_PUNCTUATION_TO_SPACE).split()
    return ' '.join(word for word in words if not word.startswith(prefix))


def strip_mentions(text):
    """Remove ``@mention`` words from ``text``.

    Punctuation other than '@' and '#' is replaced by spaces, the text is
    split on whitespace, and every word beginning with '@' is dropped.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        Remaining words joined by single spaces ('#' markers are preserved).
    """
    return _strip_entities(text, '@')


def strip_hashtags(text):
    """Remove ``#hashtag`` words from ``text``.

    Punctuation other than '@' and '#' is replaced by spaces, the text is
    split on whitespace, and every word beginning with '#' is dropped.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        Remaining words joined by single spaces ('@' markers are preserved).
    """
    return _strip_entities(text, '#')
        
def remove_special_characters(text, remove_digits=False):
    """Fold accented characters to ASCII and delete everything that is not a letter, digit or whitespace.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_digits : bool, default False
        When True, digits are deleted as well.

    Returns
    -------
    str
        The cleaned text.
    """
    # Normalise first so that accented letters become their base letter
    # (NFKD splits off the accent, the ASCII round-trip drops it) instead of
    # being deleted outright by the regex below.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # 'A-Z' (not 'A-z'): the wider range also covers the punctuation between 'Z' and 'a'.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [None]:
# generate a random tweet:
# pick one index label at random and print that row's text
# (no random seed is set here, so a different tweet may be chosen on each run)
i = random.choice(df.index)
tweet = df.loc[i, 'text']
print(tweet)

In [None]:
# Apply each cleaning step to the sample tweet, in order, then print the result.
tweet = strip_links(tweet)
tweet = strip_mentions(tweet)
tweet = strip_hashtags(tweet)
tweet = remove_special_characters(tweet)
print(tweet)

In [None]:
def process_tweets(tweet):
    """Clean a single tweet by chaining the preprocessing helpers.

    The helpers run in this order: strip_links, strip_mentions,
    strip_hashtags, remove_special_characters.  Each one receives the output
    of the previous step.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        The text after all four steps.
    """
    cleaning_steps = (strip_links, strip_mentions, strip_hashtags, remove_special_characters)
    for step in cleaning_steps:
        tweet = step(tweet)
    return tweet

In [None]:
# Clean every tweet. This overwrites the 'text' column, so the uncleaned text
# is no longer held in df after this cell runs.
df.loc[:, 'text'] = df.text.apply(process_tweets)
# Display the cleaned column.
df.text

### Sentiment Analysis
#### TextBlob

In [None]:
# Wrap the cleaned sample tweet in a TextBlob and display its sentiment.
blob = TextBlob(tweet)
blob.sentiment

TextBlob sentiment output:

Polarity in [-1, 1] := [most negative, most positive]

Subjectivity in [0, 1] := [factual, personal opinion]

In [None]:
# Tokens TextBlob produced for the sample tweet.
blob.tokens

In [None]:
# Part-of-speech tags TextBlob assigned to the sample tweet.
blob.tags

In [None]:
# Noun phrases TextBlob extracted from the sample tweet.
blob.noun_phrases

In [None]:
# Example 1: a negated positive phrase ("not the best").
test_msg1 = 'this is not the best football team'

In [None]:
# Score the first example with TextBlob's default analyzer.
# (Uses test_msg1: the name `test_msg` is never defined and raised NameError.)
blob = TextBlob(test_msg1)
blob.sentiment

In [None]:
# Example 2: a negated negative phrase ("not too bad").
test_msg2 = 'hey this is not too bad'

In [None]:
# Score the second example with TextBlob's default analyzer.
# (Uses test_msg2: the name `test_msg` is never defined and raised NameError.)
blob = TextBlob(test_msg2)
blob.sentiment

In [None]:
# NaiveBayesAnalyzer option, trained on movie reviews
from textblob.sentiments import NaiveBayesAnalyzer

# Build the analyzer once and share it between blobs: every NaiveBayesAnalyzer
# instance trains its own classifier on first use, so creating a new instance
# per blob repeats that training.
nb_analyzer = NaiveBayesAnalyzer()

for msg in (test_msg1, test_msg2):
    print(msg)
    blob = TextBlob(msg, analyzer=nb_analyzer)
    print(blob.sentiment)

In [None]:
# Score the cleaned sample tweet with the Naive Bayes analyzer.
blob = TextBlob(tweet, analyzer=NaiveBayesAnalyzer())
blob.sentiment

#### VADER (Valence Aware Dictionary and Sentiment Reasoner)

In [None]:
# Create the VADER sentiment analyser used by the following cells.
analyser = SentimentIntensityAnalyzer()

In [None]:
# Print the documentation of polarity_scores.
help(analyser.polarity_scores)

In [None]:
# Score a simple example sentence with VADER and print the returned scores.
sentiment = analyser.polarity_scores('This is an example of a happy tweet')
print(sentiment)

In [None]:
# Score both negation examples with VADER, printing each message followed by
# its scores; `sentiment` ends up holding the scores of the last message.
for message in (test_msg1, test_msg2):
    print(message)
    sentiment = analyser.polarity_scores(message)
    print(sentiment)

#### Extension

Twitter roBERTa

https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment

Multilingual Twitter roBERTa: 8 languages (Ar, En, Fr, De, Hi, It, Sp, Pt)

https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment


### Stemming/Lemmatization

In [None]:
# Stemming / Lemmatization

### loading a spacy language model
# The model has to be installed once, from a shell:
#   python -m spacy download en_core_web_sm
# https://spacy.io/models/en

nlp = spacy.load('en_core_web_sm') 

def simple_stemmer(text):
    """Return ``text`` with every whitespace-separated word replaced by its Porter stem.

    Parameters
    ----------
    text : str
        Text to stem.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    # Use the public nltk.stem entry point rather than the `nltk.porter`
    # attribute, which is not a documented part of the nltk namespace.
    stemmer = nltk.stem.PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in text.split())

def lemmatize_text(text):
    """Return ``text`` with each token replaced by its lemma.

    The text is parsed with the module-level ``nlp`` pipeline.  A token whose
    lemma is the placeholder '-PRON-' keeps its original text instead.

    Parameters
    ----------
    text : str
        Text to lemmatize.

    Returns
    -------
    str
        The lemmas joined by single spaces.
    """
    doc = nlp(text)
    lemmas = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [None]:
# Show the sample tweet next to its stemmed and lemmatized versions.
print(tweet)
print('------\nstemmed tweet:')
print(simple_stemmer(tweet))
print('------\nlemmatized tweet:')
print(lemmatize_text(tweet))

### Tokenizing and Corpus Creation

In [None]:
### Run this the first time
# Fetches the NLTK 'stopwords' resource that nltk.corpus.stopwords reads from.
nltk.download('stopwords')

In [None]:
# Tokenizer and English stop-word list used by remove_stopwords.
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

def remove_stopwords(text, is_lower_case=False):
    """Return ``text`` without the tokens found in ``stopword_list``.

    The text is split with the module-level ``tokenizer`` and each token is
    stripped of surrounding whitespace before the lookup.

    Parameters
    ----------
    text : str
        Text to filter.
    is_lower_case : bool, default False
        When True the tokens are compared as they are; otherwise each token is
        lower-cased for the comparison only (the kept token keeps its case).

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    kept_tokens = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        lookup_key = token if is_lower_case else token.lower()
        if lookup_key not in stopword_list:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
# Create a corpus of tokenized documents from the first 100 tweets.
# The slice starts at row 0 so that the first tweet is included.
words_corpus = []
for elem in df.text.iloc[:100]:
    # remove stop words
    elem = remove_stopwords(elem)
    # lemmatize text
    elem = lemmatize_text(elem)
    # lower-case and split on whitespace: one list of tokens per tweet
    words_corpus.append(elem.lower().split())
print(len(words_corpus))

# Build the gensim token <-> id dictionary from the tokenized tweets.
dictionary = corpora.Dictionary(words_corpus)
print(len(dictionary))

In [None]:
# Number of documents and total number of token positions recorded by the dictionary.
dictionary.num_docs, dictionary.num_pos

In [None]:
# get token-id mapping (this displays the whole mapping, which can be long)
dictionary.token2id

In [None]:
# The 10 most frequent tokens in the corpus.
dictionary.most_common(10)

In [None]:
# get bag-of-words representation of the first document: list of (token_id, token_count) tuples
dictionary.doc2bow(words_corpus[0])