## Imports

In [1]:
import pandas as pd
import json
import re
import numpy as np
import nltk
import itertools
from sklearn.metrics import precision_recall_curve, auc,confusion_matrix
from sklearn.metrics import classification_report, f1_score, fbeta_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump, load

In [2]:
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))

In [3]:
import Stemmer
english_stemmer = Stemmer.Stemmer('en')

## Tweet Data

In [4]:
sam_tweets = pd.read_csv("dataset/SamsungTweetsSent.csv")
sam_tweets["text"] = sam_tweets.full_text
sam_tweets.drop("full_text", axis = 1, inplace = True)
sam_tweets.head()

Unnamed: 0,lang,user_screen_name,sent,text
0,en,mrsshinde,o,RT @mrsshinde: @SamsungMobile @Moto @oneplus @...
1,en,olutobi_og,o,@SamsungMobile kindly include play next in the...
2,en,themobileindian,o,@SamsungMobile has started rolling out the And...
3,en,sobakhani,o,@SamsungMobile how to find lost Samsung note 1...
4,en,Imchetan_p,n,@SamsungMobile @SamsungIndia @Samsung I must s...


In [5]:
ces_tweets = pd.read_csv("dataset/ces2020_tweets_full_text.csv")
ces_tweets.head()

Unnamed: 0,lang,text,username
0,en,@Razer debuted an incredibly compact all-in-on...,techthelead
1,en,"In keynote address, @Delta Unveils New #OOH Pa...",YourOAAA
2,en,👍 We are ready for Day 2 at #CES2020. Discover...,Sio_db
3,en,GO-&gt; #CES2020 #France #USA !\n#BusinessFran...,dillardmarg
4,en,The industry's smallest and lightest 4K60P pro...,HoldanBlog


## Cleaning and Preprocessing Tweets

Dictionaries and preprocessing functions from previous notebook:

In [6]:
emoticon_repl = {
    # positive emoticons
    r":-?d+": " good ",  r":[- ]?\)+": " good ", r";-?\)+": " good ",
    r"\(+-?:": " good ", r"=\)+" : " good ", r"<3" : " good ",
    # negative emoticons
    r"[\s\r\t\n]+:/+": " bad ", r":\\+": " bad ", r"[\s\r\t\n]+\)-?:": " bad ",
    r":-?\(+": " bad ", r"[\s\t\r\n]+d+-?:": " bad "
}

contracted_repl = {
    # casi particolari
    r"won\'t" : "will not", r"won\'" : "will not", r"can\'t": "can not", r"shan\'t": "shall not",
    r"shan\'": "shall not", r"ain\'t": "is not", r"ain\'": "is not",
    # casi generali
    r"n\'t": " not", r"\'t": " not", r"n\'": " not", r"\'s": " is", r"\'ve": " have", 
    r"\'re": " are", 
    r"\'ll": " will", r"\'d": " would",
}

with open('slang_subset_manual.json', 'r') as fid:
    slang_repl = json.load(fid)
    
def preprocess(sent, translate_slang = True):
    
    sent = sent.lower()
    sent = re.sub(r'^<div id="video.*>&nbsp;', '', sent) # Video-review part
    sent = re.sub('https?://[A-Za-z0-9./]+', '', sent) # URLs
    
    for k in emoticon_repl:
        sent = re.sub(k, emoticon_repl[k], sent)

    if translate_slang:
        for k in slang_repl:
            sent = re.sub(r"\b"+re.escape(k)+r"\b", slang_repl[k], sent)
        
    for k in contracted_repl:
        sent = re.sub(k, contracted_repl[k], sent)
    
    sent = re.sub('[/]+', ' ', sent) # word1/word2 to word1 word2
    sent = re.sub('[^A-Za-z0-9-_ ]+', '', sent)
    sent = re.sub('\b\d+\b', '', sent)
    
    return sent

Shold we remove Hashtags?

In [7]:
def preprocess_tweets(tweets_df):
    tweets_df["textPreprocessed"] = tweets_df.text
    tweets_df["textPreprocessed"] =  tweets_df["textPreprocessed"].str.replace("@\w+", "") # remove AT's
    tweets_df["textPreprocessed"] = tweets_df["textPreprocessed"].str.replace("^(RT)+", "") # Remove RT at beginning of retweets
    
    # Add stuff probably
    
    tweets_df["textPreprocessed"] = tweets_df["textPreprocessed"].apply(preprocess)

In [8]:
preprocess_tweets(ces_tweets)
ces_tweets.head()

Unnamed: 0,lang,text,username,textPreprocessed
0,en,@Razer debuted an incredibly compact all-in-on...,techthelead,debuted an incredibly compact all-in-one syst...
1,en,"In keynote address, @Delta Unveils New #OOH Pa...",YourOAAA,in keynote address unveils new ooh parallel r...
2,en,👍 We are ready for Day 2 at #CES2020. Discover...,Sio_db,we are ready for day 2 at ces2020 discover ou...
3,en,GO-&gt; #CES2020 #France #USA !\n#BusinessFran...,dillardmarg,go-gt ces2020 france usa businessfrance cce in...
4,en,The industry's smallest and lightest 4K60P pro...,HoldanBlog,the industry is smallest and lightest 4k60p pr...


In [9]:
preprocess_tweets(sam_tweets)
sam_tweets.head()

Unnamed: 0,lang,user_screen_name,sent,text,textPreprocessed
0,en,mrsshinde,o,RT @mrsshinde: @SamsungMobile @Moto @oneplus @...,we must work to save safeguard human...
1,en,olutobi_og,o,@SamsungMobile kindly include play next in the...,kindly include play next in the next samsungm...
2,en,themobileindian,o,@SamsungMobile has started rolling out the And...,has started rolling out the android 10 update...
3,en,sobakhani,o,@SamsungMobile how to find lost Samsung note 1...,how to find lost samsung note 10 plus in paki...
4,en,Imchetan_p,n,@SamsungMobile @SamsungIndia @Samsung I must s...,i must say that your sales services really ...


## Sentiwordnet Sentiment Classifier

In [10]:
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import sentiwordnet as swn
nltk.download('universal_tagset')
nltk.download('sentiwordnet')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\gianc\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\gianc\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

The function returns, for a list of tweets:

- Their tokens and their tags
- Their positivity score
- Their negativity score
- Their sentiment score (-1 for negative, 0 for neutral, 1 for positive)

TODO: add reference to article.

In [11]:
def pos_senti(X):
    lem = WordNetLemmatizer()
    pstem = PorterStemmer()
    X_tagged = []
    li_swn=[]
    li_swn_pos=[]
    li_swn_neg=[]
    missing_words=[]
    for i in range(len(X)):
        text = X[i]
        tokens = nltk.word_tokenize(text)
        tagged_sent = nltk.tag.pos_tag(tokens)
        store_it = [(word, nltk.tag.map_tag('en-ptb', 'universal', tag)) for word, tag in tagged_sent]
        X_tagged.append(store_it)
        #print("Tagged Parts of Speech:",store_it)

        pos_total=0
        neg_total=0
        for word,tag in store_it:
            # print(tag)
            if(tag=='NOUN'):
                tag='n'
            elif(tag=='VERB'):
                tag='v'
            elif(tag=='ADJ'):
                tag='a'
            elif(tag=='ADV'):
                tag = 'r'
            else:
                tag='nothing'

                
            if(tag!='nothing'):
                concat = word+'.'+tag+'.01'
                try:
                    this_word_pos=swn.senti_synset(concat).pos_score()
                    this_word_neg=swn.senti_synset(concat).neg_score()
                    # print(word,tag,':',this_word_pos,this_word_neg)
                except Exception as e:
                    wor = lem.lemmatize(word)
                    concat = wor+'.'+tag+'.01'
                    # Checking if there's a possiblity of lemmatized word be accepted into SWN corpus
                    try:
                        this_word_pos=swn.senti_synset(concat).pos_score()
                        this_word_neg=swn.senti_synset(concat).neg_score()
                    except Exception as e:
                        wor = pstem.stem(word)
                        concat = wor+'.'+tag+'.01'
                        # Checking if there's a possiblity of lemmatized word be accepted
                        try:
                            this_word_pos=swn.senti_synset(concat).pos_score()
                            this_word_neg=swn.senti_synset(concat).neg_score()
                        except:
                            missing_words.append(word)
                            continue
                pos_total+=this_word_pos
                neg_total+=this_word_neg
        li_swn_pos.append(pos_total)
        li_swn_neg.append(neg_total)

        if(pos_total!=0 or neg_total!=0):
            if(pos_total>neg_total):
                li_swn.append(1)
            else:
                li_swn.append(-1)
        else:
            li_swn.append(0)
    # df_copy.insert(5,"pos_score",li_swn_pos,True)
    # df_copy.insert(6,"neg_score",li_swn_neg,True)
    # df_copy.insert(7,"sent_score",li_swn,True)
    return X_tagged, li_swn_pos, li_swn_neg, li_swn
    # end-of pos-tagging&sentiment

In [12]:
def scores(predictions, y_test):

    prec = precision_score(y_test, predictions) # Precision
    rec = recall_score(y_test, predictions) # Recall
    f1 = f1_score(y_test, predictions) # F1
    f2 = fbeta_score(y_test, predictions, 2) # F2
    cm = confusion_matrix(y_test, predictions)
    
    scores_strings = ["Test Precision",
                      "Test Recall", "F1", "F2"]
    
    scores = [prec, rec, f1, f2]
    
    print(("{:20s} {:.5f}\n"*4)[:-1].format(*itertools.chain(*zip(scores_strings, scores))))
    
    print(classification_report(y_test, predictions))


## CES Tweets sentiment

In [13]:
ces_X = np.array(ces_tweets.text.values)

In [14]:
ces_X_prep = np.array(ces_tweets.textPreprocessed.values)

In [15]:
ces_X[2]

'👍 We are ready for Day 2 at #CES2020. Discover our innovative Database Siodb 👨\u200d💻. Nicolas Penot and Grégory Steulet will be pleased to show you it ensures data security and privacy. 👉https://t.co/dYRiXeHjyV\n#SwissPavilion #SwissTech https://t.co/IBIqH8Qei9'

In [16]:
ces_X_prep[2]

' we are ready for day 2 at ces2020 discover our innovative database siodb  nicolas penot and grgory steulet will be pleased to show you it ensures data security and privacy swisspavilion swisstech '

In [17]:
len(ces_X_prep)

2000

In [18]:
ces_X_tagged, ces_pos_score_swn, ces_neg_score_swn, ces_sent_score_swn = pos_senti(ces_X_prep)

SentiWordNet classifies 463 tweets as negative, 319 as neutral and 1218 as positive:

In [19]:
np.unique(ces_sent_score_swn, return_counts = True)

(array([-1,  0,  1]), array([ 463,  319, 1218], dtype=int64))

Our clf:

In [20]:
vectorizer = load('tfidf_vect_pystemmer.joblib')

In [21]:
clf = load('clf_nb_pystemmer.joblib')

Let's take only the tweets labelled negative/positive by SentiWordNet:

In [22]:
neg_pos_ind = np.array(ces_sent_score_swn) != 0

In [23]:
neg_pos_ces_X = ces_X_prep[neg_pos_ind]

In [24]:
len(neg_pos_ces_X)

1681

In [25]:
neg_pos_ces_score = np.array(ces_sent_score_swn)[neg_pos_ind] == 1

In [26]:
ces_X_vect = vectorizer.transform(neg_pos_ces_X)

In [27]:
ces_X_vect

<1681x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 18265 stored elements in Compressed Sparse Row format>

In [28]:
predictions = clf.predict(ces_X_vect)

In [29]:
np.unique(predictions, return_counts = True)

(array([False,  True]), array([729, 952], dtype=int64))

In [31]:
cm = confusion_matrix(neg_pos_ces_score, predictions)
cm

array([[263, 200],
       [466, 752]], dtype=int64)

Assuming the binary scores from SentiWordNet are reliable, score our classifier (trained on the balanced dataset):

In [32]:
scores(predictions, neg_pos_ces_score)

Test Precision       0.78992
Test Recall          0.61741
F1                   0.69309
F2                   0.64560
              precision    recall  f1-score   support

       False       0.36      0.57      0.44       463
        True       0.79      0.62      0.69      1218

    accuracy                           0.60      1681
   macro avg       0.58      0.59      0.57      1681
weighted avg       0.67      0.60      0.62      1681



## Samsung Tweets Sentiment

Remove NA's from samsung tweets (one sentiment is missing, TODO relabel)

In [33]:
sam_tweets = sam_tweets[sam_tweets.sent.notna()]

We take only tweets labelled as positive or negative:

In [34]:
sam_tweets_sentiment = sam_tweets[sam_tweets.sent != "o"]

In [35]:
sam_X = np.array(sam_tweets_sentiment.text)

Manual labels: 139 negative, 19 positive

In [36]:
sam_manual_y = np.array(sam_tweets_sentiment.sent)== "p"
np.unique(sam_manual_y, return_counts=True)

(array([False,  True]), array([139,  18], dtype=int64))

In [37]:
sam_X_prep = np.array(sam_tweets_sentiment.textPreprocessed)

In [38]:
sam_X[0]

'@SamsungMobile @SamsungIndia @Samsung I must say that your sales services really suck. I have been trying to reach out to your customer care via your samsung shop tollfree/email &amp; Live chat but none worked. Wasted an hour and really disappointed to place my trust in Samsung again'

In [39]:
sam_X_prep[0]

'   i must say that your sales services really suck i have been trying to reach out to your customer care via your samsung shop tollfree email amp live chat but none worked wasted an hour and really disappointed to place my trust in samsung again'

In [40]:
len(sam_X_prep)

157

In [41]:
sam_X_tagged, sam_pos_score_swn, sam_neg_score_swn, sam_sent_score_swn = pos_senti(sam_X_prep)

SentiWordNet classifies 98 tweets as negative, 6 as neutral and 53 as positive:

In [42]:
np.unique(sam_sent_score_swn, return_counts = True)

(array([-1,  0,  1]), array([98,  6, 53], dtype=int64))

Consider negative and neutral tweets as negative:

In [43]:
sam_sent_score_swn_bin = np.array(sam_sent_score_swn) == 1

In [44]:
from sklearn.metrics import confusion_matrix

In [45]:
np.unique(sam_manual_y, return_counts = 1)

(array([False,  True]), array([139,  18], dtype=int64))

In [46]:
np.unique(sam_sent_score_swn_bin, return_counts = 1)

(array([False,  True]), array([104,  53], dtype=int64))

In [47]:
cm = confusion_matrix(sam_manual_y, sam_sent_score_swn_bin)
cm

array([[98, 41],
       [ 6, 12]], dtype=int64)

Using our classifier:

In [48]:
vectorizer = load('tfidf_vect_pystemmer.joblib')

In [49]:
clf = load('clf_nb_pystemmer.joblib')

In [50]:
sam_X_vect = vectorizer.transform(sam_X_prep)

In [51]:
sam_X_vect

<157x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 2606 stored elements in Compressed Sparse Row format>

In [52]:
predictions = clf.predict(sam_X_vect)

In [53]:
np.unique(predictions, return_counts = True)

(array([False,  True]), array([136,  21], dtype=int64))

Our classifier against SentiWordNet

In [54]:
cm = confusion_matrix(sam_sent_score_swn_bin, predictions)
cm

array([[98,  6],
       [38, 15]], dtype=int64)

In [55]:
scores(predictions, sam_sent_score_swn_bin)

Test Precision       0.71429
Test Recall          0.28302
F1                   0.40541
F2                   0.32189
              precision    recall  f1-score   support

       False       0.72      0.94      0.82       104
        True       0.71      0.28      0.41        53

    accuracy                           0.72       157
   macro avg       0.72      0.61      0.61       157
weighted avg       0.72      0.72      0.68       157



Our classifier against the manual labels:

In [56]:
cm = confusion_matrix(sam_manual_y, predictions)
cm

array([[132,   7],
       [  4,  14]], dtype=int64)

In [57]:
scores(predictions, sam_manual_y)

Test Precision       0.66667
Test Recall          0.77778
F1                   0.71795
F2                   0.75269
              precision    recall  f1-score   support

       False       0.97      0.95      0.96       139
        True       0.67      0.78      0.72        18

    accuracy                           0.93       157
   macro avg       0.82      0.86      0.84       157
weighted avg       0.94      0.93      0.93       157

