## Imports

In [1]:
import pandas as pd
import json
import re
import numpy as np
import nltk
import itertools
from sklearn.metrics import precision_recall_curve, auc, confusion_matrix
from sklearn.metrics import classification_report, f1_score, fbeta_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump, load

In [2]:
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))

In [3]:
import Stemmer
english_stemmer = Stemmer.Stemmer('en')

## Tweet Data

In [4]:
SANTweets = pd.read_csv("dataset/SandersAnalyticsTweets.csv")
SANTweets["text"] = SANTweets["TweetText"]
SANTweets = SANTweets[["Topic", "Sentiment", "text"]]
SANTweets.head()

Unnamed: 0,Topic,Sentiment,text
0,apple,positive,Now all @Apple has to do is get swype on the i...
1,apple,positive,@Apple will be adding more carrier support to ...
2,apple,positive,Hilarious @youtube video - guy does a duet wit...
3,apple,positive,@RIM you made it too easy for me to switch to ...
4,apple,positive,I just realized that the reason I got into twi...


In [5]:
SANTweets.shape

(5113, 3)

In [6]:
SANTweets.groupby("Sentiment").size()

Sentiment
irrelevant    1689
negative       572
neutral       2333
positive       519
dtype: int64

In [7]:
SANTweets.groupby("Topic").size()

Topic
apple        1142
google       1317
microsoft    1364
twitter      1290
dtype: int64

In [8]:
def translate_sent(s):
    if s == "irrelevant" or s == "neutral":
        s = 0
    if s == "positive":
        s = 1
    if s == "negative":
        s = -1
    return s

In [9]:
SANTweets["sent"] = SANTweets["Sentiment"].apply(translate_sent)

In [10]:
SANTweets.head()

Unnamed: 0,Topic,Sentiment,text,sent
0,apple,positive,Now all @Apple has to do is get swype on the i...,1
1,apple,positive,@Apple will be adding more carrier support to ...,1
2,apple,positive,Hilarious @youtube video - guy does a duet wit...,1
3,apple,positive,@RIM you made it too easy for me to switch to ...,1
4,apple,positive,I just realized that the reason I got into twi...,1


In [11]:
SMTweets = pd.read_csv("dataset/SamsungTweetsSent.csv")
SMTweets.head()

Unnamed: 0,lang,user_screen_name,text,sent
0,en,mrsshinde,RT @mrsshinde: @SamsungMobile @Moto @oneplus @...,0
1,en,olutobi_og,@SamsungMobile kindly include play next in the...,0
2,en,themobileindian,@SamsungMobile has started rolling out the And...,0
3,en,sobakhani,@SamsungMobile how to find lost Samsung note 1...,0
4,en,Imchetan_p,@SamsungMobile @SamsungIndia @Samsung I must s...,-1


In [12]:
SMTweets.groupby("sent").size()

sent
-1    132
 0    138
 1     19
dtype: int64

In [13]:
CESTweets = pd.read_csv("dataset/ces2020_tweets_full_text.csv")
CESTweets.head()

Unnamed: 0,lang,text,username
0,en,@Razer debuted an incredibly compact all-in-on...,techthelead
1,en,"In keynote address, @Delta Unveils New #OOH Pa...",YourOAAA
2,en,👍 We are ready for Day 2 at #CES2020. Discover...,Sio_db
3,en,GO-&gt; #CES2020 #France #USA !\n#BusinessFran...,dillardmarg
4,en,The industry's smallest and lightest 4K60P pro...,HoldanBlog


## Cleaning and Preprocessing Tweets

Dictionaries and preprocessing function from previous notebook:

In [14]:
emoticon_repl = {
    # positive emoticons
    r":-?d+": " good ", r":[- ]?\)+": " good ", r";-?\)+": " good ",
    r"\(+-?:": " good ", r"=\)+" : " good ", r"\b<3\b" : " good ",    
    # negative emoticons
    r"[\s\r\t\n]+:/+": " bad ", r":\\+": " bad ", r"[\s\r\t\n]+\)-?:": " bad ",
    r":-?\(+": " bad ", r"[\s\t\r\n]+d+-?:": " bad "
}

contracted_repl = {
    # casi particolari
    r"won\'t" : "will not", r"won\'" : "will not", r"can\'t": "can not", r"shan\'t": "shall not",
    r"shan\'": "shall not", r"ain\'t": "is not", r"ain\'": "is not",
    # casi generali
    r"n\'t": " not", r"\'t": " not", r"n\'": " not", r"\'s": " is", r"\'ve": " have", 
    r"\'re": " are", 
    r"\'ll": " will", r"\'d": " would",
}

with open('dataset/slang_subset_manual.json', 'r') as fid:
    slang_repl = json.load(fid)
    
def preprocess(sent, translate_slang = True):
    
    sent = sent.lower()
    sent = re.sub(r'^<div id="video.*>&nbsp;', '', sent) # Video-review part
    sent = re.sub('https?://[A-Za-z0-9./]+', '', sent) # URLs
    
    for k in emoticon_repl:
        sent = re.sub(k, emoticon_repl[k], sent)

    if translate_slang:
        for k in slang_repl:
            sent = re.sub(r"\b"+k+r"\b", slang_repl[k], sent)
        
    for k in contracted_repl:
        sent = re.sub(k, contracted_repl[k], sent)
    
    sent = re.sub('[/]+', ' ', sent) # word1/word2 to word1 word2
    sent = re.sub('[^A-Za-z0-9-_ ]+', '', sent)
    sent = re.sub('\b\d+\b', '', sent)
    
    return sent

Function for preprocessing tweets:

In [15]:
def preprocess_tweets(tweets_df):
    from spellchecker import SpellChecker
    spell = SpellChecker(distance=1)
    
    tweets_df["textPreprocessed"] = tweets_df.text
    tweets_df["textPreprocessed"] =  tweets_df["textPreprocessed"].str.replace("@\w+", "") # remove AT's
    tweets_df["textPreprocessed"] = tweets_df["textPreprocessed"].str.replace("^(RT)+", "") # Remove RT at beginning of retweets
    
    # Add stuff probably
    
    tweets_df["textPreprocessed"] = tweets_df["textPreprocessed"].apply(preprocess)
    tweets_df["textPreprocessed"] = tweets_df["textPreprocessed"].apply(
        lambda x : " ".join([spell.correction(el) for el in x.split()]))

Applying this function to the datasets:

In [16]:
preprocess_tweets(CESTweets)
CESTweets.head()

Unnamed: 0,lang,text,username,textPreprocessed
0,en,@Razer debuted an incredibly compact all-in-on...,techthelead,debuted an incredibly compact all-in-one syste...
1,en,"In keynote address, @Delta Unveils New #OOH Pa...",YourOAAA,in keynote address unveils new ooh parallel re...
2,en,👍 We are ready for Day 2 at #CES2020. Discover...,Sio_db,we are ready for day 2 at ces2020 discover our...
3,en,GO-&gt; #CES2020 #France #USA !\n#BusinessFran...,dillardmarg,go-go ces2020 france usa businessfrance ice in...
4,en,The industry's smallest and lightest 4K60P pro...,HoldanBlog,the industry is smallest and lightest 4k60p pr...


In [17]:
preprocess_tweets(SMTweets)
SMTweets.head()

Unnamed: 0,lang,user_screen_name,text,sent,textPreprocessed
0,en,mrsshinde,RT @mrsshinde: @SamsungMobile @Moto @oneplus @...,0,we must work to save safeguard humans from mob...
1,en,olutobi_og,@SamsungMobile kindly include play next in the...,0,kindly include play next in the next samsungmu...
2,en,themobileindian,@SamsungMobile has started rolling out the And...,0,has started rolling out the android 10 update ...
3,en,sobakhani,@SamsungMobile how to find lost Samsung note 1...,0,how to find lost samsung note 10 plus in pakistan
4,en,Imchetan_p,@SamsungMobile @SamsungIndia @Samsung I must s...,-1,i must say that your sales services really suc...


In [18]:
preprocess_tweets(SANTweets)
SANTweets.head()

Unnamed: 0,Topic,Sentiment,text,sent,textPreprocessed
0,apple,positive,Now all @Apple has to do is get swype on the i...,1,now all has to do is get swipe on the phone an...
1,apple,positive,@Apple will be adding more carrier support to ...,1,will be adding more carrier support to the pho...
2,apple,positive,Hilarious @youtube video - guy does a duet wit...,1,hilarious video - guy does a duet with is sir ...
3,apple,positive,@RIM you made it too easy for me to switch to ...,1,you made it too easy for me to switch to phone...
4,apple,positive,I just realized that the reason I got into twi...,1,i just realized that the reason i got into twi...


## Sentiwordnet Sentiment Classifier

In [19]:
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import sentiwordnet as swn
nltk.download('universal_tagset')
nltk.download('sentiwordnet')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\gianc\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\gianc\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

The SWNClassifier takes in input the pre-processed tweets and  works as follows:


- Each tweet is split in tokens using nltk's tokenizer;
- A *Part of Speech* tag is assigned to each token with nltk's `pos_tag` function;
- Each tag is translated to a SentiWordNet tag;
- Each token/tag pair is assigned the positivity and negativity score defined by SentiWordNet;
- The positivity and negativity score of a tweet is computed as the sum of positivity and negativity scores of the constituting tokens;
- If both positivity and negativity scores are 0, the tweet is labelled as neutral. If the positivity score is greater than the negativity score, the tweet is labelled as positive, and negative otherwise.

The function returns, for a list of tweets:

- Their tokens and their tags
- Their positivity score
- Their negativity score
- Their sentiment score (-1 for negative, 0 for neutral, 1 for positive)

In [20]:
def SWNClassifier(X):
    # Adapted from https://towardsdatascience.com/sentiment-analysis-on-swachh-bharat-using-twitter-216369cfa534
    lem = WordNetLemmatizer()
    pstem = PorterStemmer()
    X_tagged = []
    li_swn=[]
    li_swn_pos=[]
    li_swn_neg=[]
    missing_words=[]
    for i in range(len(X)):
        text = X[i]
        tokens = nltk.word_tokenize(text)
        tagged_sent = nltk.tag.pos_tag(tokens)
        store_it = [(word, nltk.tag.map_tag('en-ptb', 'universal', tag)) for word, tag in tagged_sent]
        X_tagged.append(store_it)
        #print("Tagged Parts of Speech:",store_it)

        pos_total=0
        neg_total=0
        for word,tag in store_it:
            # print(tag)
            if(tag=='NOUN'):
                tag='n'
            elif(tag=='VERB'):
                tag='v'
            elif(tag=='ADJ'):
                tag='a'
            elif(tag=='ADV'):
                tag = 'r'
            else:
                tag='nothing'

                
            if(tag!='nothing'):
                concat = word+'.'+tag+'.01'
                try:
                    this_word_pos=swn.senti_synset(concat).pos_score()
                    this_word_neg=swn.senti_synset(concat).neg_score()
                    # print(word,tag,':',this_word_pos,this_word_neg)
                except Exception as e:
                    wor = lem.lemmatize(word)
                    concat = wor+'.'+tag+'.01'
                    # Checking if there's a possiblity of lemmatized word be accepted into SWN corpus
                    try:
                        this_word_pos=swn.senti_synset(concat).pos_score()
                        this_word_neg=swn.senti_synset(concat).neg_score()
                    except Exception as e:
                        wor = pstem.stem(word)
                        concat = wor+'.'+tag+'.01'
                        # Checking if there's a possiblity of lemmatized word be accepted
                        try:
                            this_word_pos=swn.senti_synset(concat).pos_score()
                            this_word_neg=swn.senti_synset(concat).neg_score()
                        except:
                            missing_words.append(word)
                            continue
                pos_total+=this_word_pos
                neg_total+=this_word_neg
        li_swn_pos.append(pos_total)
        li_swn_neg.append(neg_total)

        if(pos_total!=0 or neg_total!=0):
            if(pos_total>neg_total):
                li_swn.append(1)
            else:
                li_swn.append(-1)
        else:
            li_swn.append(0)
            
    return X_tagged, li_swn_pos, li_swn_neg, li_swn

In [21]:
def scores(y_test, predictions):

    prec = precision_score(y_test, predictions) # Precision
    rec = recall_score(y_test, predictions) # Recall
    f1 = f1_score(y_test, predictions) # F1
    f2 = fbeta_score(y_test, predictions, 2) # F2
    cm = confusion_matrix(y_test, predictions)
    
    scores_strings = ["Test Precision",
                      "Test Recall", "F1", "F2"]
    
    scores = [prec, rec, f1, f2]
    
    print(("{:20s} {:.5f}\n"*4)[:-1].format(*itertools.chain(*zip(scores_strings, scores))))
    
    print(classification_report(y_test, predictions, digits=5))


### Evaluating the SWNClassifier

#### Evaluating the SWNClassifier on SanTweets

Let us compare the positive/negative/neutral labels assigned by the SWNClassifier with the original labels assigned to SANTweets.

In [22]:
%%time
SANTweets_tagged, SANTweets_SWN_POS, SANTweets_SWN_NEG, SANTweets_SWN_SENT = SWNClassifier(SANTweets.textPreprocessed.values)

Wall time: 8.11 s


In [23]:
SANTweets_SENT = SANTweets.sent.values

Evaluation on positive/negative/neutral labels:

In [24]:
print(classification_report(SANTweets_SENT, SANTweets_SWN_SENT, digits=5))

              precision    recall  f1-score   support

          -1    0.23529   0.54545   0.32877       572
           0    0.93148   0.37519   0.53492      4022
           1    0.16844   0.70328   0.27178       519

    accuracy                        0.42754      5113
   macro avg    0.44507   0.54131   0.37849      5113
weighted avg    0.77614   0.42754   0.48514      5113



Evaluation on Neutral/Sentiment labels:

In [25]:
scores(SANTweets_SENT==0, np.array(SANTweets_SWN_SENT)==0)

Test Precision       0.93148
Test Recall          0.37519
F1                   0.53492
F2                   0.42608
              precision    recall  f1-score   support

       False    0.28056   0.89826   0.42757      1091
        True    0.93148   0.37519   0.53492      4022

    accuracy                        0.48680      5113
   macro avg    0.60602   0.63672   0.48125      5113
weighted avg    0.79259   0.48680   0.51201      5113



#### Evaluating the SWNClassifier on SMTweets

Let us compare the positive/negative/neutral labels assigned by the SWNClassifier with the manual labels assigned to SMTweets.

In [26]:
%%time
SMTweets_tagged, SMTweets_SWN_POS, SMTweets_SWN_NEG, SMTweets_SWN_SENT = SWNClassifier(SMTweets.textPreprocessed.values)

Wall time: 499 ms


In [27]:
SMTweets_MANUAL_SENT = SMTweets.sent.values

Evaluation on positive/negative/neutral labels:

In [28]:
print(classification_report(SMTweets_MANUAL_SENT, np.array(SMTweets_SWN_SENT), digits=5))

              precision    recall  f1-score   support

          -1    0.66418   0.67424   0.66917       132
           0    0.75000   0.15217   0.25301       138
           1    0.08661   0.57895   0.15068        19

    accuracy                        0.41869       289
   macro avg    0.50026   0.46845   0.35762       289
weighted avg    0.66719   0.41869   0.43637       289



Evaluation on Neutral/Sentiment labels:

In [29]:
scores(SMTweets_MANUAL_SENT == 0, np.array(SMTweets_SWN_SENT)==0)

Test Precision       0.75000
Test Recall          0.15217
F1                   0.25301
F2                   0.18103
              precision    recall  f1-score   support

       False    0.55172   0.95364   0.69903       151
        True    0.75000   0.15217   0.25301       138

    accuracy                        0.57093       289
   macro avg    0.65086   0.55291   0.47602       289
weighted avg    0.64640   0.57093   0.48605       289



# Evaluating Multinomial Naive Bayes Classifier from previous notebook

Load joblib files:

In [30]:
import zipfile
with zipfile.ZipFile('joblib_data/tfidf_vect_nostemmer.zip', 'r') as zip_ref:
    zip_ref.extractall('joblib_data')

In [31]:
vectorizer = load('joblib_data/tfidf_vect_nostemmer.joblib')



In [32]:
clf = load('joblib_data/clf_nb_nostemmer.joblib')



## Evaluating MNB Classifier on Samsung Mobile Tweet Replies

Now we evaluate our classifier on the SMTweets. Because our classifier only outputs positive/negative, we have to filter out neutral tweets. Hence, we take into account:

- SMTweets manually labelled as positive/negative;
- SMTweets labelled as positive/negative by the SWNClassifier.

#### SMTweets manually labelled as positive or negative:

In [33]:
SMTweets_MANUAL_POS_NEG = SMTweets[SMTweets.sent != 0]

In [34]:
SMTweets_MANUAL_POS_NEG_x = SMTweets_MANUAL_POS_NEG.textPreprocessed.values
SMTweets_MANUAL_POS_NEG_y = SMTweets_MANUAL_POS_NEG.sent.values == 1

Manual labels: 139 negative, 19 positive

In [35]:
np.unique(SMTweets_MANUAL_POS_NEG_y, return_counts=True)

(array([False,  True]), array([132,  19], dtype=int64))

Applying the vectorizer:

In [36]:
SMTweets_MANUAL_POS_NEG_x_vect = vectorizer.transform(SMTweets_MANUAL_POS_NEG_x)
SMTweets_MANUAL_POS_NEG_x_vect

<151x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 2393 stored elements in Compressed Sparse Row format>

Predictions from the MNB Classifier:

In [37]:
%%time
predictions = clf.predict(SMTweets_MANUAL_POS_NEG_x_vect)

Wall time: 997 µs


In [38]:
np.unique(predictions, return_counts = True)

(array([False,  True]), array([133,  18], dtype=int64))

MNB predictions against the manual labels:

In [39]:
scores(SMTweets_MANUAL_POS_NEG_y, predictions)

Test Precision       0.83333
Test Recall          0.78947
F1                   0.81081
F2                   0.79787
              precision    recall  f1-score   support

       False    0.96992   0.97727   0.97358       132
        True    0.83333   0.78947   0.81081        19

    accuracy                        0.95364       151
   macro avg    0.90163   0.88337   0.89220       151
weighted avg    0.95274   0.95364   0.95310       151



#### SMTweets labelled as positive or negative by the SWNClassifier:

In [40]:
SMTweets_SWN_SENT = np.array(SMTweets_SWN_SENT)
SMTweets_SWN_POS_NEG_x = SMTweets.textPreprocessed.values[SMTweets_SWN_SENT != 0]
SMTweets_SWN_POS_NEG_y = SMTweets_SWN_SENT[SMTweets_SWN_SENT != 0] == 1

SWN Labels: 134 negative, 127 positive.

In [41]:
np.unique(SMTweets_SWN_POS_NEG_y, return_counts=True)

(array([False,  True]), array([134, 127], dtype=int64))

Applying the vectorizer:

In [42]:
SMTweets_SWN_POS_NEG_x_vect = vectorizer.transform(SMTweets_SWN_POS_NEG_x)
SMTweets_SWN_POS_NEG_x_vect

<261x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 3739 stored elements in Compressed Sparse Row format>

Predictions from the MNB Classifier:

In [43]:
%%time
predictions = clf.predict(SMTweets_SWN_POS_NEG_x_vect)

Wall time: 0 ns


In [44]:
np.unique(predictions, return_counts = True)

(array([False,  True]), array([209,  52], dtype=int64))

MNB predictions against the SWN labels on SMTweets:

In [45]:
scores(SMTweets_SWN_POS_NEG_y, predictions)

Test Precision       0.78846
Test Recall          0.32283
F1                   0.45810
F2                   0.36607
              precision    recall  f1-score   support

       False    0.58852   0.91791   0.71720       134
        True    0.78846   0.32283   0.45810       127

    accuracy                        0.62835       261
   macro avg    0.68849   0.62037   0.58765       261
weighted avg    0.68581   0.62835   0.59113       261



## Evaluating MNB Classifier on Sanders Analytics Tweets

Now we evaluate our classifier on the SANTweets. Because our classifier only outputs positive/negative, we have to filter out neutral tweets. Hence, we take into account:

- SANTweets originally labelled as positive/negative;
- SANTweets labelled as positive/negative by the SWNClassifier.

#### SANTweets labelled as positive or negative:

In [46]:
SANTweets_SENT = SANTweets.sent.values
SANTweets_POS_NEG_x = SANTweets.textPreprocessed.values[SANTweets_SENT != 0]
SANTweets_POS_NEG_y = SANTweets_SENT[SANTweets_SENT != 0] == 1

Original labels: 572 negative, 519 positive

In [47]:
np.unique(SANTweets_POS_NEG_y, return_counts=True)

(array([False,  True]), array([572, 519], dtype=int64))

Applying the vectorizer:

In [48]:
SANTweets_POS_NEG_x_vect = vectorizer.transform(SANTweets_POS_NEG_x)
SANTweets_POS_NEG_x_vect

<1091x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 9149 stored elements in Compressed Sparse Row format>

Predictions from the MNB Classifier:

In [49]:
%%time
predictions = clf.predict(SANTweets_POS_NEG_x_vect)

Wall time: 998 µs


In [50]:
np.unique(predictions, return_counts = True)

(array([False,  True]), array([724, 367], dtype=int64))

MNB predictions against the original SANTweets labels:

In [51]:
scores(SANTweets_POS_NEG_y, predictions)

Test Precision       0.78202
Test Recall          0.55299
F1                   0.64786
F2                   0.58739
              precision    recall  f1-score   support

       False    0.67956   0.86014   0.75926       572
        True    0.78202   0.55299   0.64786       519

    accuracy                        0.71402      1091
   macro avg    0.73079   0.70656   0.70356      1091
weighted avg    0.72830   0.71402   0.70626      1091



#### SANTweets labelled as positive or negative by the SWNClassifier:

In [52]:
SANTweets_SWN_SENT = np.array(SANTweets_SWN_SENT)
SANTweets_SWN_POS_NEG_x = SANTweets.textPreprocessed.values[SANTweets_SWN_SENT != 0]
SANTweets_SWN_POS_NEG_y = SANTweets_SWN_SENT[SANTweets_SWN_SENT != 0] == 1

SWN SANTweets Labels: 1326 negative, 2167 positive.

In [53]:
np.unique(SANTweets_SWN_POS_NEG_y, return_counts=True)

(array([False,  True]), array([1326, 2167], dtype=int64))

Applying the vectorizer:

In [54]:
SANTweets_SWN_POS_NEG_x_vect = vectorizer.transform(SANTweets_SWN_POS_NEG_x)
SANTweets_SWN_POS_NEG_x_vect

<3493x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 27226 stored elements in Compressed Sparse Row format>

Predictions from the MNB Classifier:

In [55]:
%%time
predictions = clf.predict(SANTweets_SWN_POS_NEG_x_vect)

Wall time: 978 µs


In [56]:
np.unique(predictions, return_counts = True)

(array([False,  True]), array([1977, 1516], dtype=int64))

MNB predictions against the SWN labels on SANTweets:

In [57]:
scores(SANTweets_SWN_POS_NEG_y, predictions)

Test Precision       0.73879
Test Recall          0.51684
F1                   0.60820
F2                   0.54988
              precision    recall  f1-score   support

       False    0.47041   0.70136   0.56312      1326
        True    0.73879   0.51684   0.60820      2167

    accuracy                        0.58689      3493
   macro avg    0.60460   0.60910   0.58566      3493
weighted avg    0.63691   0.58689   0.59109      3493



## Evaluating MNB Classifier on CESTweets

Because CESTweets do not have any label, we can only rely on the SWNClassifier.

In [58]:
CESTweets_tagged, CESTweets_SWN_POS, CESTweets_SWN_NEG, CESTweets_SWN_SENT = SWNClassifier(CESTweets.textPreprocessed.values)

#### CESTweets labelled as positive or negative by the SWNClassifier:

In [59]:
CESTweets_SWN_SENT = np.array(CESTweets_SWN_SENT)
CESTweets_SWN_POS_NEG_x = CESTweets.textPreprocessed.values[CESTweets_SWN_SENT != 0]
CESTweets_SWN_POS_NEG_y = CESTweets_SWN_SENT[CESTweets_SWN_SENT != 0] == 1

SWN CESTweets Labels: 465 negative, 1229 positive.

In [60]:
np.unique(CESTweets_SWN_POS_NEG_y, return_counts=True)

(array([False,  True]), array([ 465, 1229], dtype=int64))

Applying the vectorizer:

In [61]:
CESTweets_SWN_POS_NEG_x_vect = vectorizer.transform(CESTweets_SWN_POS_NEG_x)
CESTweets_SWN_POS_NEG_x_vect

<1694x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 17878 stored elements in Compressed Sparse Row format>

Predictions from the MNB Classifier:

In [62]:
%%time
predictions = clf.predict(CESTweets_SWN_POS_NEG_x_vect)

Wall time: 998 µs


In [63]:
np.unique(predictions, return_counts = True)

(array([False,  True]), array([915, 779], dtype=int64))

MNB predictions against the SWN labels on SMTweets:

In [64]:
scores(CESTweets_SWN_POS_NEG_y, predictions)

Test Precision       0.81130
Test Recall          0.51424
F1                   0.62948
F2                   0.55487
              precision    recall  f1-score   support

       False    0.34754   0.68387   0.46087       465
        True    0.81130   0.51424   0.62948      1229

    accuracy                        0.56080      1694
   macro avg    0.57942   0.59906   0.54518      1694
weighted avg    0.68400   0.56080   0.58320      1694

