# Helper Functions

In [None]:
!pip install tweepy
!pip install TextBlob
!pip install emoji

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import json
import emoji as em
from textblob import TextBlob

In [None]:
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
 
import twitter_credentials
 
# # # # TWITTER STREAMER # # # #
class TwitterStreamer():
    """
    Class for streaming and processing live tweets.
    """
    def __init__(self):
        pass

    def stream_tweets(self, fetched_tweets_filename, hash_tag_list):
        # This handles Twitter authetification and the connection to Twitter Streaming API
        listener = StdOutListener(fetched_tweets_filename)
        auth = OAuthHandler(twitter_credentials.CONSUMER_KEY, twitter_credentials.CONSUMER_SECRET)
        auth.set_access_token(twitter_credentials.ACCESS_TOKEN, twitter_credentials.ACCESS_TOKEN_SECRET)
        stream = Stream(auth, listener)

        # This line filter Twitter Streams to capture data by the keywords: 
        stream.filter(track = hash_tag_list)


# # # # TWITTER STREAM LISTENER # # # #
class StdOutListener(StreamListener):
    """
    This is a basic listener that just prints received tweets to stdout.
    """
    def __init__(self, fetched_tweets_filename):
        self.fetched_tweets_filename = fetched_tweets_filename

    def on_data(self, data):
        try:
            #print(data)
            print('Running...')
            with open(self.fetched_tweets_filename, 'a') as tf:
                tf.write(data)
            return True
        except BaseException as e:
            print("Error on_data %s" % str(e))
        return True
          

    def on_error(self, status):
        if status == 420:
            # Returning False on_data method in case rate limit occurs.
            return False
        print(status)

In [2]:
#Reading data from json file
def saveToDataFrame(localfile):
    tweets_tmp = []
    thread_all = []

    with open(localfile) as jsonfile:
        for i, line in enumerate(jsonfile):
            thread = json.loads(line)
            thread_all.append(thread)

            if all (k in thread for k in ('id','created_at', 'extended_tweet', 'entities','lang')):
                if len(thread['extended_tweet']['entities']['hashtags']) == 0:
                    tweets_tmp.append((thread['id'], 
                                       thread['created_at'], 
                                       thread['extended_tweet']['full_text'],
                                       len(thread['extended_tweet']['full_text']), 
                                       '0', 
                                       thread['lang']
                                      ))

                else:
                    tweets_tmp.append((thread['id'], 
                                       thread['created_at'], 
                                       thread['extended_tweet']['full_text'],
                                       len(thread['extended_tweet']['full_text']),
                                       [tag['text'] for tag in thread['extended_tweet']['entities']['hashtags']],
                                       thread['lang']
                                      ))


            elif all (k in thread for k in ('id','created_at', 'text', 'entities')):
                if len(thread['entities']['hashtags']) == 0:
                    tweets_tmp.append((thread['id'], 
                                       thread['created_at'], 
                                       thread['text'],
                                       len(thread['text']), 
                                       '0',
                                       thread['lang']
                                      ))

                else:
                    tweets_tmp.append((thread['id'], 
                                       thread['created_at'], 
                                       thread['text'], 
                                       len(thread['text']), 
                                       [tag['text'] for tag in thread['entities']['hashtags']],
                                       thread['lang']
                                      ))

    #print(len(tweets_tmp))

    # Create the tweets data frame.  
    labels = ['id', 'created_at', 'tweets', 'tweets_len', 'hashtags','language']
    tweets_frame = pd.DataFrame(tweets_tmp, columns=labels)

    tweets_frame = tweets_frame[tweets_frame.language == 'en'] #filter english
    
    tweets_frame = tweets_frame.drop_duplicates(subset='tweets', keep='first', inplace=False) #drop duplicate
    
    tweets_frame.reset_index(drop=True, inplace=True)

    print("Number of Tweets got: ", len(tweets_frame))

    #tweets_frame.head()
    
    
    return tweets_frame, thread_all

In [3]:
#Tokenization, normalization, remove stop words, stemming, lemmatization
#Modify from code in Text as Data course

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import nltk
from nltk.stem.porter import *

nlp = spacy.load('en_core_web_sm', disable=['ner'])
nlp.remove_pipe('tagger')
nlp.remove_pipe('parser')

#@Tokenize
def spacy_tokenize(string):
    tokens = []
    doc = nlp(string)
    for token in doc:
        tokens.append(token)
    return tokens

#@Normalize
def normalize(tokens):
    normalized_tokens = []
    for token in tokens:
        normalized = token.text.lower().strip()
        if ((token.is_alpha or token.is_digit)):
            normalized_tokens.append(normalized)
    return normalized_tokens

def spellCorrection(normalized_tokens):
    corrected = TextBlob(' '.join(normalized_tokens))
    
    return str(corrected.correct()).split()

#Remove Stop Words
def removeStopWord(normalized_tokens):
    filtered_sentence =[]
    for word in normalized_tokens:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            filtered_sentence.append(word)   
    return filtered_sentence

#Stemming using PorterStemmer
def stemming(filtered_sentence):
    stemmed_sentence = []
    stemmer = PorterStemmer()
    for word in filtered_sentence:
        token = stemmer.stem(word)
        stemmed_sentence.append(token)
    return ' '.join(stemmed_sentence)

#lemming
def lemming(filtered_sentence):
    lemmed_sentence = []
    tokens = nlp(' '.join(filtered_sentence))
    for token in tokens:
        lemmed_sentence.append(token.lemma_)
    return ' '.join(lemmed_sentence)

#@Tokenize, normalize, and lemming
def tokenize_normalize_lem(string):
    return lemming(normalize(spacy_tokenize(string)))

# Crawling excitement, happy, pleasant, surprise, fear, angry data

## Do not run crawl codes in this section.

In [None]:
# all hashtag list
hash_tag_list = ['#excitement', '#exciting', '#excited', '#thrilled', '#amazing',
                 '#Happy', '#happiness', '#joy', '#love', '#cheerful', '#delighted',  '#laughing',
                 '#pleasant', '#glad', '#satisfied', '#appreciated', '#appreciate',
                 '#shock', '#sad', '#frustration', '#frustrated',
                 '#fear', '#disgust', '#depressed', '#depression', '#afraid', '#scary', '#awful',
                 '#angry', '#pissed', '#furious', '#outraged', '#indignant', 
                 em.emojize(':relaxed:', use_aliases=True),
                 em.emojize(':pensive:', use_aliases=True),
                 em.emojize(':cry:', use_aliases=True),
                 em.emojize(':sob:', use_aliases=True),
                 em.emojize(':fearful:', use_aliases=True),
                 em.emojize(':anger:', use_aliases=True),
                 em.emojize(':angry:', use_aliases=True)]

twitter_streamer = TwitterStreamer()
#twitter_streamer.stream_tweets('tweets_all.json', hash_tag_list)
twitter_streamer.stream_tweets('sample_tweets.json', hash_tag_list)


In [None]:
# Crawling excitement, done
hash_tag_list = ['#excitement', '#exciting', '#excited', '#thrilled', '#amazing']

twitter_streamer = TwitterStreamer()
twitter_streamer.stream_tweets('tweets_excitement.json', hash_tag_list)

In [None]:
# Crawling happy, done, 51 mins
hash_tag_list = ['#Happy', '#happiness', '#joy', '#love', '#cheerful', '#delighted',  '#laughing']

twitter_streamer = TwitterStreamer()
twitter_streamer.stream_tweets('tweets_happy.json', hash_tag_list)

In [None]:
# Crawling pleasant, done
hash_tag_list = ['#pleasant', '#glad', '#satisfied', '#appreciated', '#appreciate', 
                 em.emojize(':relaxed:', use_aliases=True)]

twitter_streamer = TwitterStreamer()
twitter_streamer.stream_tweets('tweets_pleasant.json', hash_tag_list)

In [None]:
# Crawling surprise, done
hash_tag_list = ['#shock', '#sad', '#frustration', '#frustrated', 
                 em.emojize(':pensive:', use_aliases=True),
                 em.emojize(':cry:', use_aliases=True),
                 em.emojize(':sob:', use_aliases=True)]

twitter_streamer = TwitterStreamer()
twitter_streamer.stream_tweets('tweets_surprise.json', hash_tag_list)

In [None]:
# Crawling fear, done
hash_tag_list = ['#fear', '#disgust', '#depressed', '#depression', '#afraid', '#scary', '#awful', 
                 em.emojize(':fearful:', use_aliases=True)]

twitter_streamer = TwitterStreamer()
twitter_streamer.stream_tweets('tweets_fear.json', hash_tag_list)

In [None]:
# Crawling Angry, done
hash_tag_list = ['#angry', '#pissed', '#furious', '#outraged', '#indignant', 
                 em.emojize(':anger:', use_aliases=True),
                 em.emojize(':angry:', use_aliases=True)]

twitter_streamer = TwitterStreamer()
twitter_streamer.stream_tweets('tweets_angry.json', hash_tag_list)

# NLP Processing

In [4]:
#Read
#tweets_all, _ = saveToDataFrame('tweets_all.json')
tweets_all, _ = saveToDataFrame('sample_tweets.json')

Number of Tweets got:  70941


In [5]:
#Extract emoji
#Citation: https://stackoverflow.com/questions/43146528/how-to-extract-all-the-emojis-from-text

def extract_emojis(str):
    return ''.join(c for c in str if c in em.UNICODE_EMOJI)

tweets_all['emoji'] = np.array([extract_emojis(tweet) for tweet in tweets_all['tweets']])

In [6]:
#Clean tweets
#Citation

def clean_tweet(tweet):
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

tweets_all['clean_tweents'] = np.array([clean_tweet(tweet) for tweet in tweets_all['tweets']])

In [7]:
#Sentiment analysis to tell the text is possitive, neutral, or negative.
#Citation: https://github.com/vprusso/youtube_tutorials/blob/master/twitter_python/part_5_sentiment_analysis_tweet_data/sentiment_anaylsis_twitter_data.py

def analyze_sentiment(tweet):
        analysis = TextBlob(tweet)
        
        if analysis.sentiment.polarity > 0:
            return 1
        elif analysis.sentiment.polarity == 0:
            return 0
        else:
            return -1
        
tweets_all['sentiment'] = np.array([analyze_sentiment(tweet) for tweet in tweets_all['clean_tweents']])


In [8]:
#Apply tokenization, normalization, spelling correction, and lemmatization
tweets_all['clean_tweets_lemmed'] = np.array([tokenize_normalize_lem(tweet) for tweet in tweets_all['tweets']])


In [9]:
#Drop duplicate the third time
#Drop duplicate the second time
tweets_all.drop_duplicates(subset='tweets', keep='first', inplace=True)
tweets_all.drop_duplicates(subset='clean_tweets_lemmed', keep='first', inplace=True)
tweets_all.drop_duplicates(subset='clean_tweents', keep='first', inplace=True)
tweets_all.reset_index(drop=True, inplace=True)

In [10]:
#Drop more duplicates, expecially those with same text but different web link:
len(tweets_all)

67610

In [11]:
#Normalized the hashtags
def lowertag(tags):
    lowered_tags = []
    if tags == '0':
        lowered_tags.append(tags)
    else:
        for i in range(len(tags)):
            tag = str(tags[i])
            lowered = tag.lower().strip()
            lowered_tags.append(lowered)
    return lowered_tags

tweets_all['cleaned_tags'] = np.array([lowertag(tag) for tag in tweets_all.hashtags.values])

In [12]:
#hashtag counts
def countHashtags(hashlist):
    counts = []
    for tags in tweets_all.cleaned_tags:
        count = 0
        for i in tags:
            if i in hashlist:
                count += 1
        counts.append(count)
    
    return np.array(counts)

#emoji counts
def countEmojis(emojilist):
    counts = []
    for emojis in tweets_all.emoji:
        count = 0
        for i in emojis:
            if i in emojilist:
                count += 1
        counts.append(count)
    
    return np.array(counts)

In [13]:
tweets_all['excitement'] = countHashtags(['excitement', 'exciting', 'excited', 
                                                   'thrilled', 'amazing']) + countEmojis([
    em.emojize(':laughing:', use_aliases=True),
    em.emojize(':grimacing:', use_aliases=True),
    em.emojize(':grinning:', use_aliases=True)])

tweets_all['happy'] = countHashtags(['happy', 'happiness', 'joy', 'love', 
                                              'cheerful', 'delighted',  'laughing']) + countEmojis([
    em.emojize(':kissing_closed_eyes:', use_aliases=True),
    em.emojize(':joy:', use_aliases=True)])

tweets_all['pleasant'] = countHashtags(['pleasant', 'glad', 'satisfied', 
                                                 'appreciated', 'appreciate']) + countEmojis([
    em.emojize(':relaxed:', use_aliases=True),
    em.emojize(':satisfied:', use_aliases=True),
    em.emojize(':simple_smile:', use_aliases=True)])

tweets_all['surprise'] = countHashtags(['shock', 'sad', 'frustration', 'frustrated']) + countEmojis([
    em.emojize(':pensive:', use_aliases=True), 
    em.emojize(':cry:', use_aliases=True),
    em.emojize(':sob:', use_aliases=True)])

tweets_all['fear'] = countHashtags(['fear', 'disgust', 'depressed', 'depression', 
                                    'afraid', 'scary', 'awful']) + countEmojis([
    em.emojize(':fearful:', use_aliases=True),
    em.emojize(':scream:', use_aliases=True),
    em.emojize(':sleepy:', use_aliases=True)])

tweets_all['angry'] = countHashtags(['angry', 'pissed', 'furious', 'outraged', 'indignant']) + countEmojis([
    em.emojize(':anger:', use_aliases=True), 
    em.emojize(':angry:', use_aliases=True),
    em.emojize(':rage:', use_aliases=True)])

In [14]:
#Labeling
#according to score
labeldata = tweets_all[['excitement', 'happy','pleasant', 'surprise', 'fear', 'angry']]
newlabels = labeldata.idxmax(axis = 1)
#deal with 0, if 0, keepe original label
for i in range(len(tweets_all)):
    if labeldata.sum(axis = 1)[i] == 0:
        newlabels[i] = '0'

tweets_all['label'] = newlabels

In [15]:
tweets_all = tweets_all[~(tweets_all['label'] == '0')]

In [16]:
#Keep those longger than 40 characters
tweets_all = tweets_all[tweets_all.tweets_len > 40]

## Processing angry label

In [17]:
angry = tweets_all[tweets_all.label == 'angry']
angry.reset_index(drop=True, inplace=True)

In [18]:
#angry should be negative emotion, filter using sentiment <= 0
angry = angry[angry.sentiment <= 0]
angry.reset_index(drop=True, inplace=True)

In [19]:
len(angry)

617

## Processing fear

In [20]:
fear = tweets_all[tweets_all.label == 'fear']
fear.reset_index(drop=True, inplace=True)

In [21]:
#fear should be negative emotion, filter using sentiment <= 0
fear = fear[fear.sentiment <= 0]
fear.reset_index(drop=True, inplace=True)

In [22]:
len(fear)

685

## Processing surprise

In [23]:
surprise = tweets_all[tweets_all.label == 'surprise']
surprise.reset_index(drop=True, inplace=True)

In [24]:
#should be negative feelings, filter sentiment, delet sentiment>0(possitive)
surprise = surprise[surprise.sentiment <= 0]
surprise.reset_index(drop=True, inplace=True)

In [25]:
len(surprise)

15346

## Processing pleasant

In [26]:
pleasant = tweets_all[tweets_all.label == 'pleasant']
pleasant.reset_index(drop=True, inplace=True)

In [27]:
#pleasant should be possitive emotion, filter using sentiment >= 0
pleasant = pleasant[pleasant.sentiment >= 0]
pleasant.reset_index(drop=True, inplace=True)

In [28]:
len(pleasant)

265

## Processing happy

In [29]:
happy = tweets_all[tweets_all.label == 'happy']
happy.reset_index(drop=True, inplace=True)

In [30]:
#happy should be possitive, filter with sentiment > 0(excluding 0)
happy = happy[happy.sentiment > 0]
happy.reset_index(drop=True, inplace=True)

In [31]:
len(happy)

7663

## Processing excitment

In [32]:
excitement = tweets_all[tweets_all.label == 'excitement']
excitement.reset_index(drop=True, inplace=True)

In [33]:
#should be possitive, filter with sentiment > 0(excluding 0)
excitement = excitement[excitement.sentiment > 0]
excitement.reset_index(drop=True, inplace=True)

In [34]:
len(excitement)

1117

In [35]:
excitement

Unnamed: 0,id,created_at,tweets,tweets_len,hashtags,language,emoji,clean_tweents,sentiment,clean_tweets_lemmed,cleaned_tags,excitement,happy,pleasant,surprise,fear,angry,label
0,1236043548459274240,Fri Mar 06 21:38:41 +0000 2020,@buddyboi94 @deannamarsh751 @BrandonSunday23 A...,92,[Excited],en,,And a new department joins the lineup Excited,1,and a new department join the lineup excite,[excited],1,0,0,0,0,0,excitement
1,1236043554788487168,Fri Mar 06 21:38:42 +0000 2020,RT @lilyachty: Just woke up bout to dive into ...,75,"[goodday, excited]",en,,RT Just woke up bout to dive into goodday excited,1,rt just wake up bout to diva into goodday excite,"[goodday, excited]",1,0,0,0,0,0,excitement
2,1236043596480106497,Fri Mar 06 21:38:52 +0000 2020,It's : 2020-03-06T21:38:52.1734695Z \n\n#tuesd...,230,"[tuesdaymotivation, funny, pets, followme, f4f...",en,,It s 2020 03 06T21 38 52 1734695Z tuesdaymotiv...,1,it 2020 03 tuesdaymotivation funny pet followm...,"[tuesdaymotivation, funny, pets, followme, f4f...",1,1,0,0,0,0,excitement
3,1236043643363852288,Fri Mar 06 21:39:03 +0000 2020,RT @AuthorAaronHowe: REMINDER: You deserve #Ha...,72,"[Happiness, Success]",en,😀,RT REMINDER You deserve Happiness and Success,1,rt reminder you deserve happiness and success,"[happiness, success]",1,1,0,0,0,0,excitement
4,1236043662842368004,Fri Mar 06 21:39:08 +0000 2020,silverstein and four year strong tonight with ...,82,[amazing],en,,silverstein and four year strong tonight with ...,1,silverstein and four year strong tonight with ...,[amazing],1,0,0,0,0,0,excitement
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1112,1236104067501314048,Sat Mar 07 01:39:10 +0000 2020,"Well done, Mr. Bloomberg!\nWorth watching! Twi...",78,"[TrumpSUCKS, VoteBlue2020]",en,😆,Well done Mr Bloomberg Worth watching Twice Tr...,1,good do bloomberg worth watch twice trumpsucks,"[trumpsucks, voteblue2020]",1,0,1,0,0,0,excitement
1113,1236104070013710336,Sat Mar 07 01:39:10 +0000 2020,@gibsonfilms @JosephBiwald That skull pattern ...,267,0,en,😬,That skull pattern is killer on the Hunter hel...,1,that skull pattern be killer on the hunter hel...,[0],1,0,0,0,0,0,excitement
1114,1236104072421240832,Sat Mar 07 01:39:11 +0000 2020,Lol been home for a hour now &amp; all I’ve do...,88,0,en,😆,Lol been home for a hour now amp all I ve done...,1,lol be home for a hour now amp all i do be tak...,[0],1,0,1,0,0,0,excitement
1115,1236104092444962817,Sat Mar 07 01:39:15 +0000 2020,ECCC got postponed and I'm bummed but look! Th...,279,"[CaptainAmerica, SteveRogers, Funko, FunkoPop,...",en,😆🇺🇸,ECCC got postponed and I m bummed but look Thi...,1,eccc get postpone and i bum but look this be l...,"[captainamerica, steverogers, funko, funkopop,...",1,0,1,0,0,0,excitement


# Saving Data & Sampling

In [47]:
final_tweets_all = tweets_all
final_tweets_all.to_csv('final_data/final_tweets_all.csv', index = False)

In [None]:
final_excitement = excitement[['id', 'tweets', 'created_at']]
final_happy = happy[['id', 'tweets', 'created_at']]
final_pleasant = pleasant[['id', 'tweets', 'created_at']]
final_surprise = surprise[['id', 'tweets', 'created_at']]
final_fear = fear[['id', 'tweets', 'created_at']]
final_angry = angry[['id', 'tweets', 'created_at']]

In [None]:
final_excitement.to_csv('final_data/excitement.csv', index = False)
final_happy.to_csv('final_data/happy.csv', index = False)
final_pleasant.to_csv('final_data/pleasant.csv', index = False)
final_surprise.to_csv('final_data/furprise.csv', index = False)
final_fear.to_csv('final_data/fear.csv', index = False)
final_angry.to_csv('final_data/angry.csv', index = False)

In [None]:
crowdsorsing_data = pd.concat([excitement[['tweets', 'label']].sample(n=30), 
                               happy[['tweets', 'label']].sample(n=30), 
                               pleasant[['tweets', 'label']].sample(n=30), 
                               surprise[['tweets', 'label']].sample(n=30), 
                               fear[['tweets', 'label']].sample(n=30), 
                               angry[['tweets', 'label']].sample(n=30)], ignore_index=True)
crowdsorsing_data.to_csv('final_data/crowdsorsing_data.csv', index = False)

# Crowdsoursing Result

In [36]:
# Crowdsoursing result
cs_result = pd.read_csv('crowdsoursing result aggregated.csv')

In [37]:
cs_result

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,which_category_best_fits_this_text_,which_category_best_fits_this_text_:confidence,label,tweets,which_category_best_fits_this_text__gold
0,2684818958,False,finalized,3,3/13/2020 16:59:27,happy_love_joy,0.6667,excitement,RT @zwkishi: Haven't draw yuzu that much but I...,
1,2684818959,False,finalized,3,3/13/2020 16:56:18,excitement,0.6667,excitement,RT @epollsurveys: Happy Friday everyone! 😀\n#M...,
2,2684818960,False,finalized,5,3/13/2020 16:33:52,happy_love_joy,0.6000,excitement,@FedExHelp Hi Lauryn! Luckily with some flatte...,
3,2684818961,False,finalized,4,3/13/2020 16:28:23,surprise_sad_frustration,0.2500,excitement,@meakoopa My brother made me watch the entire ...,
4,2684818962,False,finalized,3,3/13/2020 16:28:39,surprise_sad_frustration,0.3333,excitement,@BeatsNdBang Undefeated! I’m tryna work with m...,
...,...,...,...,...,...,...,...,...,...,...
214,2686752167,True,golden,1,,angry,1.0000,angry,@tsukasaslilfang only maane will get mad 😠,angry
215,2686758825,True,golden,1,,surprise_sad_frustration,1.0000,surprise,@DannyPaps94 @MeechIsDEAD No I can’t take the ...,surprise_sad_frustration
216,2686760556,True,golden,1,,happy_love_joy,1.0000,happy,Animals #m#: #Animals#: #Penguins #in #Love #|...,happy_love_joy
217,2686761386,True,golden,1,,surprise_sad_frustration,1.0000,surprise,why it always da single moms getting spicy 😢,fear_disgust_depression


In [38]:
cs_sorted = cs_result[['tweets', 'which_category_best_fits_this_text_', 'label']]

In [39]:
cs_sorted[cs_sorted.which_category_best_fits_this_text_ == 'happy_love_joy'].which_category_best_fits_this_text_ = 'happy'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [40]:
cs_label = []
for label in cs_sorted.which_category_best_fits_this_text_.values:
    if label == 'happy_love_joy':
        cs_label.append('happy') 
    elif label == 'surprise_sad_frustration':
        cs_label.append('surprise')
    elif label == 'fear_disgust_depression':
        cs_label.append('fear')
    else:
        cs_label.append(label)
    

In [41]:
cs_sorted['cs_label'] = np.array(cs_label)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [42]:
print('Accuracy is: ', sum(cs_sorted.label == cs_sorted.cs_label)/len(cs_sorted))

Accuracy is:  0.4931506849315068


In [43]:
#Evaluation
#Code from Lab4 Text as Data
#Train and Evaluate Classifiers
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score

def evaluation_summary(description, predictions, true_labels):
    print("Evaluation for: " + description)
    precision = precision_score(true_labels, predictions, average='macro')
    recall = recall_score(true_labels, predictions, average='macro')
    accuracy = accuracy_score(true_labels, predictions)
    f1 = fbeta_score(true_labels, predictions, 1, average='macro') #1 means f_1 measure
    print("Classifier '%s' has Acc=%0.3f P=%0.3f R=%0.3f F1=%0.3f" % (description,accuracy,precision,recall,f1))
    # Specify three digits instead of the default two.
    print(classification_report(true_labels, predictions, digits=3))
    print('\nConfusion matrix:\n',confusion_matrix(true_labels, predictions)) # Note the order here is true, predicted, odd.
    
    return accuracy, precision, recall, f1



In [44]:
(accuracy, 
 precision, 
 recall, f1) = evaluation_summary("Crowdsoursing Result", 
                                  cs_sorted.label, 
                                  cs_sorted.cs_label)

Evaluation for: Crowdsoursing Result
Classifier 'Crowdsoursing Result' has Acc=0.493 P=0.487 R=0.523 F1=0.452
              precision    recall  f1-score   support

       angry      0.833     0.909     0.870        33
  excitement      0.205     0.500     0.291        16
        fear      0.242     0.667     0.356        12
       happy      0.775     0.383     0.512        81
    pleasant      0.086     0.250     0.128        12
    surprise      0.778     0.431     0.554        65

    accuracy                          0.493       219
   macro avg      0.487     0.523     0.452       219
weighted avg      0.676     0.493     0.533       219


Confusion matrix:
 [[30  2  1  0  0  0]
 [ 0  8  2  2  4  0]
 [ 0  0  8  0  0  4]
 [ 1 18  2 31 26  3]
 [ 0  2  2  4  3  1]
 [ 5  9 18  3  2 28]]


In [45]:
#category accuracy
#Accuracy of each class
#Get the confusion matrix
cm = confusion_matrix(cs_sorted.cs_label, cs_sorted.label)

#Now the normalize the diagonal entries
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

#The diagonal entries are the accuracies of each class
cm.diagonal()

array([0.90909091, 0.5       , 0.66666667, 0.38271605, 0.25      ,
       0.43076923])

# Modeling

## Transform into TFIDF matrix

In [48]:
#Text used here is clean tweets after lemmed
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(final_tweets_all.clean_tweets_lemmed)

In [49]:
X.shape

(44132, 31671)

In [50]:
print(X)

  (0, 9580)	0.3110453537225645
  (0, 29144)	0.17430504145521622
  (0, 12253)	0.1885678357004289
  (0, 1628)	0.292145807122746
  (0, 25204)	0.3578637878051896
  (0, 23821)	0.3578637878051896
  (0, 27407)	0.07527768987593568
  (0, 2034)	0.12857150606289064
  (0, 11895)	0.18978773758013692
  (0, 11642)	0.1782031791803431
  (0, 27588)	0.18442379861656227
  (0, 10287)	0.146983526989967
  (0, 31140)	0.22529821072501083
  (0, 7598)	0.29552586184894863
  (0, 21300)	0.34715327469268836
  (0, 25078)	0.22712418845821622
  (0, 20189)	0.1794375224634039
  (0, 23520)	0.08857598825641594
  (1, 19492)	0.21480289229445185
  (1, 23205)	0.25619013103408045
  (1, 2037)	0.41082751466390827
  (1, 8903)	0.39570669800239056
  (1, 19883)	0.1701498861248553
  (1, 21263)	0.2667202338450074
  (1, 21771)	0.4083759478931913
  :	:
  (44128, 17412)	0.1334005041656718
  (44128, 27853)	0.1017247432955413
  (44128, 4309)	0.14846495698106463
  (44128, 27407)	0.0989728339899038
  (44128, 27588)	0.12123742130202816
  (4412