In [1]:
# Hafizh Mulya H.-18117030, Muhammad Athallah Rizki Putra-18117002, & Muhammad Hanif Naufal Eka Wiratama-18117027
# Update: 20 Agustus 2021
# Tugas Mata Kuliah ET4047 Inteligensi Buatan
# Depression Detection pada tweet
# Juga tersedia di https://colab.research.google.com/drive/1c5Ix4HA1q7BjnR6hBdrHEhge0I8a1EGI?usp=sharing

Detecting depression in Tweets using TF - IDF

Installing and importing libraries

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from math import log, sqrt
import pandas as pd
import numpy as np
import re
%matplotlib inline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
url = 'https://raw.githubusercontent.com/RizkiPutra660/Depression-Tweets-Detection/main/Depression%20Sentiment%20Tweets.csv'
tweets = pd.read_csv(url, error_bad_lines=False)
tweets.drop(['Unnamed: 0'], axis = 1, inplace = True)
tweets.head()

Unnamed: 0,message,label
0,just had a real good moment. i missssssssss hi...,0
1,is reading manga http://plurk.com/p/mzp1e,0
2,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,@lapcat Need to send 'em to my accountant tomo...,0
4,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [4]:
tweets['label'].value_counts()

0    8000
1    2314
Name: label, dtype: int64

Splitting the Data in Training and Testing Sets

In [5]:
totalTweets = 8000 + 2314
trainIndex, testIndex = list(), list()
for i in range(tweets.shape[0]):
    if np.random.uniform(0, 1) < 0.98:
        trainIndex += [i]
    else:
        testIndex += [i]
trainData = tweets.iloc[trainIndex]
testData = tweets.iloc[testIndex]

In [6]:
trainData['label'].value_counts()

0    7845
1    2264
Name: label, dtype: int64

In [7]:
trainData.head()

Unnamed: 0,message,label
0,just had a real good moment. i missssssssss hi...,0
1,is reading manga http://plurk.com/p/mzp1e,0
2,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,@lapcat Need to send 'em to my accountant tomo...,0
4,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [8]:
testData['label'].value_counts()

0    155
1     50
Name: label, dtype: int64

In [9]:
testData.head()

Unnamed: 0,message,label
73,Is in Athens,0
90,"@kdurose yep, but we fought back well. think i...",0
135,"Okay, have torn apart my work. Fresh coffee &a...",0
155,@KoldCastTV and @beautyboutique Thanks for fol...,0
156,@RealWorldMom,0


Pre-processing the data for the training: Tokenization, stopwords removal, and stemming. 2-gram is used.

In [10]:
def process_message(message, lower_case = True, stem = True, stop_words = True, gram = 2):
    if lower_case:
        message = message.lower()
    words = word_tokenize(message)
    words = [w for w in words if len(w) > 2]
    if stop_words:
        sw = stopwords.words('english')
        words = [word for word in words if word not in sw]
    if gram > 1:
        w = []
        for i in range(len(words) - gram + 1):
            w += [' '.join(words[i:i + gram])]
        return w
    if stem:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]   
    return words

TF-IDF is used to reflect how important a word is to a document in a collection or corpus, in this case TF-IDF is used to reflect how important a word is to a tweet in a set of tweets.

In [11]:
class TweetClassifier(object):
    def __init__(self, trainData, method = 'tf-idf'):
        self.tweets, self.labels = trainData['message'], trainData['label']
        self.method = method

    def train(self):
        self.calc_TF_and_IDF()
        if self.method == 'tf-idf':
            self.calc_TF_IDF()
        else:
            self.calc_prob()

    def calc_prob(self):
        self.prob_depressive = dict()
        self.prob_positive = dict()
        for word in self.tf_depressive:
            self.prob_depressive[word] = (self.tf_depressive[word] + 1) / (self.depressive_words + \
                                                                len(list(self.tf_depressive.keys())))
        for word in self.tf_positive:
            self.prob_positive[word] = (self.tf_positive[word] + 1) / (self.positive_words + \
                                                                len(list(self.tf_positive.keys())))
        self.prob_depressive_tweet, self.prob_positive_tweet = self.depressive_tweets / self.total_tweets, self.positive_tweets / self.total_tweets 

    def calc_TF_and_IDF(self):
        noOfMessages = self.tweets.shape[0]
        self.depressive_tweets, self.positive_tweets = self.labels.value_counts()[1], self.labels.value_counts()[0]
        self.total_tweets = self.depressive_tweets + self.positive_tweets
        self.depressive_words = 0
        self.positive_words = 0
        self.tf_depressive = dict()
        self.tf_positive = dict()
        self.idf_depressive = dict()
        self.idf_positive = dict()
        for i in range(noOfMessages):
            message_processed = process_message(self.tweets.iloc[i])
            #print(message_processed)
            count = list() #To keep track of whether the word has occured in the message or not.
                           #For IDF
            for word in message_processed:
                if self.labels.iloc[i]:
                    self.tf_depressive[word] = self.tf_depressive.get(word, 0) + 1
                    self.depressive_words += 1
                else:
                    self.tf_positive[word] = self.tf_positive.get(word, 0) + 1
                    self.positive_words += 1
                if word not in count:
                    count += [word]
            for word in count:
                if self.labels.iloc[i]:
                    self.idf_depressive[word] = self.idf_depressive.get(word, 0) + 1
                else:
                    self.idf_positive[word] = self.idf_positive.get(word, 0) + 1


    def calc_TF_IDF(self):
        self.prob_depressive = dict()
        self.prob_positive = dict()
        self.sum_tf_idf_depressive = 0
        self.sum_tf_idf_positive = 0
        for word in self.tf_depressive:
            self.prob_depressive[word] = (self.tf_depressive[word]) * log((self.depressive_tweets + self.positive_tweets) \
                                                          / (self.idf_depressive[word] + self.idf_positive.get(word, 0)))
            self.sum_tf_idf_depressive += self.prob_depressive[word]
        for word in self.tf_depressive:
            self.prob_depressive[word] = (self.prob_depressive[word] + 1) / (self.sum_tf_idf_depressive + len(list(self.prob_depressive.keys())))
            
        for word in self.tf_positive:
            self.prob_positive[word] = (self.tf_positive[word]) * log((self.depressive_tweets + self.positive_tweets) \
                                                          / (self.idf_depressive.get(word, 0) + self.idf_positive[word]))
            self.sum_tf_idf_positive += self.prob_positive[word]
        for word in self.tf_positive:
            self.prob_positive[word] = (self.prob_positive[word] + 1) / (self.sum_tf_idf_positive + len(list(self.prob_positive.keys())))
            
    
        self.prob_depressive_tweet, self.prob_positive_tweet = self.depressive_tweets / self.total_tweets, self.positive_tweets / self.total_tweets

    def classify(self, processed_message):
        pDepressive, pPositive = 0, 0
        for word in processed_message:                
            if word in self.prob_depressive:
                pDepressive += log(self.prob_depressive[word])
            else:
                if self.method == 'tf-idf':
                    pDepressive -= log(self.sum_tf_idf_depressive + len(list(self.prob_depressive.keys())))
                else:
                    pDepressive -= log(self.depressive_words + len(list(self.prob_depressive.keys())))
            if word in self.prob_positive:
                pPositive += log(self.prob_positive[word])
            else:
                if self.method == 'tf-idf':
                    pPositive -= log(self.sum_tf_idf_positive + len(list(self.prob_positive.keys()))) 
                else:
                    pPositive -= log(self.positive_words + len(list(self.prob_positive.keys())))
            pDepressive += log(self.prob_depressive_tweet)
            pPositive += log(self.prob_positive_tweet)
        return pDepressive >= pPositive
    
    def predict(self, testData):
        result = dict()
        for (i, message) in enumerate(testData):
            processed_message = process_message(message)
            result[i] = int(self.classify(processed_message))
        return result 

In [12]:
def metrics(labels, predictions):
    true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0
    for i in range(len(labels)):
        true_pos += int(labels.iloc[i] == 1 and predictions[i] == 1)
        true_neg += int(labels.iloc[i] == 0 and predictions[i] == 0)
        false_pos += int(labels.iloc[i] == 0 and predictions[i] == 1)
        false_neg += int(labels.iloc[i] == 1 and predictions[i] == 0)
    print('apaan teh tadi')
    print(true_pos, true_neg, false_pos, false_neg)
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    Fscore = 2 * precision * recall / (precision + recall)
    accuracy = (true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg)

    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F-score: ", Fscore)
    print("Accuracy: ", accuracy)

In [13]:
# Misal pakai metode selain tf-idf (baru tersedia kalkulasi probabilistik biasa)
sc_tf_idf = TweetClassifier(trainData, 'else')
sc_tf_idf.train()
preds_tf_idf = sc_tf_idf.predict(testData['message'])
metrics(testData['label'], preds_tf_idf)

apaan teh tadi
10 152 3 40
Precision:  0.7692307692307693
Recall:  0.2
F-score:  0.31746031746031744
Accuracy:  0.7902439024390244


In [14]:
# Misal pakai metode tf-idf
sc_tf_idf = TweetClassifier(trainData, 'tf-idf')
sc_tf_idf.train()
preds_tf_idf = sc_tf_idf.predict(testData['message'])
metrics(testData['label'], preds_tf_idf)

apaan teh tadi
22 152 3 28
Precision:  0.88
Recall:  0.44
F-score:  0.5866666666666667
Accuracy:  0.848780487804878



Predictions with TF-IDF

Depressive Tweets

In [15]:
pm = process_message('I\'m depressed')
sc_tf_idf.classify(pm)

True

In [16]:
pm = process_message('Depression are the worst')
sc_tf_idf.classify(pm)

True

In [17]:
pm = process_message('Lately I have been feeling unsure of myself as a person & an artist')
sc_tf_idf.classify(pm)

False

Non-Depressive Tweets

In [18]:
pm = process_message('Loving how me and my lovely partner is talking about what we want.')
sc_tf_idf.classify(pm)

False

In [19]:
pm = process_message('ICT Competition takes a lot of effort')
sc_tf_idf.classify(pm)

False

In [20]:
# 2-gram words yang memiliki label depressive, sort by value
x = sc_tf_idf.tf_depressive
{k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}


{'anxiety depression': 136,
 'depression anxiety': 136,
 'depression https': 86,
 'mental health': 52,
 "depression n't": 43,
 'face emoji': 43,
 'emoji face': 40,
 'crying face': 40,
 'depression emoji': 37,
 'depression http': 36,
 'emoji loudly': 33,
 'loudly crying': 33,
 'mom depression': 33,
 'emoji heavy': 31,
 'smiling face': 31,
 'depression tied': 31,
 'tied kids': 31,
 'kids emotional': 31,
 'skin tone': 31,
 'heavy red': 30,
 'red heart': 30,
 'great depression': 30,
 'mental illness': 30,
 'emotional intellectual': 30,
 'intellectual development': 30,
 'feel like': 27,
 "n't know": 26,
 'risk depression': 26,
 'emoji smiling': 25,
 'depression nap': 25,
 'face tears': 24,
 'tears joy': 24,
 '... https': 24,
 'cured depression': 22,
 'depression real': 22,
 'ë°\x95ë´\x84ì \x9cë°\x9cë\x82´ë²\x84ë': 22,
 'heart emoji': 21,
 'depression ...': 20,
 '\x9cë°\x9cë\x82´ë²\x84ë ¤ë\x91\x90ì\x84¸ì\x9a\x94': 20,
 'cure depression': 19,
 'light skin': 19,
 'severe depression': 18,
 '...

In [25]:
# 2-gram words yang memiliki label non-depressive, sort by value
x = sc_tf_idf.tf_positive
{k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}

{'good morning': 83,
 "n't wait": 67,
 '... ...': 61,
 'last night': 51,
 'good night': 38,
 'happy birthday': 37,
 'getting ready': 35,
 "n't know": 33,
 'good luck': 33,
 "... n't": 33,
 'cant wait': 31,
 'looking forward': 28,
 'let know': 27,
 'great day': 25,
 'good day': 25,
 'quot quot': 24,
 '... good': 22,
 'get 100': 22,
 'sounds like': 21,
 "n't get": 21,
 'wait see': 20,
 'mothers day': 20,
 '100 followers': 20,
 'followers day': 20,
 'day using': 20,
 'add everyone': 20,
 'everyone train': 20,
 'train pay': 20,
 'pay vip': 20,
 'much fun': 19,
 '... love': 19,
 "hope 're": 18,
 'happy mothers': 18,
 'beautiful day': 18,
 'next time': 17,
 'first time': 17,
 'ice cream': 17,
 'much better': 17,
 'next week': 17,
 '... http': 16,
 'lol ...': 16,
 'going bed': 16,
 'make sure': 15,
 'one day': 15,
 'thank much': 15,
 "'ve got": 15,
 'got home': 15,
 'last day': 15,
 'going see': 15,
 'day today': 14,
 "n't worry": 14,
 'best friend': 14,
 '... lol': 14,
 'good time': 13,
 "co