In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import data_helpers 
import re 

from bs4 import BeautifulSoup
from data_helpers import BPE

# Data preparation: patterns for the noise stripped out of raw tweets.
pat1 = r'@[A-Za-z0-9]+'            # @mention: '@' followed by ASCII letters/digits
pat2 = r'https?://[A-Za-z0-9./]+'  # URL: http(s) scheme, then letters, digits, '.' or '/'
# One alternation that matches either an @mention or a URL.
combined_pat = f'{pat1}|{pat2}'

def tweet_cleaner(text):
    """Normalise one raw tweet into plain lower-case text.

    Steps, in order:
      1. Parse ``text`` with BeautifulSoup (``'lxml'`` parser) and keep only
         the result of ``get_text()`` (HTML decoding).
      2. Delete every @mention and URL matched by the module-level
         ``combined_pat``.
      3. Replace each occurrence of the literal sequence ``'ï¿½'`` with a space.
      4. Lower-case the result.

    Args:
        text: Raw tweet text.

    Returns:
        str: The cleaned, lower-cased tweet.
    """
    souped = BeautifulSoup(text, 'lxml').get_text()  # HTML decoding
    stripped = re.sub(combined_pat, '', souped)      # URL, @mention

    # No try/except here: re.sub() always hands back a str, so str.replace()
    # cannot fail, and a bare `except:` around it could only ever swallow
    # unrelated errors (e.g. KeyboardInterrupt).
    # NOTE(review): 'ï¿½' looks like the UTF-8 bytes of U+FFFD read as Latin-1
    # -- confirm against the training-data preprocessing before changing it.
    clean = stripped.replace(u'ï¿½', ' ')  # Latin character encoding

    return clean.lower()

def filter_tweet(row):
    """Clean the ``'tweet'`` field of a single row.

    Thin adapter that lets ``tweet_cleaner`` be used as the callback of a
    row-wise ``DataFrame.apply(..., axis=1)``.

    Args:
        row: Object indexable by the key ``'tweet'``.

    Returns:
        Whatever ``tweet_cleaner`` returns for ``row['tweet']``.
    """
    return tweet_cleaner(row['tweet'])

def subword2index(texts, vocab):
    """Map whitespace-separated sub-word strings to lists of vocabulary indices.

    Args:
        texts: Iterable of strings; each one is split on whitespace into tokens.
        vocab: Dict ``{token: index}``. The key ``'unk'`` is only looked up
            when a token is missing from ``vocab``.

    Returns:
        list: One list of indices per input string, in input order. Tokens
        that are not in ``vocab`` are mapped to ``vocab['unk']``.
    """
    return [
        [vocab[token] if token in vocab else vocab['unk'] for token in line.split()]
        for line in texts
    ]

def subword_rep(tweets):
    """Convert cleaned tweets into sequences of BPE sub-word indices.

    Pipeline:
      1. Replace every digit with ``'0'``.
      2. Replace every URL with the literal token ``'<url>'``.
      3. Encode each text with the ``BPE`` model built from
         ``en.wiki.bpe.op25000.vocab``.
      4. Map every sub-word to ``(position in bpe.words) + 1`` through
         ``subword2index``.

    Args:
        tweets: Iterable of str.

    Returns:
        list[list[int]]: One list of vocabulary indices per tweet.
    """
    # Raw string: in a normal literal '\d' is an invalid escape sequence,
    # which newer Python versions warn about.
    texts = [re.sub(r'\d', '0', s) for s in tweets]

    # replace all URLs with <url>
    url_reg = r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b'
    texts = [re.sub(url_reg, '<url>', s) for s in texts]

    # Convert string to subword, this process may take several minutes.
    # NOTE(review): the BPE object is rebuilt on every call; consider loading
    # it once if this function is called repeatedly.
    bpe = BPE("en.wiki.bpe.op25000.vocab")
    # Presumably encode() returns a whitespace-separated string, since
    # subword2index() calls .split() on each item -- confirm in data_helpers.
    texts = [bpe.encode(s) for s in texts]

    # Build vocab, {token: index}. Indices start at 1, so 0 is never assigned
    # to a token here (presumably left free for padding -- confirm).
    vocab = {token: i + 1 for i, token in enumerate(bpe.words)}

    # Convert texts
    return subword2index(texts, vocab)

In [3]:
import twint
import nest_asyncio
import pandas as pd
import numpy as np
import pickle
from keras.preprocessing.sequence import pad_sequences

from keras.models import load_model
# Load the saved Keras models from HDF5 files in the current working
# directory. This happens once, when the cell is executed, and the resulting
# objects are module-level globals read by the classify_* functions below.
cnn_model = load_model('cnn-sub.h5')
gru_model = load_model('gru-sub.h5')
# NOTE(review): lstm_cnn_model is loaded but not referenced anywhere in the
# visible part of this notebook.
lstm_cnn_model = load_model('lstm-cnn-sub.h5')

# Scraping / preprocessing settings shared by both classifiers.
_TWEET_LIMIT = 500   # value assigned to twint's Config.Limit
_MIN_TWEETS = 10     # fewer tweets than this -> "Insufficient Data"
_MAX_SEQ_LEN = 252   # maxlen handed to pad_sequences

# (inclusive upper bound on the positive fraction, message), checked in order.
_RISK_BANDS = (
    (0.02, "At low to no risk of depression"),
    (0.1, "At mild risk of depression"),
    (0.35, "At moderate risk of depression"),
)
_HIGHEST_RISK = "At significant risk of depression"


def _classify_user(username, model):
    """Scrape a user's tweets, score them with ``model`` and print a report.

    Shared implementation behind ``classify_cnn`` and ``classify_gru``; the
    two differ only in the model they pass in.

    Args:
        username: Twitter handle assigned to ``twint.Config.Username``.
        model: Object exposing ``predict(data)``; its output is rounded and
            arg-maxed along axis 1 to obtain one label per tweet.

    Returns:
        None. Results are reported with ``print``.
    """
    nest_asyncio.apply()

    c = twint.Config()
    c.Username = username
    c.Hide_output = True
    c.Limit = _TWEET_LIMIT
    c.Pandas = True
    twint.run.Search(c)

    df = twint.storage.panda.Tweets_df

    # The `is None` guard covers the case where twint left Tweets_df unset;
    # without it len() would raise a TypeError instead of reporting cleanly.
    if df is None or len(df) < _MIN_TWEETS:
        print("Insufficient Data")
        return

    # Clean the tweets without assigning back into a slice of twint's frame
    # (the pattern pandas flags with SettingWithCopyWarning).
    tweets = df[['tweet']].apply(filter_tweet, axis=1).values
    sentences = subword_rep(tweets)
    data = pad_sequences(sentences, maxlen=_MAX_SEQ_LEN, padding='post')

    labels_pred = model.predict(data)
    # NOTE(review): rounding before argmax is kept exactly as it was; whether
    # it is redundant depends on the models' output activation -- confirm.
    labels_pred = np.round(labels_pred)
    labels_pred = np.argmax(labels_pred, axis=1)

    total = len(df)
    positive = labels_pred.sum()  # sum of arg-max labels; assumes labels are 0/1
    ratio = positive / total

    print(f"Number of tweets obtained : {total}")
    print(f"Fraction of tweets that are indicative of depression : {ratio}")

    for upper_bound, message in _RISK_BANDS:
        if ratio <= upper_bound:
            print(message)
            return
    print(_HIGHEST_RISK)


def classify_cnn(username):
    """Print a depression-risk report for ``username`` using ``cnn_model``.

    Args:
        username: Twitter handle to scrape.

    Returns:
        None. Prints "Insufficient Data" when fewer than 10 tweets are
        available; otherwise prints the tweet count, the fraction of tweets
        labelled positive, and a risk band.
    """
    _classify_user(username, cnn_model)


def classify_gru(username):
    """Print a depression-risk report for ``username`` using ``gru_model``.

    Args:
        username: Twitter handle to scrape.

    Returns:
        None. Prints "Insufficient Data" when fewer than 10 tweets are
        available; otherwise prints the tweet count, the fraction of tweets
        labelled positive, and a risk band.
    """
    _classify_user(username, gru_model)

#### Celebrities with Depression

In [4]:
# Score the same account with both models; each call prints its own report.
username = "TheRock"
classify_cnn(username)
print()  # blank line between the CNN report and the GRU report
classify_gru(username)

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.154
At moderate risk of depression

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.152
At moderate risk of depression


In [5]:
# Score the same account with both models; each call prints its own report.
username = "katyperry"
classify_cnn(username)
print()  # blank line between the CNN report and the GRU report
classify_gru(username)

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.128
At moderate risk of depression

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.178
At moderate risk of depression


In [6]:
# Score the same account with both models; each call prints its own report.
username = "ladygaga"
classify_cnn(username)
print()  # blank line between the CNN report and the GRU report
classify_gru(username)

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.078
At mild risk of depression

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.12
At moderate risk of depression


In [7]:
# Score the same account with both models; each call prints its own report.
username = "jk_rowling"
classify_cnn(username)
print()  # blank line between the CNN report and the GRU report
classify_gru(username)

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.096
At mild risk of depression

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.106
At moderate risk of depression


In [8]:
# Score the same account with both models; each call prints its own report.
username = "deepikapadukone"
classify_cnn(username)
print()  # blank line between the CNN report and the GRU report
classify_gru(username)

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.106
At moderate risk of depression

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.03
At mild risk of depression


#### Random Twitter Pages

In [9]:
# Score the same account with both models; each call prints its own report.
username = "ndtv"
classify_cnn(username)
print()  # blank line between the CNN report and the GRU report
classify_gru(username)

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.012
At low to no risk of depression

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.002
At low to no risk of depression


In [11]:
# Score the same account with both models; each call prints its own report.
username = "bonappetit"
classify_cnn(username)
print()  # blank line between the CNN report and the GRU report
classify_gru(username)

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.006
At low to no risk of depression

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.002
At low to no risk of depression


In [12]:
# Score the same account with both models; each call prints its own report.
username = "IPL"
classify_cnn(username)
print()  # blank line between the CNN report and the GRU report
classify_gru(username)

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.006
At low to no risk of depression

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.022
At mild risk of depression


In [13]:
# Score the same account with both models; each call prints its own report.
username = "AnimalPlanet"
classify_cnn(username)
print()  # blank line between the CNN report and the GRU report
classify_gru(username)

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.006
At low to no risk of depression

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.044
At mild risk of depression


In [14]:
# Score the same account with both models; each call prints its own report.
username = "Funfacts"
classify_cnn(username)
print()  # blank line between the CNN report and the GRU report
classify_gru(username)

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.016
At low to no risk of depression

Number of tweets obtained : 500
Fraction of tweets that are indicative of depression : 0.02
At low to no risk of depression
