In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from stop_words import get_stop_words
from nltk.tokenize import word_tokenize
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import matplotlib.pyplot as plt
from keras.layers import *
from keras.models import *
from keras.utils import to_categorical
from keras import backend as K
import lightgbm as lgb
from tensorflow.keras.preprocessing.text import one_hot
from keras.utils import pad_sequences
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

In [None]:
# Load Dataset 1 into the working frame `pt`; later cells read its
# "Sentiment", "Tweet", "Time" and "location" columns.
pt = pd.read_csv("./Dataset1.csv")

In [None]:
# Class balance, computed with the same rule diagnose() uses to build the labels:
# Sentiment <= 0 is the depressive class, every other value is non-depressive.
# (The previous loop counted Sentiment == 0 as non-depressive, which contradicted
# the "Diagnose" labels the models are actually trained on.)
neg = int((pt["Sentiment"] <= 0).sum())
pos = len(pt) - neg

print("Depressive : ",neg)
print("Non-Depressive : ",pos)
# Guard the ratio: a frame without non-depressive rows must not raise ZeroDivisionError.
print("( S-pos : S-neg ) : (1:", neg/pos if pos else float("inf"),")")

# Encoding of the binary target:
# 1 : depressed      (sentiment <= 0)
# 0 : not depressed  (any other value)

def diagnose(x):
    """Map a sentiment score to the binary target: 1 when x <= 0, otherwise 0."""
    return 1 if x <= 0 else 0

# Derive the training target from the raw sentiment score, one row at a time.
pt["Diagnose"] = pt["Sentiment"].apply(diagnose)

In [None]:
# Spelling corrector: replaces a word with its most probable correctly spelled form.
def words(text):
    """Lower-case `text` and return its runs of word characters, in order."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Word-frequency table built from the reference corpus; every corrector function reads it.
# The context manager closes the file handle, which the bare open(...).read() never did.
with open('big.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def P(word, N=sum(WORDS.values())):
    """Probability of `word`: its count in WORDS divided by `N`.

    `N` defaults to the total of all counts in WORDS, evaluated once when the
    function is defined.
    """
    occurrences = WORDS[word]
    return occurrences / N

def correction(word):
    """Return the candidate spelling of `word` with the highest probability P."""
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Generate possible spelling corrections for `word`.

    Preference order: the word itself if known, then known words one edit away,
    then known words two edits away, and finally the unchanged word in a list.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit = known(edits1(word))
    if one_edit:
        return one_edit

    two_edits = known(edits2(word))
    if two_edits:
        return two_edits

    return [word]

def known(words):
    """Return, as a set, the members of `words` that are keys of WORDS."""
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word):
    """Return the set of strings one simple edit away from `word`.

    An edit is a single-character deletion, a swap of two adjacent characters,
    a replacement by a lowercase ASCII letter, or an insertion of one.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    pieces = [(word[:cut], word[cut:]) for cut in range(len(word) + 1)]

    # Elements are added in the original order: deletes, transposes, replaces, inserts.
    neighbours = set()
    for head, tail in pieces:
        if tail:
            neighbours.add(head + tail[1:])
    for head, tail in pieces:
        if len(tail) > 1:
            neighbours.add(head + tail[1] + tail[0] + tail[2:])
    for head, tail in pieces:
        if tail:
            for letter in alphabet:
                neighbours.add(head + letter + tail[1:])
    for head, tail in pieces:
        for letter in alphabet:
            neighbours.add(head + letter + tail)
    return neighbours

def edits2(word):
    """Lazily yield every string two edits away from `word` (values may repeat)."""
    # Computed up front, as the outermost iterable of the generator expression was.
    first_hop = edits1(word)

    def _second_hop():
        for near in first_hop:
            for far in edits1(near):
                yield far

    return _second_hop()

In [None]:
# Tally of corrected tokens, filled in by tokenize_text; its size later sets voc_size.
count = dict()

# Shared NLTK helpers used by the tokenising and lemmatising functions.
w_tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()

#tokenizes the tweet and corrects the words to their most probable similar form
def tokenize_text(text):
    """Tokenise a tweet and spell-correct every token.

    Args:
        text: the tweet as a string.

    Returns:
        The corrected tokens as a list, in their original order.

    Side effect:
        Increments the module-level `count` tally for each corrected token.
    """
    tok_ar = []
    for element in w_tokenizer.tokenize(text):
        corrected_word = correction(element)
        tok_ar.append(corrected_word)
        # Test and update the same key. The old code checked the raw token but wrote
        # the corrected one, so a misspelling reset an existing tally back to 1.
        count[corrected_word] = count.get(corrected_word, 0) + 1
    return tok_ar

#reduces every token of a tokenized tweet to its lemma
def lemmatize_text(text):
    """Return a new list holding `lemmatizer.lemmatize(token)` for each token in `text`."""
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

#strips punctuation such as . , ? from every token
def remove_punctuation(words):
    """Delete every character that is neither a word character nor whitespace.

    Tokens that become empty after stripping are dropped from the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def preprocess_data(data):
    """Lemmatise and then de-punctuate every token list in `data`.

    Returns the cleaned values wrapped in a DataFrame.
    """
    cleaned = data.apply(lemmatize_text).apply(remove_punctuation)
    return pd.DataFrame(cleaned)

#removing numbers if present from tweets
# The pattern is a raw string: '\d' inside a plain literal is an invalid escape
# sequence that recent Python versions warn about.
pt['Tweet'] = pt['Tweet'].astype(str).apply(lambda x: re.sub(r'\d+', '', x))
# NOTE(review): lower_text is computed but never used, so tokenisation below runs on
# the original casing. If lowercasing was intended, apply it to Dataset 2 as well.
lower_text = pt['Tweet'].str.lower()

#calling tokenization on tweets
pt['Tweet'] = pt['Tweet'].apply(tokenize_text)

#stop-words like is, are, was are removed from the tweets
stop_words = get_stop_words('english')
pt['Tweet'] = pt['Tweet'].apply(lambda x: [item for item in x if item not in stop_words] )

# Lemmatise and strip punctuation, then store the cleaned token lists back on the frame.
pre_tweets = preprocess_data(pt['Tweet'])
pt['Tweet'] = pre_tweets


In [None]:
# Hour of the tweet expressed as the sine and cosine of its angle on a 24-hour circle.
tweet_hour = pd.to_datetime(pt['Time'],format='%H:%M:%S').dt.hour
pt['TimeSin'] = np.sin(2 * np.pi * tweet_hour / 24)
pt['TimeCos'] = np.cos(2 * np.pi * tweet_hour / 24)

# Integer-encode the location; `le` is kept so Dataset 2 can reuse the same mapping.
le = LabelEncoder()
pt["location"]=le.fit_transform(pt["location"])
print(pt)

In [None]:
# rejoin the tokens
def joinop(x):
    """Concatenate the tokens of `x` into one string, each followed by a single space.

    An empty token sequence gives the empty string. The per-call debug print was
    removed because it wrote one line per tweet to the notebook output.
    """
    return "".join(token + " " for token in x)
# Collapse every token list back into a single space-separated string per tweet.
pt['Tweet'] = pt['Tweet'].apply(joinop)
print(pt['Tweet'])

In [None]:
#prepare for att-bilstm

# NOTE(review): columns are selected by position; presumably 4 is the tweet text and
# 9 is the "Diagnose" target. Confirm against the CSV layout.
X = pt.iloc[:,4].copy()
target_label = pt.iloc[:,9]

# Hash every tweet into integer word indices below voc_size.
voc_size= len(count) + 1
onehot_repr=[one_hot(words,voc_size)for words in X]

# Longest encoded tweet (0 when there are none); all sequences are left-padded to it.
sent_length = max((len(encoded) for encoded in onehot_repr), default=0)
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)

X_l=np.array(embedded_docs)
y_l=np.array(target_label)

# train test split for A_BiLSTM
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_l, y_l, test_size=0.2, random_state=42)

In [None]:
# attention layer 

class attention(Layer):
    """Attention over axis 1 of the input.

    A score tanh(x.W + b) is computed per position, normalised with a softmax
    along axis 1, and used to scale `x`. The notebook applies the layer to the
    sequence output of a Bidirectional LSTM.

    Args:
        return_sequences: when True the scaled input is returned unchanged in
            shape; when False it is summed over axis 1.
        **kwargs: standard Layer arguments (e.g. `name`), forwarded to the base
            class so the layer can be rebuilt from its config.
    """
    def __init__(self, return_sequences=True, **kwargs):
        super(attention,self).__init__(**kwargs)
        self.return_sequences = return_sequences

    def build(self, input_shape):
        # One scoring weight per feature and one bias per position on axis 1.
        self.W=self.add_weight(name="att_weight", shape=(input_shape[-1],1), initializer="normal")
        self.b=self.add_weight(name="att_bias", shape=(input_shape[1],1),initializer="normal")
        super(attention,self).build(input_shape)


    def call(self, x):
        e = K.tanh(K.dot(x,self.W)+self.b)
        a = K.softmax(e, axis=1)
        output = x*a
        if self.return_sequences:
            return output
        return K.sum(output, axis=1)

    def get_config(self):
        """Return the constructor arguments so a saved model can recreate the layer."""
        config = super(attention,self).get_config()
        config.update({"return_sequences": self.return_sequences})
        return config

In [None]:
# model1 : attention bi-lstm

# Embedding -> bidirectional LSTM (full sequence) -> attention pooling -> sigmoid output.
model1 = Sequential([
    Embedding(voc_size, 128, input_length=sent_length),
    Bidirectional(LSTM(100, return_sequences=True)),
    attention(return_sequences=False),
    Dense(1, activation='sigmoid'),
])
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model1.summary()

In [None]:
# model1 : attention bi-lstm training
# NOTE(review): the hold-out split doubles as validation data here.
model1.fit(
    x=X_train_1,
    y=y_train_1,
    batch_size=64,
    epochs=6,
    validation_data=(X_test_1, y_test_1),
)

In [None]:
# model2 : naive Bayes

# NOTE(review): the vectorizer is fitted on all tweets before the split, so the
# hold-out rows contribute to the vocabulary and IDF statistics.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.5,
    min_df=0.001,
    stop_words='english',
    ngram_range=(1, 2),
)
tf_X = tfidf.fit_transform(X).toarray()
tf_y = np.array(target_label)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(tf_X, tf_y, test_size=0.2, random_state=42)

# model2 : naive Bayes training
model2 = MultinomialNB()
model2.fit(X_train_2, y_train_2)

In [None]:
#model3 : xgboost
# NOTE(review): columns are read by position; presumably 5 is the encoded location and
# 10 / 11 are TimeSin / TimeCos. Confirm against the frame layout.
location_feature_array = pt.iloc[:,5].values.reshape(-1, 1)
time_feature_array_sin = pt.iloc[:,10].values.reshape(-1, 1)
time_feature_array_cos = pt.iloc[:,11].values.reshape(-1, 1)
text_feature_dense = tf_X

# Feature matrix shared by models 3-6: tf-idf columns, then location, then time sin/cos.
all_features = np.hstack([
    text_feature_dense,
    location_feature_array,
    time_feature_array_sin,
    time_feature_array_cos,
])
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(all_features, tf_y, test_size=0.2, random_state=42)

xgb_params = dict(
    use_label_encoder=False,
    booster='gbtree',   # boosting algorithm to use, default gbtree, others: gblinear, dart
    n_estimators=100,   # number of trees, default = 100
    eta=0.3,            # this is the learning rate, default = 0.3
    max_depth=6,        # maximum depth of the tree, default = 6
    gamma=0,            # used for pruning: if gain < gamma the branch is pruned, default = 0
    reg_lambda=1,       # regularization parameter, default = 1
)
model3 = XGBClassifier(**xgb_params)

#training
model3.fit(X_train_3, y_train_3)

In [None]:
#model4 : lightgbm

# Default-parameter LightGBM classifier, trained on the shared feature matrix.
model4 = lgb.LGBMClassifier()
model4.fit(X=X_train_3, y=y_train_3)


In [None]:
#model5 : random forest

# Seeded so the forest, and the ensemble weights derived from its hold-out errors,
# are the same on every run; the ensemble cell hard-codes a combination that uses pred5.
model5 = RandomForestClassifier(random_state=42)
model5.fit(X_train_3, y_train_3)

In [None]:
#model6 : linear SVC classifier

# High iteration cap and a fixed seed, as configured originally.
model6 = LinearSVC(random_state=123, max_iter=100000)
model6.fit(X=X_train_3, y=y_train_3)

In [None]:
# prediction model 1 : attention bi-lstm (model outputs cut at `threshold`)
threshold = 0.5
temp_pred1 = model1.predict(X_test_1)
pred1 = (temp_pred1 > threshold).astype(int)

# prediction model 2 : naive Bayes (tf-idf features only)
pred2 = model2.predict(X_test_2)

# prediction models 3-6 : xgboost, lightgbm, random forest, linear SVC (shared features)
pred3, pred4, pred5, pred6 = (
    classifier.predict(X_test_3) for classifier in (model3, model4, model5, model6)
)

# Hold-out accuracy of every individual model.
for caption, truth, guess in (
    ("Accuracy : lstm : ", y_test_1, pred1),
    ("Accuracy : naive : ", y_test_2, pred2),
    ("Accuracy : xgboost : ", y_test_3, pred3),
    ("Accuracy : lightgbm : ", y_test_3, pred4),
    ("Accuracy : random forest : ", y_test_3, pred5),
    ("Accuracy : linear SVC : ", y_test_3, pred6),
):
    print(caption, metrics.accuracy_score(truth, guess))

In [None]:
import itertools

# Hold-out labels used to score every model. The three train_test_split calls share
# test_size and random_state, so y_test_1 is assumed to line up row-for-row with the
# predictions of all six models -- TODO confirm if any split changes.
y_actual = y_test_1
elements = [pred1, pred2, pred3, pred4, pred5,pred6]

def func(a,index):
    """Signed, weighted vote of one model: -W[index] for class 0, +W[index] otherwise.

    Reads the module-level list `W`, i.e. the weights of the combination that is
    currently being scored.
    """
    if(a == 0):
        return -1*W[index]
    else:
        return 1*W[index]

def _inverse_error_weights(error_counts):
    """Weights proportional to 1 / misclassification count, normalised to sum to 1.

    A model with zero errors would make 1/Dk divide by zero; in that case the
    error-free models share the whole weight equally and the others get 0.
    """
    flawless = [errors == 0 for errors in error_counts]
    if any(flawless):
        share = 1 / sum(flawless)
        return [share if is_flawless else 0.0 for is_flawless in flawless]

    denom = 0
    for errors in error_counts:
        denom = denom + 1/errors
    return [(1/errors)/denom for errors in error_counts]

names = [f"pred{i}" for i in range(1, len(elements) + 1)]
res = {}

w_final = []

# NOTE(review): weights are derived from, and accuracy is measured on, the same
# hold-out rows, so the ensemble accuracies below are optimistic.
for i in range(1, len(elements) + 1):
    # Generate all combinations of length i
    for indices in itertools.combinations(range(len(elements)), i):

        combination_names = tuple(names[index] for index in indices)
        combination = [elements[index] for index in indices]

        # Flatten so (n, 1) network outputs and (n,) sklearn outputs are handled alike.
        y_preds = [np.asarray(model_pred).ravel() for model_pred in combination]

        # D[k]: number of hold-out rows model k gets wrong.
        D = [
            sum(1 for obd, act in zip(model_pred, y_actual) if obd != act)
            for model_pred in y_preds
        ]
        W = _inverse_error_weights(D)

        # NOTE(review): this combination is hard-coded as the one kept for Dataset 2;
        # it is not re-derived from `res`.
        if combination_names == ('pred1', 'pred5', 'pred6'):
            w_final = W #model weights for max accuracy

        # Weighted vote per row: a non-negative total means class 1.
        y_pred = []
        for row in zip(*y_preds):
            score = 0
            for index,value in enumerate(row):
                score = score + func(value,index)
            y_pred.append(1 if score >= 0 else 0)

        res[combination_names] = metrics.accuracy_score(y_actual, y_pred)

# Combinations ordered from highest to lowest ensemble accuracy.
sortens = sorted(res.items(), key=lambda x:x[1], reverse=True)
converted_dict = dict(sortens)


In [None]:
#every model combination with its ensemble accuracy, best first

for combo, accuracy in converted_dict.items():
    print((combo, accuracy),"\n")

In [None]:
# Read Dataset 2, the unlabelled tweets to be scored, into the frame `gt`.

gt = pd.read_csv('./Dataset2.csv')

In [None]:
# Display the ensemble weights stored for the (pred1, pred5, pred6) combination;
# w_final is assigned inside the combination loop.
w_final

In [None]:
#preprocessing the new dataset 2

# Tally of corrected tokens seen in Dataset 2 (filled in by tokenize_text).
ct = dict()

# Fresh tokenizer and lemmatizer instances for this pass.
w_tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()

#tokenizes the tweet and corrects the words to their most probable similar form
def tokenize_text(text):
    """Tokenise a Dataset 2 tweet and spell-correct every token.

    Args:
        text: the tweet as a string.

    Returns:
        The corrected tokens as a list, in their original order.

    Side effect:
        Increments the module-level `ct` tally for each corrected token.
    """
    tok_ar = []
    for element in w_tokenizer.tokenize(text):
        corrected_word = correction(element)
        tok_ar.append(corrected_word)
        # Tally in `ct` only. The old code tested membership in, and read the value
        # from, the training tally `count` while writing to `ct`.
        ct[corrected_word] = ct.get(corrected_word, 0) + 1
    return tok_ar

#reduces every token of a tokenized tweet to its lemma
def lemmatize_text(text):
    """Return a list with `lemmatizer.lemmatize` applied to each token of `text`."""
    return list(map(lambda token: lemmatizer.lemmatize(token), text))

#strips punctuation such as . , ? from every token
def remove_punctuation(words):
    """Remove characters that are neither word characters nor whitespace.

    Tokens left empty by the removal are omitted from the returned list.
    """
    kept = []
    for raw in words:
        cleaned = re.sub(r'[^\w\s]', '', raw)
        if cleaned == '':
            continue
        kept.append(cleaned)
    return kept

def preprocess_data(data):
    """Apply lemmatize_text and then remove_punctuation to every entry of `data`.

    The cleaned values are returned wrapped in a DataFrame.
    """
    lemmatised = data.apply(lemmatize_text)
    return pd.DataFrame(lemmatised.apply(remove_punctuation))

#removing numbers if present from tweets
# The pattern is a raw string: '\d' inside a plain literal is an invalid escape
# sequence that recent Python versions warn about.
gt['Tweet'] = gt['Tweet'].astype(str).apply(lambda x: re.sub(r'\d+', '', x))
# NOTE(review): lower_text is computed but never used; tokenisation below runs on the
# original casing, the same as the Dataset 1 pipeline.
lower_text = gt['Tweet'].str.lower()

#calling tokenization on tweets
gt['Tweet'] = gt['Tweet'].apply(tokenize_text)

#stop-words like is, are, was are removed from the tweets
stop_words = get_stop_words('english')
gt['Tweet'] = gt['Tweet'].apply(lambda x: [item for item in x if item not in stop_words] )

pre_tweets = preprocess_data(gt['Tweet'])
gt['Tweet'] = pre_tweets

# Hour of the tweet as sine / cosine of its angle on a 24-hour circle (parsed once).
gt_hour = pd.to_datetime(gt['Time'],format='%H:%M:%S').dt.hour
gt['TimeSin'] = np.sin(2 * np.pi * gt_hour / 24)
gt['TimeCos'] = np.cos(2 * np.pi * gt_hour / 24)

# Reuse the encoder fitted on Dataset 1.
# NOTE(review): transform raises ValueError for a location absent from Dataset 1.
gt["location"]=le.transform(gt["location"])
def joinop(x):
    """Concatenate the tokens of `x` into one string, each followed by a single space.

    An empty token sequence gives the empty string. The per-call debug print was
    removed because it wrote one line per tweet to the notebook output.
    """
    return "".join(token + " " for token in x)
# Collapse every token list back into a single space-separated string per tweet.
gt['Tweet'] = gt['Tweet'].apply(joinop)


In [None]:
# NOTE(review): column picked by position; presumably 4 is the tweet text, as for
# Dataset 1. Confirm against the CSV layout.
rx = gt.iloc[:,4].copy()

# Encode with the training `voc_size`. one_hot hashes each word into that range, so
# recomputing the size from Dataset 2 would yield indices that do not match (and may
# exceed) the embedding model1 was trained with.
oh_rep=[one_hot(words,voc_size)for words in rx]

# Pad / truncate to the training `sent_length`, the input length model1 was built
# with, instead of the hard-coded 361.
embedded_docs=pad_sequences(oh_rep,padding='pre',maxlen=sent_length)

res_1 = np.array(embedded_docs)


In [None]:
# p1, p2 and p3 were never defined in this notebook, so this cell failed on a fresh
# kernel. w_final holds the weights of (pred1, pred5, pred6), so Dataset 2 is scored
# with the matching models: model1, model5 and model6.
if len(w_final) != 3:
    raise ValueError("w_final must hold the three weights of the pred1/pred5/pred6 ensemble")

# Same feature layout as training: tf-idf columns, then location, TimeSin, TimeCos.
gt_all_features = np.concatenate(
    (
        tfidf.transform(rx).toarray(),
        gt["location"].values.reshape(-1, 1),
        gt["TimeSin"].values.reshape(-1, 1),
        gt["TimeCos"].values.reshape(-1, 1),
    ),
    axis=1,
)

p1 = (model1.predict(res_1) > threshold).astype(int).ravel()
p2 = model5.predict(gt_all_features)
p3 = model6.predict(gt_all_features)

# Weighted vote: each model adds +weight for class 1 and -weight for class 0;
# a non-negative total means class 1.
Conc_res = []
for votes in zip(p1, p2, p3):
    score = 0
    for weight, vote in zip(w_final, votes):
        if vote == 0:
            score = score + -1*weight
        else:
            score = score + 1*weight

    if score >= 0:
        Conc_res.append(1)
    else:
        Conc_res.append(0)

In [None]:
# Reload the unprocessed Dataset 2 and append the ensemble's prediction as a new column.
finalpush = pd.read_csv('./Dataset2.csv').assign(**{"Modelled Dep. Status": Conc_res})
print(finalpush)

In [None]:
# Write the Dataset 2 rows plus the predicted status to FinalResult.csv, without the index.
finalpush.to_csv('FinalResult.csv', sep=',', index=False)