In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from stop_words import get_stop_words
from nltk.tokenize import word_tokenize
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import matplotlib.pyplot as plt

In [2]:
#read the dataset
dt = pd.read_excel("./LABELDATA.xlsx")

In [3]:
valid_locations = ['south africa', 'nigeria', 'canada', 'philippines', 'united states', 'ireland', 'japan',
                   'pakistan', 'australia', 'india', 'kenya', 'united kingdom', 'mali', 'puerto rico']

# Filter rows based on the valid locations
dt = dt[dt['location'].isin(valid_locations)]

# Reset the index of the DataFrame
dt.reset_index(drop=True, inplace=True)

In [4]:
pos = 0
neg = 0
for v in dt["Sentiment"]:
    if(v >= 0):
        pos = pos + 1
    else:
        neg = neg + 1
        
print("Depressive : ",neg)
print("Non-Depressive : ",pos)
print("( S-pos : S-neg ) : (1:", neg/pos,")")

def diagnose(x):
    if(x <= 0):
        return 1
    else:
        return 0

dt["Diagnose"] = dt["Sentiment"].apply(lambda x: diagnose(x))

Depressive :  17503
Non-Depressive :  25155
( S-pos : S-neg ) : (1: 0.695806002782747 )


In [5]:
pt = dt.iloc[:100,:].copy()
print(pt["Tweet"])

0     I can't even express how angry I was when I fo...
1     Sadness and Anger for the ones who didn't vote...
2     Vale, Geoffrey Love. We would like to share ou...
3     Hahaha story if my life and sadness of my sex ...
4     I've never heard of your store before but now ...
                            ...                        
95    nah.just pitch black for eternity with a side ...
96    Really wanna explore the feeling of loneliness...
97    Hello . heres a podcast I thought you would li...
98    Aged or , anyone can volunteer. There are mill...
99    Maybe you can reach out to someone you love + ...
Name: Tweet, Length: 100, dtype: object


In [6]:
#this implements the mechanism to recorrect words to their correct most probable value


def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('big.txt').read()))

def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

In [7]:
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = TweetTokenizer()

#tokenizes the tweet and corrects the words to their most probable similar form
def tokenize_text(text):
    tokens = w_tokenizer.tokenize((text))
    tok_ar = []
    for element in tokens:
        tok_ar.append(correction(element))
    return tok_ar

#lemmatizes the tokenized entries from a tweet ti their original form
def lemmatize_text(text):
    return [(lemmatizer.lemmatize(w)) for w in text]

#removing punctutations like ., , , ? etc.
def remove_punctuation(words):
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', (word))
        if new_word != '':
            new_words.append(new_word)
    return new_words

def preprocess_data(data):
    words = data.apply(lemmatize_text)
    words = words.apply(remove_punctuation)
    return pd.DataFrame(words)

#removing numbers if present from tweets
pt['Tweet'] = pt['Tweet'].astype(str).apply(lambda x: re.sub('\d+', '', x))
lower_text = pt['Tweet'].str.lower()

#calling tokenization on tweets
pt['Tweet'] = pt['Tweet'].apply(tokenize_text)

#stop-worlds like is are was, removed from the tweets
stop_words = get_stop_words('english')
pt['Tweet'] = pt['Tweet'].apply(lambda x: [item for item in x if item not in stop_words] )

pre_tweets = preprocess_data(pt['Tweet'])
pt['Tweet'] = pre_tweets


In [8]:
pt['TimeSin'] = np.sin(2 * np.pi * pd.to_datetime(pt['Time'],format='%H:%M:%S').dt.hour / 24)
pt['TimeCos'] = np.cos(2 * np.pi * pd.to_datetime(pt['Time'],format='%H:%M:%S').dt.hour / 24)

le = LabelEncoder()
pt["location"]=le.fit_transform(pt["location"])
print(pt)

               User      Time Language             Tweet ID  \
0   MotherO12536172  10:10:59       en  1661310000000000000   
1      Soulseekerk9  08:07:53       en  1661280000000000000   
2   HousingFirstLtd  07:51:18       en  1661280000000000000   
3         engin_no9  07:09:59       en  1661270000000000000   
4         SmartIsnt  07:03:27       en  1661270000000000000   
..              ...       ...      ...                  ...   
95     TamatoaPride  04:11:33       en  1660140000000000000   
96          Carleey  23:46:20       en  1660070000000000000   
97     VitoCarrozzo  21:41:19       en  1660040000000000000   
98      HelloCareAU  21:30:00       en  1660040000000000000   
99           TWLOHA  18:16:00       en  1659990000000000000   

                                                Tweet  location  \
0   [canst, even, express, angry, found, everythin...         0   
1                   [sadness, anger, one, didn, vote]         0   
2   [pale, geoffrey, love, like, share, co

In [9]:
#vectorization of the tokenized tweets entries
def joinop(x):
    s = ""
    for i in x:
        s = s + i + " ";
    print(s)
    return s
pt['Tweet'] = pt['Tweet'].apply(lambda x:  joinop(x))
print(pt['Tweet'])

canst even express angry found everything taught australian history lie found sadness felt felt denied land family also history just grew immensely 
sadness anger one didn vote 
pale geoffrey love like share condolence remember geoffrey fondness geoffrey active liked member HousingFirst community sadness share farewell recent passing 
ahahah story life sadness sex life 
never heard store now course APSAP stockSadness 
ah seeing result sadness comment wish change answer 
daughter loved photo need puppy deal sadness see work public hospital psychiatric ward clinic 
feel two different way one time possible anyone depression fattest many u experience blissful moment happiness never losing awareness deep blue sadness lurking 
toe sadness 
livid little dermoid nina turner Netflix k TikTok couch unbearable sadness succession ending licorice allsorts pop culture chat show pm along fellow culture grab join u 
went uncontrollable sobbing total mental paralysis passing nine year ago now minute sa

In [10]:
X = pt.iloc[:,4].copy()
R = X
target_label = pt.iloc[:,9]

#ngram_range = (1,2)
tfidf = TfidfVectorizer(sublinear_tf = True, max_df = 0.5,min_df = 0.001, stop_words='english',ngram_range = (1,2))
X = tfidf.fit_transform(X)

location_feature_array = pt.iloc[:,5].values.reshape(-1, 1)
time_feature_array_sin = pt.iloc[:,10].values.reshape(-1, 1)
time_feature_array_cos = pt.iloc[:,11].values.reshape(-1, 1)
text_feature_dense = X.toarray()

all_features = np.concatenate((text_feature_dense, location_feature_array, time_feature_array_sin, time_feature_array_cos), axis=1)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(all_features, target_label, test_size=0.5, random_state=42)

In [42]:
print(all_features.shape)
print(text_feature_dense.shape)

(100, 2025)
(100, 2022)


In [35]:
from sklearn.naive_bayes import MultinomialNB

X_train = X_train[:,:-3]
X_test = X_test[:,:-3]
model = MultinomialNB()
model.fit(X_train,y_train)
y_pred1 = model.predict(X_test)
tt = metrics.accuracy_score(y_test,y_pred1)
print(X_test.shape)

(50, 2013)


In [36]:
ans = 0
for i,j in zip(y_pred,y_pred1):
    if i == j :
        ans= ans + 1
print(ans)

32


In [31]:
xgb_m = XGBClassifier(use_label_encoder=False, 
                      booster='gbtree', # boosting algorithm to use, default gbtree, othera: gblinear, dart
                      n_estimators=100, # number of trees, default = 100
                      eta=0.3, # this is learning rate, default = 0.3
                      max_depth=6, # maximum depth of the tree, default = 6
                      gamma = 0, # used for pruning, if gain < gamma the branch will be pruned, default = 0
                      reg_lambda = 1, # regularization parameter, defautl = 1
                      #min_child_weight=0 # this refers to Cover which is also responsible for pruning if not set to 0
                     )
xgb_m.fit(X_train, y_train)

y_pred = xgb_m.predict(X_test)

Accuracy = metrics.accuracy_score(y_test, y_pred)
Precision = metrics.precision_score(y_test, y_pred)
Sensitivity_recall = metrics.recall_score(y_test, y_pred)
Specificity = metrics.recall_score(y_test, y_pred, pos_label=0)
F1_score = metrics.f1_score(y_test, y_pred)
print("\n")
print("Accuracy : ",Accuracy)
print("Precision : ",Precision)
print("Sensitivity_recall : ",Sensitivity_recall)
print("Specificity : ",Specificity)
print("F1_score : ",F1_score)


# # define metrics
# y_pred_proba = model.predict_proba(test)[::,1]
# fpr, tpr, _ = metrics.roc_curve(ytest,  y_pred_proba)
# auc = metrics.roc_auc_score(ytest, y_pred_proba)

# #create ROC curve
# plt.plot(fpr,tpr,label="AUC="+str(auc))
# plt.ylabel('True Positive Rate')
# plt.xlabel('False Positive Rate')
# plt.legend(loc=4)
# plt.show()





Accuracy :  0.58
Precision :  0.65625
Sensitivity_recall :  0.6774193548387096
Specificity :  0.42105263157894735
F1_score :  0.6666666666666667


In [13]:
import lightgbm as lgb
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)
# LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
#                importance_type='split', learning_rate=0.1, max_depth=-1,
#                min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
#                n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
#                random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
#                subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

y_pred0 =clf.predict(X_test)
Accuracy = metrics.accuracy_score(y_test, y_pred0)
Precision = metrics.precision_score(y_test, y_pred0)
Sensitivity_recall = metrics.recall_score(y_test, y_pred0)
Specificity = metrics.recall_score(y_test, y_pred0, pos_label=0)
F1_score = metrics.f1_score(y_test, y_pred0)
print("\n")
print("Accuracy : ",Accuracy)
print("Precision : ",Precision)
print("Sensitivity_recall : ",Sensitivity_recall)
print("Specificity : ",Specificity)
print("F1_score : ",F1_score)



Accuracy :  0.64
Precision :  0.6585365853658537
Sensitivity_recall :  0.8709677419354839
Specificity :  0.2631578947368421
F1_score :  0.75


In [None]:
from sklearn.base import is_classifier

print(is_classifier(xgb_m))
print(is_classifier(clf))

In [14]:

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=10) # Define classifier
rf.fit(X_train, y_train) # Train model


from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=5) # Define classifier
dt.fit(X_train, y_train) # Train model


from sklearn.svm import SVC

svm_rbf = SVC(gamma=2, C=1)
svm_rbf.fit(X_train, y_train)



In [41]:
# from sklearn.ensemble import StackingClassifier
# from sklearn.linear_model import LogisticRegression

# estimator_list = [
#     ('xgb_m',xgb_m),
#     ('svm_rbf',svm_rbf),
#     ('dt',dt),
#     ('rf',rf),
#     ('lightgbm',clf) 
    
from mlxtend.classifier import EnsembleVoteClassifier

eclf = EnsembleVoteClassifier(clfs=[xgb_m, svm_rbf, dt,rf,clf], weights=[1,1,1,1,1], fit_base_estimators=False)
eclf.fit(X_train,y_train)
y_pred = eclf.predict(X_test)
print(y_pred)
# Build stack model
# stack_model = StackingClassifier(
#     estimators=estimator_list, final_estimator=LogisticRegression()
# )

# # Train stacked model
# stack_model.fit(X_train, y_train)

# # Make predictions
# y_train_pred = stack_model.predict(X_train)
# y_test_pred = stack_model.predict(X_test)

# # Training set model performance
# stack_model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy


# # Test set model performance
# stack_model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy


# print('Model performance for Training set')
# print('- Accuracy: %s' % stack_model_train_accuracy)

# print('----------------------------------')
# print('Model performance for Test set')
# print('- Accuracy: %s' % stack_model_test_accuracy)




ValueError: Feature shape mismatch, expected: 2016, got 2013

In [26]:
from sklearn.ensemble import VotingClassifier
from sklearn import metrics

# Define the VotingClassifier
voting_model = VotingClassifier(
    estimators=[('xgb_m',xgb_m),('svm_rbf',svm_rbf),('dt',dt),('rf',rf),('lightgbm',clf) ],voting='soft')

# Train the VotingClassifier
voting_model.fit(X_train, y_train)

# Make predictions
y_pred = voting_model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
f1_score = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1_score)

# from sklearn.ensemble import StackingClassifier
# from sklearn.linear_model import LogisticRegression

# estimator_list = [
#     ('clf',clf),
#     ('model',model)]

# # Build stack model
# stack_model = StackingClassifier(
#     estimators=estimator_list, final_estimator=LogisticRegression()
# )

# # Train stacked model
# stack_model.fit(X_train, y_train)

# # Make predictions
# y_train_pred = stack_model.predict(X_train)
# y_test_pred = stack_model.predict(X_test)

# # Training set model performance
# stack_model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy



AttributeError: predict_proba is not available when  probability=False

In [None]:
from sklearn.metrics import confusion_matrix

confusion_mat = confusion_matrix(y_test, y_pred0)
print(confusion_mat)
print(metrics.accuracy_score(y_train, xgb_m.predict(X_train)))

In [None]:
# y_pred_reshaped = np.reshape(y_pred, (-1, 1))
# X_test = np.concatenate((X_test,y_pred_reshaped), axis=1)


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Embedding, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from keras.layers import Input, Dense, LSTM, Conv1D, Dropout, Bidirectional, Multiply

def attention(inputs):
    """
    Custom attention layer to compute attention weights and apply them to the LSTM outputs.
    """
    attention_units = 64  # Number of units in the attention layer
    
    hidden_states = inputs[0]
    hidden_size = int(hidden_states.shape[2])
    
    # Compute attention scores
    attention_scores = Dense(attention_units, activation='tanh')(hidden_states)
    attention_scores = Dense(1, activation='softmax')(attention_scores)
    
    # Apply attention scores to LSTM outputs
    weighted_hidden_states = tf.multiply(hidden_states, attention_scores)
    
    # Sum the weighted LSTM outputs
    context_vector = tf.reduce_sum(weighted_hidden_states, axis=1)
    
    return context_vector

def create_attention_lstm_model(input_shape, num_classes):
    """
    Create an Attention-LSTM model for depression detection.
    """
#     inputs = Input(shape=(20, input_shape[1]))
#     lstm_units = 128
#     x = Conv1D(filters=64, kernel_size=1, activation='relu')(inputs)  # padding = 'same'
#     x = Dropout(0.3)(x)

#     # lstm_out = Bidirectional(LSTM(lstm_units, activation='relu'), name='bilstm')(x)
#     lstm_out = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
#     lstm_out = Dropout(0.3)(lstm_out)
#     #attention_mul = Flatten()(lstm_out)

#     output = Dense(1, activation='sigmoid')(lstm_out)
#     model = Model(inputs=[inputs], outputs=output)
#     return model
    lstm_units = 128  # Number of units in the LSTM layer
    dense_units = 64  # Number of units in the dense layer
    dropout_rate = 0.2  # Dropout rate
    
    # Input layer
    inputs = Input(shape=(20, input_shape[1]))
    # LSTM layer
    lstm_outputs = Bidirectional(LSTM(lstm_units, activation='relu'), name='bilstm')(inputs)
    
    # Apply attention mechanism
    context_vector = attention(lstm_outputs)
    
    # Dropout layer
    dropout = Dropout(dropout_rate)(context_vector)
    
    # Dense layer
    dense = Dense(dense_units, activation='relu')(dropout)
    
    # Output layer
    outputs = Dense(num_classes, activation='sigmoid')(dense)
    
    # Create the model
    model = Model(inputs=[inputs], outputs=outputs)
    
    return model

# Example usage
input_shape = X_test.shape  # Define the input shape of your data
num_classes = 1  # Number of classes (1 for binary classification)
model = create_attention_lstm_model(input_shape, num_classes)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_test, y_test, batch_size=32, epochs=10)


In [None]:
# Evaluate the model
loss, accuracy = model.evaluate(X_train, y_train)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)