In [1]:
import numpy as np 
import pandas as pd
import re
import string
import gensim
import pickle
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from nltk import punkt
from nltk import wordnet

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout, GRU, Bidirectional, Conv1D, GlobalMaxPooling1D
from keras.utils.np_utils import to_categorical
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

In [3]:
data_columns = ["target", "tweet", "username"]
data_encoding = "ISO-8859-1"
data=pd.read_csv('./TwitterDataset.csv',encoding = data_encoding, usecols=data_columns)

X = data.iloc[:, 1]
Y = data.iloc[:, 0]

print(X.head())
print(data['target'].value_counts())


0    the real reason why you're sad? you're attache...
1        my biggest problem is overthinking everything
2    the worst sadness is the sadness you've taught...
3    i cannot make you understand. i cannot make an...
4    i don't think anyone really understands how ti...
Name: tweet, dtype: object
0    4695
1    3440
Name: target, dtype: int64


In [4]:
def preprocess(tweet):
    username = "@\S+"
    new_tweet = re.sub(username, ' ',tweet) # Remove @tags
    
    new_tweet = new_tweet.lower() # Smart lowercase
    
    new_tweet = re.sub(r'\d+', ' ', new_tweet) # Remove numbers
    
    text_noise = "https?:\S+|http?:\S|[^A-Za-z0-9]+" 
        new_tweet = re.sub(text_noise, ' ', new_tweet) # Remove links
    
    new_tweet = new_tweet.translate(new_tweet.maketrans('','',string.punctuation)) # Remove Punctuation
    
    new_tweet = new_tweet.strip() # Remove white spaces
    
    new_tweet = word_tokenize(new_tweet) # Tokenize into words
    
    new_tweet = ' '.join([word for word in new_tweet if word.isalpha()]) # Remove non alphabetic tokens
    
    stop_words = set(stopwords.words('english'))
    new_tweet = ' '.join([word for word in new_tweet.split() if not word in stop_words]) # Filter out stop words
    
    lemmatizer = WordNetLemmatizer()
    new_tweet = ' '.join([lemmatizer.lemmatize(word,"v") for word in new_tweet.split()]) # Word Lemmatization
    
    return new_tweet

In [5]:
X = X.apply(preprocess)

print(X)

0       real reason sad attach people distant pay atte...
1                 biggest problem overthinking everything
2                        worst sadness sadness teach hide
3       make understand make anyone understand happen ...
4       think anyone really understand tire act okay a...
                              ...                        
8130    cardi b want trademark catchphrase okurr think...
8131    bet kellyanne george conway pretty disturb mak...
8132    fan always ask watch old stuff finally answer ...
8133    ray romano hilarious comedian kind soul rare n...
8134    mueller report may finish mine next week johnn...
Name: tweet, Length: 8135, dtype: object


In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print("TRAIN size:", len(X_train))
print("TEST size:", len(X_test))

TRAIN size: 6508
TEST size: 1627


In [6]:
w2v_size = 300
w2v_win = 7
w2v_epoch = 32
w2v_mincount = 10

document = [tweet.split() for tweet in X_train]
word2vec_model = gensim.models.word2vec.Word2Vec(vector_size=w2v_size,
                                                window=w2v_win,
                                                min_count=w2v_mincount,
                                                workers=8)

word2vec_model.build_vocab(document)

In [7]:
words = word2vec_model.wv.index_to_key
vocabulary_size = len(words)
print('Vocabulary_size ::: ',vocabulary_size)

Vocabulary_size :::  964


In [8]:
word2vec_model.train(document,total_examples=len(document),epochs=w2v_epoch)

(1185735, 1851840)

In [9]:
max_sequence_length = 300
vector_size = 300

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
unique_tokens = tokenizer.word_index
tokens_size = len(unique_tokens)
print('No.of unique tokens === %s'%tokens_size)

No.of unique tokens === 8635


In [10]:
X_train_padded = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_padded,maxlen=max_sequence_length)

In [11]:
vector_matrix = np.zeros((tokens_size+1,w2v_size))

for word,i in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        vector_matrix[i]=word2vec_model.wv[word]

print('vector matrix shape === ',vector_matrix.shape)

vector_matrix

vector matrix shape ===  (8636, 300)


array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.1183901 ,  0.3607077 , -0.17306001, ..., -0.01970666,
         0.05476146,  0.02083734],
       [-0.57906866,  0.11009973, -0.4302156 , ..., -0.55453247,
         0.43191254, -0.45374912],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [12]:
X_test_padded = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_padded,maxlen=max_sequence_length)

In [22]:
def build_cnn_model(optimizer, dropout_rate, n_neurons, cnn_activation, dense_activation):
    cnn_model = Sequential(name='cnn_model')
    cnn_model.add(Embedding(tokens_size+1,w2v_size, weights=[vector_matrix],input_length=max_sequence_length,trainable=False))
    cnn_model.add(Conv1D(300,3, activation=cnn_activation))
    cnn_model.add(GlobalMaxPooling1D())
    cnn_model.add(Dense(n_neurons, activation= dense_activation))
    cnn_model.add(Dropout(dropout_rate))
    cnn_model.add(Dense(1, activation='sigmoid'))
    cnn_model.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])
    return cnn_model

In [26]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV
from keras.callbacks import EarlyStopping

es_cb = EarlyStopping(monitor='val_loss', patience=20, mode='min', restore_best_weights=True)
classifier = KerasClassifier(build_cnn_model)

classifier.get_params().keys()

params = {
    'optimizer' : ['sgd', 'adam'],
    'dropout_rate' : [0.2, 0.3, 0.1],
    'n_neurons' : [300, 128, 64],
    'cnn_activation' : ['relu', 'elu'],
    'dense_activation' : ['relu', 'elu','sigmoid']
}

rand_search = RandomizedSearchCV(classifier, params, n_iter=10, cv=3, verbose=1, scoring='accuracy', return_train_score=True)
rand_search.fit(X_train_padded,Y_train,validation_split=0.1, epochs=100,
callbacks=es_cb)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Epoch 1/100


  classifier = KerasClassifier(build_cnn_model)


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13

RandomizedSearchCV(cv=3,
                   estimator=<keras.wrappers.scikit_learn.KerasClassifier object at 0x7f3534122910>,
                   param_distributions={'cnn_activation': ['relu', 'elu'],
                                        'dense_activation': ['relu', 'elu',
                                                             'sigmoid'],
                                        'dropout_rate': [0.2, 0.3, 0.1],
                                        'n_neurons': [300, 128, 64],
                                        'optimizer': ['sgd', 'adam']},
                   return_train_score=True, scoring='accuracy', verbose=1)

In [27]:
display(rand_search.best_params_)
display(rand_search.best_score_)

{'optimizer': 'sgd',
 'n_neurons': 300,
 'dropout_rate': 0.3,
 'dense_activation': 'sigmoid',
 'cnn_activation': 'elu'}

0.8581745713053436

In [28]:
search_results_df = pd.DataFrame(rand_search.cv_results_)
search_results_df.to_csv('./random_search_results.csv', index=False, mode='a')

In [29]:
display(search_results_df)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_optimizer,param_n_neurons,param_dropout_rate,param_dense_activation,param_cnn_activation,params,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,33.837798,1.622423,0.378405,0.011062,adam,300,0.1,elu,relu,"{'optimizer': 'adam', 'n_neurons': 300, 'dropo...",...,0.860765,0.839096,0.846651,0.009988,10,0.890503,0.927403,0.893063,0.903656,0.016824
1,77.986314,9.0216,0.338913,0.006649,sgd,300,0.3,relu,elu,"{'optimizer': 'sgd', 'n_neurons': 300, 'dropou...",...,0.86722,0.839096,0.853872,0.011526,5,0.923698,0.933856,0.903434,0.920329,0.012646
2,37.535931,3.114021,0.352106,0.006372,adam,64,0.1,elu,relu,"{'optimizer': 'adam', 'n_neurons': 64, 'dropou...",...,0.863993,0.835408,0.848802,0.011739,8,0.938912,0.944457,0.883153,0.922174,0.027685
3,161.624404,1.185337,0.348859,0.003254,sgd,300,0.3,sigmoid,elu,"{'optimizer': 'sgd', 'n_neurons': 300, 'dropou...",...,0.870447,0.846012,0.858175,0.009976,1,0.903873,0.902743,0.907582,0.904733,0.002067
4,37.881663,0.314353,0.365941,0.007678,adam,300,0.1,relu,relu,"{'optimizer': 'adam', 'n_neurons': 300, 'dropo...",...,0.86722,0.846012,0.854948,0.008974,4,0.921853,0.911731,0.918875,0.917487,0.004248
5,72.902915,10.871068,0.367617,0.00601,sgd,128,0.1,elu,relu,"{'optimizer': 'sgd', 'n_neurons': 128, 'dropou...",...,0.85846,0.840018,0.85172,0.008306,7,0.942831,0.901821,0.89675,0.913801,0.020632
6,37.074926,1.268975,0.35286,0.014474,adam,300,0.3,sigmoid,elu,"{'optimizer': 'adam', 'n_neurons': 300, 'dropo...",...,0.861226,0.829414,0.846957,0.013192,9,0.895343,0.866559,0.875778,0.879227,0.012002
7,92.074269,14.403363,0.356855,0.00307,sgd,128,0.2,relu,relu,"{'optimizer': 'sgd', 'n_neurons': 128, 'dropou...",...,0.870447,0.844168,0.854948,0.011235,3,0.95574,0.941231,0.907352,0.934774,0.020275
8,38.630422,2.043095,0.35895,0.006703,adam,300,0.3,elu,elu,"{'optimizer': 'adam', 'n_neurons': 300, 'dropo...",...,0.872752,0.833564,0.852182,0.016058,6,0.893499,0.926481,0.883383,0.901121,0.018401
9,79.316304,9.600088,0.373266,0.013998,sgd,64,0.1,elu,elu,"{'optimizer': 'sgd', 'n_neurons': 64, 'dropout...",...,0.873213,0.837252,0.855562,0.014689,2,0.931074,0.934086,0.901821,0.922327,0.014552


In [30]:
best_model=rand_search.best_estimator_.model
display(best_model.evaluate(X_train_padded,Y_train), 'train')
display(best_model.evaluate(X_test_padded, Y_test), 'test')



[0.21421271562576294, 0.9137983918190002]

'train'



[0.3723171055316925, 0.8438844680786133]

'test'

In [33]:
from sklearn.metrics import classification_report

predictions = best_model.predict(X_train_padded)
predictions = [1 if p >= 0.5 else 0 for p in predictions]

print(classification_report(Y_train, predictions, target_names=['not depressed', 'depressed']))

               precision    recall  f1-score   support

not depressed       0.93      0.93      0.93      3777
    depressed       0.90      0.90      0.90      2731

     accuracy                           0.91      6508
    macro avg       0.91      0.91      0.91      6508
 weighted avg       0.91      0.91      0.91      6508



In [35]:
predictions = best_model.predict(X_test_padded)
predictions = [1 if p >= 0.5 else 0 for p in predictions]

print(classification_report(Y_test, predictions, target_names=['not depressed', 'depressed']))

               precision    recall  f1-score   support

not depressed       0.88      0.84      0.86       918
    depressed       0.81      0.85      0.83       709

     accuracy                           0.84      1627
    macro avg       0.84      0.84      0.84      1627
 weighted avg       0.85      0.84      0.84      1627



In [36]:
model_json = best_model.to_json()
with open('./model/cnn.json', 'w') as json_file:
    json_file.write(model_json)
best_model.save_weights('./model/weights.h5')