In [1]:
import pandas as pd
import numpy as np

# for handling text
import string
import nltk
import seaborn as sns
%matplotlib inline
import sklearn
import regex as re
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import time
from termcolor import colored
stop_words = set(stopwords.words('english'))

# for plots
import matplotlib as plty
import matplotlib.pyplot as plt
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.express as px
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to /home/konst/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/konst/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from text_class_basic_funcs import *

[nltk_data] Downloading package stopwords to /home/konst/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/konst/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2022-09-10 11:34:17.255713: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-10 11:34:17.348581: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-10 11:34:17.348597: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-09-10 11:34:17.363136: 

In [3]:
# Load the survey responses from the CSV next to the notebook.
df = pd.read_csv("./Sheet_1.csv")
# axis=1 drops every *column* that contains a missing value (rows are kept).
df.dropna(inplace=True, axis=1 )
# Preview the first four rows.
df.head(4)

Unnamed: 0,response_id,class,response_text
0,response_1,not_flagged,I try and avoid this sort of conflict
1,response_2,flagged,Had a friend open up to me about his mental addiction to weed and how it was taking over his life and making him depressed
2,response_3,flagged,"I saved a girl from suicide once. She was going to swallow a bunch of pills and I talked her out of it in a very calm, loving way."
3,response_4,not_flagged,i cant think of one really...i think i may have indirectly


In [4]:
# Class balance: number of responses per value of the 'class' column.
fig = px.histogram(
    data_frame=df,
    x='class',
    title='distribution of records',
)
fig.show()

### Clean the data 🚿<a id="clean"></a>


In [5]:
# Run the helper pipeline: tokenize_text() and clean() add the 'word_list' and
# 'cleaned_word_list' columns shown in the preview; join_tokens() turns the
# cleaned tokens back into a single string per response.
df = clean(tokenize_text(df))
df.loc[:, 'cleaned_text'] = join_tokens(df, 'cleaned_word_list')
display(df.head(1))

# Per-class views used for the word statistics further down.
df_f = df[df['class'].eq('flagged')]
df_n = df[df['class'].eq('not_flagged')]

Unnamed: 0,response_id,class,response_text,word_list,cleaned_word_list,cleaned_text
0,response_1,not_flagged,I try and avoid this sort of conflict,"[I, try, and, avoid, this, sort, of, conflict]","[try, avoid, sort, conflict]",try avoid sort conflict


### Create a data frame with word counts and TF-IDF scores


In [6]:
# Word-frequency frames, one per class, built from the cleaned token lists.
df_f_words_freq = convert_tokens_list_to_freq_df(df_f['cleaned_word_list'])
df_n_words_freq = convert_tokens_list_to_freq_df(df_n['cleaned_word_list'])

display(df_f_words_freq.head())

# TF-IDF frames, one per class, built from the re-joined cleaned text.
tfidf_scores_f = create_tfidf_df(df_f['cleaned_text'])
tfidf_scores_n = create_tfidf_df(df_n['cleaned_text'])

# Combine the frequency and TF-IDF columns into one frame per class.
f_words_df = merge(df_f_words_freq, tfidf_scores_f)
n_words_df = merge(df_n_words_freq, tfidf_scores_n)

display(f_words_df.head(4), n_words_df.head(4))

Unnamed: 0,words,freq
0,friend,13
1,people,10
2,friends,9
3,would,7
4,going,6


Unnamed: 0,words,freq,tfidf_score_sum
0,friend,13,1.848679
1,people,10,0.877912
2,friends,9,1.279314
3,would,7,0.569055


Unnamed: 0,words,freq,tfidf_score_sum
0,friends,15,3.921829
1,people,10,2.23048
2,friend,9,2.241216
3,helped,9,2.909667


In [7]:
# Number of leading rows (words) to plot; later cells reuse this value.
n = 30

# Bar chart of the first n rows of f_words_df (flagged responses). Bar height
# and colour encode the word count; the number above each bar is the
# 'tfidf_score_sum' column formatted to two decimals.
fig = px.bar(
    data_frame=f_words_df[:n], x='words', y='freq', color='freq',
    text='tfidf_score_sum',
    # plotly express matches `labels` keys against column names, so the keys
    # must be 'words'/'freq'; the previous 'x'/'y' keys were silently ignored.
    labels={'words': 'words', 'freq': 'frequency'},
    title='Frequency of words seen in <b> Flagged </b>records (with TFIDF in parentheses)')

fig.update_xaxes(tickangle=-45)
fig.update_traces(
    texttemplate='%{text:.2f}', textposition='outside',
    textfont_size=8)

fig.show()

In [8]:
# Bar chart of the first n rows of n_words_df (not-flagged responses). Bar
# height and colour encode the word count; the number above each bar is the
# 'tfidf_score_sum' column formatted to two decimals.
fig = px.bar(
    data_frame=n_words_df[:n], x='words', y='freq', color='freq',
    text='tfidf_score_sum',
    # plotly express matches `labels` keys against column names, so the keys
    # must be 'words'/'freq'; 'x'/'y' keys are silently ignored.
    labels={'words': 'words', 'freq': 'frequency'},
    # This figure shows the not-flagged class; the title used to say "Flagged".
    title='Frequency of words seen in <b> Not Flagged </b>records (with TFIDF in parentheses)')

fig.update_xaxes(tickangle=-45)
fig.update_traces(
    texttemplate='%{text:.2f}', textposition='outside',
    textfont_size=8)

fig.show()

In [9]:
# Give each class's statistic columns a distinguishing suffix ('_f' / '_n')
# while keeping the 'words' column name unchanged, so the two frames can be
# combined by merge() (helper from text_class_basic_funcs; presumably joins
# on 'words' — verify against its definition).
#
# The suffixed frames get their own names instead of overwriting
# f_words_df / n_words_df: re-running this cell no longer stacks suffixes
# ('freq_n_n', ...), and the plotting cells that read the original 'freq'
# column keep working after this cell has run.
n_words_suffixed = n_words_df.add_suffix('_n').rename(columns={'words_n': 'words'})
f_words_suffixed = f_words_df.add_suffix('_f').rename(columns={'words_f': 'words'})

merged_df = merge(f_words_suffixed, n_words_suffixed)
display(merged_df)

Unnamed: 0,words,freq_f,tfidf_score_sum_f,freq_n,tfidf_score_sum_n
0,friend,13,1.848679,9.0,2.241216
1,people,10,0.877912,10.0,2.230480
2,friends,9,1.279314,15.0,3.921829
3,would,7,0.569055,4.0,0.877480
4,going,6,1.168151,5.0,1.022397
...,...,...,...,...,...
343,spent,1,0.170482,,
344,nights,1,0.170482,,
345,letting,1,0.170482,2.0,0.705452
346,vent,1,0.170482,,


In [10]:
# Restrict the comparison to the first n rows of the merged frame.
merged_df_sample = merged_df[:n]

# One bar trace per class, sharing the same x axis (the words).
fig = go.Figure()
fig.add_trace(go.Bar(
    name='Flagged',
    x=merged_df_sample['words'],
    y=merged_df_sample['freq_f'],
    text=merged_df_sample['freq_f'],
    marker_color='#BA0F30',
))
fig.add_trace(go.Bar(
    name='Not Flagged',
    x=merged_df_sample['words'],
    y=merged_df_sample['freq_n'],
    text=merged_df_sample['freq_n'],
    marker_color='#98D7C2',
))

fig.update_xaxes(tickangle=-45)
fig.update_traces(texttemplate='%{text:d}', textposition='outside', textfont_size=8)

# Place the two traces side by side rather than stacked.
fig.update_layout(
    barmode='group',
    title_text='most frequent Flagged/not flagged words count comparison',
)
fig.show()

### Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes estimator; it is cross-validated on the TF-IDF features in the next cell.
nb = GaussianNB()


In [12]:
# Feature matrix (and its vocabulary) from the cleaned text; targets are the class labels.
transformed_data, words = get_tfidf_words_and_array(df['cleaned_text'])
y = df['class']

# 3-fold cross-validation of the Naive Bayes estimator.
scores = cross_val_score(estimator=nb, X=transformed_data, y=y, cv=3)
display(scores)

array([0.44444444, 0.37037037, 0.69230769])

### Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap sampling and random feature
# subsets), so pin the seed: otherwise every run of this cell reports
# different cross-validation scores and the notebook is not reproducible.
random_forest = RandomForestClassifier(random_state=42)

# 3-fold cross-validation on the same TF-IDF features and labels as above.
scores = cross_val_score(random_forest, transformed_data, y, cv=3)
display(scores)

array([0.7037037 , 0.7037037 , 0.69230769])

# Deep Learning <a id="deeplearning"></a>

In [14]:
import tensorflow as tf
import keras

from tensorflow.keras import layers, callbacks
from sklearn.model_selection import KFold
from keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, LSTM, Dense, SimpleRNN, BatchNormalization

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
# Raw response strings as an array.
text = df['response_text'].values

# Both padding and truncation are applied at the end of a sequence.
pad_type = trunc_type = 'post'

# Coerce every response to str, then fit the Keras tokenizer on them.
texts = list(map(str, df['response_text']))
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Mapping {'word': idx} learned by the tokenizer.
word_index = tokenizer.word_index

# Each response as a list of word indices.
encoded_seqs = tokenizer.texts_to_sequences(texts)


In [16]:
# Length of every encoded sequence. Despite the variable name, these are the
# lengths *before* padding (pad_sequences is only applied in the next cell).
padded_lengths = list(map(len, encoded_seqs))
px.histogram(padded_lengths)

In [17]:
# Every sequence is padded or truncated to exactly this many tokens.
max_len = 60
padded_texts = pad_sequences(
    encoded_seqs,
    maxlen=max_len,
    padding=pad_type,
    truncating=trunc_type,
)

In [18]:
# Output the results of our work
random_i = np.random.randint(len(df))

print("Encoded sample:", encoded_seqs[random_i])
print("Padded sample:", padded_texts[random_i])
print("Padded shape:", padded_texts.shape)
print("sequences data type:", type(encoded_seqs))
print("Padded  sequences data type:", type(padded_texts))

Encoded sample: [77, 31, 34, 18, 84, 1, 14, 597, 24, 61, 598, 291, 599, 42, 21, 292, 166, 44, 3, 190, 293, 11, 14, 75, 8, 70, 19, 292, 119, 8, 600, 170, 6, 249, 53, 38, 291, 84, 1, 14, 601, 1, 190, 34, 602, 61, 603, 17, 5, 604]
Padded sample: [ 77  31  34  18  84   1  14 597  24  61 598 291 599  42  21 292 166  44
   3 190 293  11  14  75   8  70  19 292 119   8 600 170   6 249  53  38
 291  84   1  14 601   1 190  34 602  61 603  17   5 604   0   0   0   0
   0   0   0   0   0   0]
Padded shape: (80, 60)
sequences data type: <class 'list'>
Padded  sequences data type: <class 'numpy.ndarray'>


### Simple RNN

In [19]:
# Size of the index space used for one-hot encoding and the Embedding layers:
# one slot per word known to the tokenizer plus one extra.
# NOTE(review): the extra slot presumably accounts for index 0, which the padded
# samples printed above use as filler — confirm against the Keras Tokenizer docs.
num_features = len(word_index) + 1

In [20]:
# Sanity check: encode the first padded response with the helper and inspect the
# resulting shape (the recorded output is (max_len, num_features) = (60, 678)).
sample = to_categorical_tensor(padded_texts[0], num_features, max_len)
print(sample.shape)

(60, 678)


2022-09-10 11:34:19.093379: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-09-10 11:34:19.093411: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-09-10 11:34:19.093426: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (DESKTOP-ASL3LMV): /proc/driver/nvidia/version does not exist
2022-09-10 11:34:19.093686: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [21]:
# One-hot encode every padded sequence (one row of padded_texts per response).
X = [
    to_categorical_tensor(padded_seq, num_features, max_len)
    for padded_seq in padded_texts
]

# Stack the per-response encodings into one (samples, max_len, num_features) array.
X_final = np.reshape(X, (len(X), max_len, num_features))

In [22]:
# Binary targets: 1 for 'flagged' responses, 0 for everything else.
labels = np.where(df['class'].to_numpy() == 'flagged', 1, 0)

In [23]:
def create_simpleRNN(rnn_nodes=32):
    """Build and compile a single-layer SimpleRNN binary classifier.

    The input shape is (max_len, num_features), both read from the notebook's
    global scope. The recurrent layer is followed by batch normalization and
    a single sigmoid output unit.

    Args:
        rnn_nodes: Number of units in the SimpleRNN layer (default 32).

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    model = Sequential([
        # Use the argument: the layer size used to be hard-coded to 32, so
        # passing a different `rnn_nodes` had no effect.
        SimpleRNN(rnn_nodes, input_shape=(max_len, num_features)),
        BatchNormalization(),
        Dense(1, activation="sigmoid")
        ])
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

    return model

# Build one instance with the default size to inspect the architecture.
model = create_simpleRNN()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 32)                22752     
                                                                 
 batch_normalization (BatchN  (None, 32)               128       
 ormalization)                                                   
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 22,913
Trainable params: 22,849
Non-trainable params: 64
_________________________________________________________________


In [24]:
# Training settings for the deep-learning experiments.
# NOTE(review): none of the evaluate(...) calls in this notebook pass these three
# values, and the recorded training logs read "Epoch 1/20" rather than 3 epochs —
# they look unused; confirm against evaluate() in text_class_basic_funcs.
n_epochs = 3
n_splits = 3
batch_s = 16 # the default is 32
# Results returned by evaluate(), keyed by model name.
mean_val_acc_models = {}

In [25]:
# Run the evaluate() helper on the one-hot encoded tensors; it receives the
# model factory (not a model instance) and its result is stored under the
# model's name.
simple_rnn_result = evaluate(X=X_final, Y=labels, model_=create_simpleRNN)
mean_val_acc_models['SimpleRNN'] = simple_rnn_result
print(mean_val_acc_models)


fold 1
Epoch 1/20
2/2 - 1s - loss: 1.0035 - accuracy: 0.4151 - val_loss: 0.6716 - val_accuracy: 0.7037 - 723ms/epoch - 362ms/step
Epoch 2/20
2/2 - 0s - loss: 0.6977 - accuracy: 0.6226 - val_loss: 0.6647 - val_accuracy: 0.5926 - 39ms/epoch - 19ms/step
Epoch 3/20
2/2 - 0s - loss: 0.5506 - accuracy: 0.7547 - val_loss: 0.6654 - val_accuracy: 0.5926 - 36ms/epoch - 18ms/step
Epoch 4/20
2/2 - 0s - loss: 0.4739 - accuracy: 0.8679 - val_loss: 0.6645 - val_accuracy: 0.5926 - 34ms/epoch - 17ms/step
Epoch 5/20
2/2 - 0s - loss: 0.4397 - accuracy: 0.9245 - val_loss: 0.6611 - val_accuracy: 0.5926 - 34ms/epoch - 17ms/step
Epoch 6/20
2/2 - 0s - loss: 0.3566 - accuracy: 0.9245 - val_loss: 0.6560 - val_accuracy: 0.6296 - 33ms/epoch - 17ms/step
Epoch 7/20
2/2 - 0s - loss: 0.3208 - accuracy: 0.9434 - val_loss: 0.6484 - val_accuracy: 0.6296 - 37ms/epoch - 18ms/step
Epoch 8/20
2/2 - 0s - loss: 0.2622 - accuracy: 0.9623 - val_loss: 0.6397 - val_accuracy: 0.6296 - 35ms/epoch - 18ms/step
Epoch 9/20
2/2 - 0s - 

### GRU

In [26]:
def create_GRU(embedding_dim=32, gru_d=32):
    """Build and compile an Embedding -> GRU binary classifier.

    The embedding layer takes integer sequences of length max_len over an
    index space of size num_features (both read from the notebook's globals).

    Args:
        embedding_dim: Output dimension of the Embedding layer.
        gru_d: Number of units in the GRU layer.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    net = Sequential()
    net.add(Embedding(input_dim=num_features, output_dim=embedding_dim, input_length=max_len))
    net.add(GRU(gru_d))
    net.add(BatchNormalization())
    net.add(Dense(1, activation="sigmoid"))

    net.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return net

# Build one instance with the defaults to inspect the architecture.
model = create_GRU()
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 60, 32)            21696     
                                                                 
 gru (GRU)                   (None, 32)                6336      
                                                                 
 batch_normalization_4 (Batc  (None, 32)               128       
 hNormalization)                                                 
                                                                 
 dense_4 (Dense)             (None, 1)                 33        
                                                                 
Total params: 28,193
Trainable params: 28,129
Non-trainable params: 64
_________________________________________________________________


In [27]:
# Evaluate the GRU factory on the integer-encoded padded sequences (the model's
# Embedding layer consumes indices, so no one-hot tensor is passed here).
mean_val_acc_models['GRU'] = evaluate(X=padded_texts, Y=labels, model_=create_GRU)


fold 1
Epoch 1/20
2/2 - 1s - loss: 0.7807 - accuracy: 0.2264 - val_loss: 0.6943 - val_accuracy: 0.4074 - 1s/epoch - 691ms/step
Epoch 2/20
2/2 - 0s - loss: 0.7053 - accuracy: 0.2453 - val_loss: 0.6913 - val_accuracy: 0.5926 - 54ms/epoch - 27ms/step
Epoch 3/20
2/2 - 0s - loss: 0.6459 - accuracy: 0.8302 - val_loss: 0.6888 - val_accuracy: 0.5926 - 50ms/epoch - 25ms/step
Epoch 4/20
2/2 - 0s - loss: 0.6145 - accuracy: 0.7925 - val_loss: 0.6866 - val_accuracy: 0.5926 - 51ms/epoch - 26ms/step
Epoch 5/20
2/2 - 0s - loss: 0.5920 - accuracy: 0.7925 - val_loss: 0.6848 - val_accuracy: 0.5926 - 52ms/epoch - 26ms/step
Epoch 6/20
2/2 - 0s - loss: 0.5865 - accuracy: 0.7736 - val_loss: 0.6833 - val_accuracy: 0.5926 - 50ms/epoch - 25ms/step
Epoch 7/20
2/2 - 0s - loss: 0.5594 - accuracy: 0.7736 - val_loss: 0.6822 - val_accuracy: 0.5926 - 48ms/epoch - 24ms/step
Epoch 8/20
2/2 - 0s - loss: 0.5534 - accuracy: 0.7925 - val_loss: 0.6813 - val_accuracy: 0.5926 - 50ms/epoch - 25ms/step
Epoch 9/20
2/2 - 0s - los

### Long Short-term Memory (LSTM)

In [28]:
def create_LSTM(embedding_dim=32, lstm_u=32):
    """Build and compile an Embedding -> LSTM binary classifier.

    The embedding layer takes integer sequences of length max_len over an
    index space of size num_features (both read from the notebook's globals).

    Args:
        embedding_dim: Output dimension of the Embedding layer.
        lstm_u: Number of units in the LSTM layer.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    net = Sequential()
    net.add(Embedding(input_dim=num_features, output_dim=embedding_dim, input_length=max_len))
    net.add(LSTM(lstm_u))
    net.add(BatchNormalization())
    net.add(Dense(1, activation="sigmoid"))

    net.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return net

# Build one instance with the defaults to inspect the architecture.
model = create_LSTM()
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 60, 32)            21696     
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 batch_normalization_8 (Batc  (None, 32)               128       
 hNormalization)                                                 
                                                                 
 dense_8 (Dense)             (None, 1)                 33        
                                                                 
Total params: 30,177
Trainable params: 30,113
Non-trainable params: 64
_________________________________________________________________


In [29]:
# Evaluate the LSTM factory on the integer-encoded padded sequences (the model's
# Embedding layer consumes indices, so no one-hot tensor is passed here).
mean_val_acc_models['LSTM'] = evaluate(X=padded_texts, Y=labels, model_=create_LSTM)


fold 1
Epoch 1/20
2/2 - 1s - loss: 0.7553 - accuracy: 0.2264 - val_loss: 0.6930 - val_accuracy: 0.4444 - 1s/epoch - 645ms/step
Epoch 2/20
2/2 - 0s - loss: 0.6700 - accuracy: 0.7925 - val_loss: 0.6904 - val_accuracy: 0.5926 - 61ms/epoch - 31ms/step
Epoch 3/20
2/2 - 0s - loss: 0.6260 - accuracy: 0.7925 - val_loss: 0.6881 - val_accuracy: 0.5926 - 51ms/epoch - 25ms/step
Epoch 4/20
2/2 - 0s - loss: 0.5894 - accuracy: 0.7736 - val_loss: 0.6861 - val_accuracy: 0.5926 - 48ms/epoch - 24ms/step
Epoch 5/20
2/2 - 0s - loss: 0.5770 - accuracy: 0.7736 - val_loss: 0.6844 - val_accuracy: 0.5926 - 51ms/epoch - 26ms/step
Epoch 6/20
2/2 - 0s - loss: 0.5630 - accuracy: 0.7736 - val_loss: 0.6831 - val_accuracy: 0.5926 - 46ms/epoch - 23ms/step
Epoch 7/20
2/2 - 0s - loss: 0.5566 - accuracy: 0.7925 - val_loss: 0.6821 - val_accuracy: 0.5926 - 44ms/epoch - 22ms/step
Epoch 8/20
2/2 - 0s - loss: 0.5340 - accuracy: 0.7925 - val_loss: 0.6814 - val_accuracy: 0.5926 - 43ms/epoch - 22ms/step
Epoch 9/20
2/2 - 0s - los

In [30]:
# Show the collected results for all three recurrent models.
# NOTE(review): the recorded output holds a list of three values per model, so
# despite the name these look like per-fold scores rather than a single mean — confirm.
display(mean_val_acc_models)

{'SimpleRNN': [0.6444444417953491, 0.5962962985038758, 0.596153861284256],
 'GRU': [0.5833333373069763, 0.7777777910232544, 0.6884615540504455],
 'LSTM': [0.578947376263769, 0.7777777910232544, 0.692307710647583]}