In [1]:
# sc master running locally
sc.master

'local[*]'

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# spark is from the previous example.
sc = spark.sparkContext

In [4]:
path = "/user1/Suicide_Detection.csv"
df = spark.read.csv(path)

                                                                                

In [5]:
# The inferred schema can be visualized using the printSchema() method
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)



In [6]:
# Rename the columns
df = df.withColumnRenamed('_c0', 'ID')      # Rename _c0 to ID
df = df.withColumnRenamed('_c1', 'Text')    # Rename _c1 to Text
df = df.withColumnRenamed('_c2', 'Label')   # Rename _c2 to Label

# Show the DataFrame with renamed columns
df.show(5)

+----+--------------------+-----------+
|  ID|                Text|      Label|
+----+--------------------+-----------+
|null|                text|      class|
|   2|Ex Wife Threateni...|    suicide|
|   3|Am I weird I don'...|non-suicide|
|   4|"Finally 2020 is ...|non-suicide|
|   8|i need helpjust h...|    suicide|
+----+--------------------+-----------+
only showing top 5 rows



In [7]:
# Count Rows
row_count = df.count()
print("Total Rows:", row_count)

# Get Schema
df.printSchema()



Total Rows: 664905
root
 |-- ID: string (nullable = true)
 |-- Text: string (nullable = true)
 |-- Label: string (nullable = true)



                                                                                

In [8]:
df = df[df['Label'] == 'suicide']

row_count = df.count()
print("Total Rows:", row_count)



Total Rows: 65758


                                                                                

In [9]:
from pyspark.sql.functions import col

# "ID" column datatype changing
df = df.withColumn("ID", col("ID").cast("integer"))

In [10]:
# Remove rows with null values in any column
df = df.na.drop()

In [11]:
# Count Rows
row_count = df.count()
print("Total Rows:", row_count)

# Get Schema
df.printSchema()



Total Rows: 53612
root
 |-- ID: integer (nullable = true)
 |-- Text: string (nullable = true)
 |-- Label: string (nullable = true)



                                                                                

In [12]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
lists = stopwords.words('english')

[nltk_data] Downloading package wordnet to /home/hduser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/hduser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/hduser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
def tokenize(text):
    return word_tokenize(text)

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word.lower() not in stop_words]

def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]

# Register UDFs with Spark
tokenize_udf = udf(tokenize, StringType())
remove_stopwords_udf = udf(remove_stopwords, StringType())
lemmatize_udf = udf(lemmatize, StringType())

# Tokenize, remove stopwords, and lemmatize the "text" column
df = df.withColumn("tokens", tokenize_udf(df["text"]))
df = df.withColumn("filtered_tokens", remove_stopwords_udf(df["tokens"]))
df = df.withColumn("processed_text", lemmatize_udf(df["filtered_tokens"]))

In [14]:
df.select("processed_text").show(1, truncate=False)



+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|processed_text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+-----------

                                                                                

First row original Text:

Ex Wife Threatening SuicideRecently I left my wife for good because she has cheated on me twice and lied to me so much that I have decided to refuse to go back to her. As of a few days ago, she began threatening suicide. I have tirelessly spent these paat few days talking her out of it and she keeps hesitating because she wants to believe I'll come back. I know a lot of people will threaten this in order to get their way, but what happens if she really does? What do I do and how am I supposed to handle her death on my hands? I still love my wife but I cannot deal with getting cheated on again and constantly feeling insecure. I'm worried today may be the day she does it and I hope so much it doesn't happen.

First row after Tokenization, Lemmatization and Removing Stopwords:

Ex, Wife, Threatening, SuicideRecently, left, wife, good, cheated, twice, lied, much, decided, refuse, go, back, ., day, ago, ,, began, threatening, suicide, ., tirelessly, spent, paat, day, talking, keep, hesitating, want, believe, 'll, come, back, ., know, lot, people, threaten, order, get, way, ,, happens, really, ?, supposed, handle, death, hand, ?, still, love, wife, deal, getting, cheated, constantly, feeling, insecure, ., 'm, worried, today, may, day, hope, much, n't, happen, .

In [15]:
df.show(5)

[Stage 12:>                                                         (0 + 1) / 1]

+---+--------------------+-------+--------------------+--------------------+--------------------+
| ID|                Text|  Label|              tokens|     filtered_tokens|      processed_text|
+---+--------------------+-------+--------------------+--------------------+--------------------+
|  2|Ex Wife Threateni...|suicide|[Ex, Wife, Threat...|[Ex, Wife, Threat...|[Ex, Wife, Threat...|
|  8|i need helpjust h...|suicide|[i, need, helpjus...|[need, helpjust, ...|[need, helpjust, ...|
| 18|My life is over a...|suicide|[My, life, is, ov...|[life, 20, years,...|[life, 20, year, ...|
| 19|I took the rest o...|suicide|[I, took, the, re...|[took, rest, slee...|[took, rest, slee...|
| 21|Do you think gett...|suicide|[Do, you, think, ...|[think, getting, ...|[think, getting, ...|
+---+--------------------+-------+--------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [16]:
# List of columns to remove
columns_to_remove = ["Text", "tokens", "filtered_tokens", "lemmatized_tokens"]

# Remove the specified columns
df = df.drop(*columns_to_remove)

df.show(2)

[Stage 13:>                                                         (0 + 1) / 1]

+---+-------+--------------------+
| ID|  Label|      processed_text|
+---+-------+--------------------+
|  2|suicide|[Ex, Wife, Threat...|
|  8|suicide|[need, helpjust, ...|
+---+-------+--------------------+
only showing top 2 rows



                                                                                

In [17]:
from pyspark.sql.functions import col

# Reorder the columns
df = df.select("ID", "processed_text", "Label")

df.show(2)

[Stage 14:>                                                         (0 + 1) / 1]

+---+--------------------+-------+
| ID|      processed_text|  Label|
+---+--------------------+-------+
|  2|[Ex, Wife, Threat...|suicide|
|  8|[need, helpjust, ...|suicide|
+---+--------------------+-------+
only showing top 2 rows



                                                                                

In [18]:
from pyspark.sql.functions import lower, regexp_replace
from functools import reduce

# List of preprocessing functions
preprocessing_functions = [
    lambda text: lower(text),                                     # Convert text to lowercase
    lambda text: regexp_replace(text, r'\S+@\S+', ''),           # Remove email addresses
    lambda text: regexp_replace(text, r'<.*?>', ''),             # Remove HTML tags
    lambda text: regexp_replace(text, r'[^a-zA-Z0-9\s]', ' '),   # Remove special characters
    lambda text: regexp_replace(text, r'[^\x00-\x7F]+', '')     # Remove accented characters
]

# Apply preprocessing functions to the "processed_text" column
for func in preprocessing_functions:
    df = df.withColumn("processed_text", func(df["processed_text"]))

In [19]:
# Show the updated DataFrame
df.show(20, truncate=True)

[Stage 15:>                                                         (0 + 1) / 1]

+---+--------------------+-------+
| ID|      processed_text|  Label|
+---+--------------------+-------+
|  2| ex  wife  threat...|suicide|
|  8| need  helpjust  ...|suicide|
| 18| life  20  year  ...|suicide|
| 19| took  rest  slee...|suicide|
| 21| think  getting  ...|suicide|
| 23| arrested     fee...|suicide|
| 39|    trashlol  nor...|suicide|
| 41| best  way       ...|suicide|
| 44| feel  like  drow...|suicide|
| 45| worth     troubl...|suicide|
| 67|  ve  become  acc...|suicide|
| 69| after   ukif  di...|suicide|
| 74| cant  stop  feel...|suicide|
| 79| since  aunt   s ...|suicide|
| 86| owthe  past  unf...|suicide|
| 87| ive  suicidal  l...|suicide|
| 90| n t  know  goi  ...|suicide|
|113|      s  point  l...|suicide|
|120| want  people  kn...|suicide|
|127| suicidenote info...|suicide|
+---+--------------------+-------+
only showing top 20 rows



                                                                                

In [20]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, Word2Vec

In [21]:
tokenizer = Tokenizer(inputCol="processed_text", outputCol="words")
wordsData = tokenizer.transform(df)

In [22]:
wordsData.show(2)

[Stage 16:>                                                         (0 + 1) / 1]

+---+--------------------+-------+--------------------+
| ID|      processed_text|  Label|               words|
+---+--------------------+-------+--------------------+
|  2| ex  wife  threat...|suicide|[, ex, , wife, , ...|
|  8| need  helpjust  ...|suicide|[, need, , helpju...|
+---+--------------------+-------+--------------------+
only showing top 2 rows



                                                                                

In [23]:
word2Vec = Word2Vec(vectorSize=100, minCount=5, inputCol="words", outputCol="word_vectors")
model = word2Vec.fit(wordsData)
result = model.transform(wordsData)

                                                                                

In [24]:
# Assuming you have a label column named "label" and features column named "word_vectors"
labeledData = result.select("ID", "word_vectors","Label")

In [25]:
labeledData.show(5, truncate=True)

[Stage 22:>                                                         (0 + 1) / 1]

+---+--------------------+-------+
| ID|        word_vectors|  Label|
+---+--------------------+-------+
|  2|[0.09567599713777...|suicide|
|  8|[0.06323466275352...|suicide|
| 18|[0.09200903690944...|suicide|
| 19|[0.08549859951936...|suicide|
| 21|[0.14425905434414...|suicide|
+---+--------------------+-------+
only showing top 5 rows



                                                                                

In [26]:
word_vectors = labeledData.select("word_vectors").rdd.map(lambda x: x[0]).collect()

                                                                                

In [27]:
num_words = len(word_vectors[0])

In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score

2023-09-30 18:11:35.149607: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-30 18:11:39.392222: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-09-30 18:11:39.392375: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-09-30 18:11:39.413458: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-09-30 18:11:41.698040: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-30 18:11:41.743709: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

In [29]:
# Define the split ratios (80% for training, 10% for testing, 10% for validation)
train_ratio = 0.8
test_ratio = 0.1
validation_ratio = 0.1

# Split the data into training, testing, and validation
train_data, test_data, validation_data = df.randomSplit([train_ratio, test_ratio, validation_ratio], seed=42)

In [None]:
#  The number of rows in each split
print("Train Data Count: ", train_data.count())
print("Test Data Count: ", test_data.count())
print("Validation Data Count: ", validation_data.count())

In [30]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, SimpleRNN, Dense


In [31]:
# Assuming num_words is the number of unique words in your vocabulary
num_words = len(word_vectors[0])

EMBEDDING_DIM = 100

model = Sequential()
model.add(SimpleRNN(100, return_sequences=True, input_shape=(None, num_words)))  # Input shape adjusted
model.add(Dropout(0.2))
model.add(SimpleRNN(200, return_sequences=True))
model.add(Dropout(0.4))
model.add(SimpleRNN(100, return_sequences=False))
model.add(Dense(2, activation='softmax'))  # Output dimension adjusted for binary classification

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, None, 100)         20100     
                                                                 
 dropout (Dropout)           (None, None, 100)         0         
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, None, 200)         60200     
                                                                 
 dropout_1 (Dropout)         (None, None, 200)         0         
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 100)               30100     
                                                                 
 dense (Dense)               (None, 2)                 202       
                                                                 
Total params: 110602 (432.04 KB)
Trainable params: 11060

In [36]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

#EarlyStopping and ModelCheckpoint
from keras.callbacks import EarlyStopping, ModelCheckpoint
es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 5)
mc = ModelCheckpoint('./model.h5', monitor = 'val_accuracy', mode = 'max', verbose = 1, save_best_only = True)

In [38]:
history_embedding = model.fit(train_data, train_labels, 
                                epochs = 25, batch_size = 128, 
                                validation_data=(validation_data, validation_labels),
                                verbose = 1, callbacks= [es, mc]  )

NameError: name 'train_labels' is not defined