In [1]:
# sc master running locally
sc.master

import warnings
warnings.filterwarnings("ignore")

# spark is from the previous example.
sc = spark.sparkContext

In [2]:
path = "/user1/Suicide_Detection.csv"
df = spark.read.csv(path)

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [3]:
# The inferred schema can be visualized using the printSchema() method
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)



In [4]:
# Rename the columns
df = df.withColumnRenamed('_c0', 'ID')      # Rename _c0 to ID
df = df.withColumnRenamed('_c1', 'Text')    # Rename _c1 to Text
df = df.withColumnRenamed('_c2', 'Label')   # Rename _c2 to Label

# Show the DataFrame with renamed columns
df.show(5)

+----+--------------------+-----------+
|  ID|                Text|      Label|
+----+--------------------+-----------+
|null|                text|      class|
|   2|Ex Wife Threateni...|    suicide|
|   3|Am I weird I don'...|non-suicide|
|   4|"Finally 2020 is ...|non-suicide|
|   8|i need helpjust h...|    suicide|
+----+--------------------+-----------+
only showing top 5 rows



In [5]:
# Count Rows
row_count = df.count()
print("Total Rows:", row_count)

# Get Schema
df.printSchema()

[Stage 2:>                                                          (0 + 4) / 4]

Total Rows: 664905
root
 |-- ID: string (nullable = true)
 |-- Text: string (nullable = true)
 |-- Label: string (nullable = true)



                                                                                

In [6]:
from pyspark.sql.functions import col

# "ID" column datatype changing
df = df.withColumn("ID", col("ID").cast("integer"))

In [7]:
# Remove rows with null values in any column
df = df.na.drop()

In [8]:
from pyspark.sql.functions import col

# Filter the DataFrame to include only 'suicide' and 'non-suicide' labels
df = df.filter((col('Label') == 'suicide') | (col('Label') == 'non-suicide'))

# Show the resulting DataFrame
df.show()

+---+--------------------+-----------+
| ID|                Text|      Label|
+---+--------------------+-----------+
|  2|Ex Wife Threateni...|    suicide|
|  3|Am I weird I don'...|non-suicide|
|  4|"Finally 2020 is ...|non-suicide|
|  8|i need helpjust h...|    suicide|
| 18|My life is over a...|    suicide|
| 19|I took the rest o...|    suicide|
| 21|Do you think gett...|    suicide|
| 23|Been arrested - f...|    suicide|
| 24|Fuck the verizon ...|non-suicide|
| 31|Me: I know I have...|non-suicide|
| 37|Guys I want frien...|non-suicide|
| 39|I’m trashLol I no...|    suicide|
| 41|What is the best ...|    suicide|
| 43|Today's fact is R...|non-suicide|
| 44|I feel like I am ...|    suicide|
| 45|Is it worth it?Is...|    suicide|
| 47|I triple nipple d...|non-suicide|
| 54|Hey, im gonna sle...|non-suicide|
| 57|I learnt a new sk...|non-suicide|
| 62|Why does no one u...|non-suicide|
+---+--------------------+-----------+
only showing top 20 rows



In [9]:
# Assuming 'df' is your DataFrame and 'Label' is the column of interest
label_counts = df.groupBy("Label").count().orderBy(col("count").desc())

# Show the label counts
label_counts.show()



+-----------+-----+
|      Label|count|
+-----------+-----+
|non-suicide|76506|
|    suicide|53612|
+-----------+-----+



                                                                                

In [10]:
df.show(2, truncate=False)

+---+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+
|ID |Text                                                                                                                                                                                                                                                                 

In [11]:
from pyspark.ml.feature import Tokenizer

# Create a Tokenizer object
tokenizer = Tokenizer(inputCol="Text", outputCol="tokens")

# Tokenize the "processed_text" column
df = tokenizer.transform(df)

# Show the result
df.show(truncate=False)

+---+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [12]:
# Show the "tokens" column
token_df = df.select("tokens")

token_df.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [13]:
df.show(5, truncate=True)

+---+--------------------+-----------+--------------------+
| ID|                Text|      Label|              tokens|
+---+--------------------+-----------+--------------------+
|  2|Ex Wife Threateni...|    suicide|[ex, wife, threat...|
|  3|Am I weird I don'...|non-suicide|[am, i, weird, i,...|
|  4|"Finally 2020 is ...|non-suicide|["finally, 2020, ...|
|  8|i need helpjust h...|    suicide|[i, need, helpjus...|
| 18|My life is over a...|    suicide|[my, life, is, ov...|
+---+--------------------+-----------+--------------------+
only showing top 5 rows



In [14]:
from pyspark.sql.functions import explode, desc
from pyspark.sql import functions as F

# Explode the tokens into separate rows
counting_df = df.withColumn("word", explode(df["tokens"]))

# Group by words and count their occurrences
word_counts = counting_df.groupBy("word").count()

# Order the word counts in descending order
word_counts = word_counts.orderBy(desc("count"))

# Select the top 50 words
top_50_words = word_counts.limit(50)

In [15]:
# Show the top 50 words and their counts
top_50_words.show(50)



+-------+------+
|   word| count|
+-------+------+
|      i|476118|
|     to|277623|
|    and|236422|
|    the|167139|
|     my|165837|
|      a|164026|
|     of|106558|
|     it| 87695|
|     me| 85530|
|   just| 83531|
|     is| 79251|
|     in| 78412|
|    but| 77786|
|   that| 72752|
|   have| 72492|
|    for| 70323|
|     so| 66496|
|   this| 57312|
|   like| 54352|
|    i'm| 53014|
|     be| 51307|
|   want| 48788|
|   with| 48505|
|    you| 47600|
|     do| 45516|
|    was| 44319|
|     on| 43693|
|    not| 40132|
|    i’m| 39662|
|     if| 37987|
|   feel| 36059|
|   know| 35882|
|  about| 34641|
|     or| 34344|
|    all| 33957|
|   what| 33199|
|    get| 33182|
|  don't| 32684|
| filler| 31819|
|     no| 30422|
|     at| 29498|
|because| 27107|
|    out| 26843|
|     am| 26665|
|    how| 26211|
|   been| 25391|
|    can| 25217|
|   they| 25155|
|    are| 24378|
|       | 24034|
+-------+------+



                                                                                

In [16]:
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Text: string (nullable = true)
 |-- Label: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [17]:
from pyspark.sql.functions import size
from pyspark.sql import types as T

# Define the maximum sequence length you want
max_sequence_length = 50

# Define a function to pad sequences
def pad_sequence(tokens, length):
    return tokens + [""] * (length - len(tokens))

# Create a UDF to apply the padding function
pad_udf = F.udf(lambda tokens: pad_sequence(tokens, max_sequence_length), T.ArrayType(T.StringType()))

# Apply the UDF to pad the sequences
df = df.withColumn("padded_tokens", pad_udf(df["tokens"]))

# Select the relevant columns: ID, padded_tokens, and Label
final_df = df.select("ID", "padded_tokens", "Label")

# Show the resulting DataFrame
final_df.show(5, truncate=False)

[Stage 17:>                                                         (0 + 1) / 1]

+---+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [18]:
final_df.show(5, truncate=True)

+---+--------------------+-----------+
| ID|       padded_tokens|      Label|
+---+--------------------+-----------+
|  2|[ex, wife, threat...|    suicide|
|  3|[am, i, weird, i,...|non-suicide|
|  4|["finally, 2020, ...|non-suicide|
|  8|[i, need, helpjus...|    suicide|
| 18|[my, life, is, ov...|    suicide|
+---+--------------------+-----------+
only showing top 5 rows



In [26]:
from pyspark.ml.feature import StringIndexer

# Initialize the StringIndexer
label_indexer = StringIndexer(inputCol="Label", outputCol="LabelIndex")

# Fit the StringIndexer on your DataFrame
indexed_df = label_indexer.fit(df).transform(df)

# Show the resulting DataFrame
indexed_df.show(2, truncate=True)

                                                                                

+---+--------------------+-----------+--------------------+--------------------+----------+
| ID|                Text|      Label|              tokens|       padded_tokens|LabelIndex|
+---+--------------------+-----------+--------------------+--------------------+----------+
|  2|Ex Wife Threateni...|    suicide|[ex, wife, threat...|[ex, wife, threat...|       1.0|
|  3|Am I weird I don'...|non-suicide|[am, i, weird, i,...|[am, i, weird, i,...|       0.0|
+---+--------------------+-----------+--------------------+--------------------+----------+
only showing top 2 rows



Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
[Stage 41:>                                                         (0 + 1) / 1]

In [28]:
# Show the sizes of the splits
print("Train Data Count: ", train_data.count())
print("Validation Data Count: ", validation_data.count())
print("Test Data Count: ", test_data.count())

                                                                                

Train Data Count:  91213


                                                                                

Validation Data Count:  19476




Test Data Count:  19429


[Stage 41:>                                                         (0 + 1) / 1]

In [29]:
from pyspark.ml.feature import Word2Vec

# Define Word2Vec parameters
word2vec = Word2Vec(
    vectorSize=100,       # Set the size of the word vectors
    seed=42,              # Set a seed for reproducibility
    inputCol="padded_tokens",  # Specify the input column with your tokenized and padded text
    outputCol="word_vectors"   # Specify the output column for word vectors
)

In [30]:
# Train the Word2Vec model
word2vec_model = word2vec.fit(indexed_df)

                                                                                

In [31]:
# Transform your DataFrame to include word vectors
df = word2vec_model.transform(indexed_df)

In [34]:
df.show(1, truncate=True)

+---+--------------------+-------+--------------------+--------------------+----------+--------------------+
| ID|                Text|  Label|              tokens|       padded_tokens|LabelIndex|        word_vectors|
+---+--------------------+-------+--------------------+--------------------+----------+--------------------+
|  2|Ex Wife Threateni...|suicide|[ex, wife, threat...|[ex, wife, threat...|       1.0|[0.10913591500423...|
+---+--------------------+-------+--------------------+--------------------+----------+--------------------+
only showing top 1 row



In [35]:
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline

# Initialize the StandardScaler
scaler = StandardScaler(inputCol="word_vectors", outputCol="scaled_word_vectors", withMean=True, withStd=True)

In [36]:
# Create a Pipeline to apply the scaler
pipeline = Pipeline(stages=[scaler])

In [37]:
# Fit and transform your DataFrame using the pipeline
scaled_df = pipeline.fit(df).transform(df)

                                                                                

In [38]:
scaled_df.show(5, truncate=True)

+---+--------------------+-----------+--------------------+--------------------+----------+--------------------+--------------------+
| ID|                Text|      Label|              tokens|       padded_tokens|LabelIndex|        word_vectors| scaled_word_vectors|
+---+--------------------+-----------+--------------------+--------------------+----------+--------------------+--------------------+
|  2|Ex Wife Threateni...|    suicide|[ex, wife, threat...|[ex, wife, threat...|       1.0|[0.10913591500423...|[1.26365459506834...|
|  3|Am I weird I don'...|non-suicide|[am, i, weird, i,...|[am, i, weird, i,...|       0.0|[-0.2058760570921...|[-0.3663998223869...|
|  4|"Finally 2020 is ...|non-suicide|["finally, 2020, ...|["finally, 2020, ...|       0.0|[-0.2428014789707...|[-0.5574733446064...|
|  8|i need helpjust h...|    suicide|[i, need, helpjus...|[i, need, helpjus...|       1.0|[-0.4018417432811...|[-1.3804397978084...|
| 18|My life is over a...|    suicide|[my, life, is, ov...|[my

In [46]:
scaled_df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Text: string (nullable = true)
 |-- Label: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- padded_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- LabelIndex: double (nullable = false)
 |-- word_vectors: vector (nullable = true)
 |-- scaled_word_vectors: vector (nullable = true)



In [48]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["scaled_word_vectors"],  # Use the column with word vectors
    outputCol="features"  # Create a new column called "features"
)

# Transform the DataFrame to include the "features" column
df_with_features = assembler.transform(scaled_df)

In [57]:
train_ratio = 0.8
validation_ratio = 0.1
test_ratio = 0.1

train_data, validation_data, test_data = df_with_features.randomSplit([train_ratio, validation_ratio, test_ratio], seed=42)


In [52]:
import kerastuner as kt
from tensorflow import keras
from tensorflow.keras import layers

Using TensorFlow backend


2023-10-01 01:20:29.293253: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-01 01:20:32.539588: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-01 01:20:32.539644: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-01 01:20:32.581674: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-01 01:20:34.294203: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-01 01:20:34.303899: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

In [64]:
def build_model(hp):
    model = keras.Sequential()
    
    # Tune the number of units in the first RNN layer
    hp_units = hp.Int('units', min_value=32, max_value=128, step=32)
    
    # Input shape for text classification (input_dim is the length of word vectors)
    input_shape = (200, 1)  # Replace input_dim with the actual length of your word vectors
    
    model.add(layers.SimpleRNN(units=hp_units, activation='relu', input_shape=input_shape))
    
    # Output layer
    model.add(layers.Dense(2, activation='softmax'))
    
    # Compile the model
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model

In [65]:
tuner = kt.Hyperband(build_model,
                     objective='val_accuracy',  # You can choose a different metric
                     max_epochs=10,
                     factor=2,
                     directory='keras_tuner_dir',
                     project_name='simple_rnn')

In [66]:
# Define callbacks, e.g., early stopping
callbacks = [keras.callbacks.EarlyStopping(patience=3)]

# Perform the hyperparameter search
tuner.search(train_data, validation_data=validation_data, epochs=10, callbacks=callbacks)

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

Trial 2 Complete [00h 00m 00s]

Best val_accuracy So Far: None
Total elapsed time: 00h 00m 02s

Search: Running Trial #3

Value             |Best Value So Far |Hyperparameter
96                |128               |units
0.01              |0.0001            |learning_rate
2                 |2                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
3                 |3                 |tuner/bracket
0                 |0                 |tuner/round



Traceback (most recent call last):
  File "/home/hduser/.local/lib/python3.10/site-packages/keras_tuner/src/engine/base_tuner.py", line 270, in _try_run_and_update_trial
    self._run_and_update_trial(trial, *fit_args, **fit_kwargs)
  File "/home/hduser/.local/lib/python3.10/site-packages/keras_tuner/src/engine/base_tuner.py", line 235, in _run_and_update_trial
    results = self.run_trial(trial, *fit_args, **fit_kwargs)
  File "/home/hduser/.local/lib/python3.10/site-packages/keras_tuner/src/tuners/hyperband.py", line 427, in run_trial
    return super().run_trial(trial, *fit_args, **fit_kwargs)
  File "/home/hduser/.local/lib/python3.10/site-packages/keras_tuner/src/engine/tuner.py", line 314, in run_trial
    obj_value = self._build_and_fit_model(trial, *args, **copied_kwargs)
  File "/home/hduser/.local/lib/python3.10/site-packages/keras_tuner/src/engine/tuner.py", line 233, in _build_and_fit_model
    results = self.hypermodel.fit(hp, model, *args, **kwargs)
  File "/home/hduser/.

RuntimeError: Number of consecutive failures exceeded the limit of 3.
Traceback (most recent call last):
  File "/home/hduser/.local/lib/python3.10/site-packages/keras_tuner/src/engine/base_tuner.py", line 270, in _try_run_and_update_trial
    self._run_and_update_trial(trial, *fit_args, **fit_kwargs)
  File "/home/hduser/.local/lib/python3.10/site-packages/keras_tuner/src/engine/base_tuner.py", line 235, in _run_and_update_trial
    results = self.run_trial(trial, *fit_args, **fit_kwargs)
  File "/home/hduser/.local/lib/python3.10/site-packages/keras_tuner/src/tuners/hyperband.py", line 427, in run_trial
    return super().run_trial(trial, *fit_args, **fit_kwargs)
  File "/home/hduser/.local/lib/python3.10/site-packages/keras_tuner/src/engine/tuner.py", line 314, in run_trial
    obj_value = self._build_and_fit_model(trial, *args, **copied_kwargs)
  File "/home/hduser/.local/lib/python3.10/site-packages/keras_tuner/src/engine/tuner.py", line 233, in _build_and_fit_model
    results = self.hypermodel.fit(hp, model, *args, **kwargs)
  File "/home/hduser/.local/lib/python3.10/site-packages/keras_tuner/src/engine/hypermodel.py", line 144, in fit
    return model.fit(*args, **kwargs)
  File "/home/hduser/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/home/hduser/.local/lib/python3.10/site-packages/keras/src/engine/data_adapter.py", line 1105, in select_data_adapter
    raise ValueError(
ValueError: Failed to find data adapter that can handle input: <class 'pyspark.sql.dataframe.DataFrame'>, <class 'NoneType'>


In [67]:
# Build the model with the best hyperparameters
final_model = tuner.hypermodel.build(best_hps)

# Train the model
final_model.fit(train_data, validation_data=validation_data, epochs=10, callbacks=callbacks)

NameError: name 'best_hps' is not defined

In [None]:
test_loss, test_accuracy = final_model.evaluate(test_data)
print(f"Test Accuracy: {test_accuracy:.4f}")


In [10]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
lists = stopwords.words('english')

[nltk_data] Downloading package wordnet to /home/hduser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/hduser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/hduser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
def tokenize(text):
    return word_tokenize(text)

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word.lower() not in stop_words]

def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]

In [12]:
# UDFs with Spark
tokenize_udf = udf(tokenize, StringType())
remove_stopwords_udf = udf(remove_stopwords, StringType())
lemmatize_udf = udf(lemmatize, StringType())

In [13]:
# Tokenize, remove stopwords, and lemmatize the "text" column
df = df.withColumn("tokens", tokenize_udf(df["text"]))
df = df.withColumn("filtered_tokens", remove_stopwords_udf(df["tokens"]))
df = df.withColumn("processed_text", lemmatize_udf(df["filtered_tokens"]))

In [14]:
df.select("processed_text").show(1, truncate=False)



+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|processed_text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+-----------

                                                                                

First row original Text:

Ex Wife Threatening SuicideRecently I left my wife for good because she has cheated on me twice and lied to me so much that I have decided to refuse to go back to her. As of a few days ago, she began threatening suicide. I have tirelessly spent these paat few days talking her out of it and she keeps hesitating because she wants to believe I'll come back. I know a lot of people will threaten this in order to get their way, but what happens if she really does? What do I do and how am I supposed to handle her death on my hands? I still love my wife but I cannot deal with getting cheated on again and constantly feeling insecure. I'm worried today may be the day she does it and I hope so much it doesn't happen.

First row after Tokenization, Lemmatization and Removing Stopwords:

Ex, Wife, Threatening, SuicideRecently, left, wife, good, cheated, twice, lied, much, decided, refuse, go, back, ., day, ago, ,, began, threatening, suicide, ., tirelessly, spent, paat, day, talking, keep, hesitating, want, believe, 'll, come, back, ., know, lot, people, threaten, order, get, way, ,, happens, really, ?, supposed, handle, death, hand, ?, still, love, wife, deal, getting, cheated, constantly, feeling, insecure, ., 'm, worried, today, may, day, hope, much, n't, happen, .

In [36]:
df.show(5)

[Stage 28:>                                                         (0 + 1) / 1]

+---+--------------------+-----------+--------------------+--------------------+--------------------+
| ID|                Text|      Label|              tokens|     filtered_tokens|      processed_text|
+---+--------------------+-----------+--------------------+--------------------+--------------------+
|  2|Ex Wife Threateni...|    suicide|[Ex, Wife, Threat...|[Ex, Wife, Threat...|[Ex, Wife, Threat...|
|  3|Am I weird I don'...|non-suicide|[Am, I, weird, I,...|[weird, n't, get,...|[weird, n't, get,...|
|  4|"Finally 2020 is ...|non-suicide|[``, Finally, 202...|[``, Finally, 202...|[``, Finally, 202...|
|  8|i need helpjust h...|    suicide|[i, need, helpjus...|[need, helpjust, ...|[need, helpjust, ...|
| 18|My life is over a...|    suicide|[My, life, is, ov...|[life, 20, years,...|[life, 20, year, ...|
+---+--------------------+-----------+--------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [37]:
# List of columns to remove
columns_to_remove = ["Text", "tokens", "filtered_tokens", "lemmatized_tokens"]

# Remove the specified columns
df = df.drop(*columns_to_remove)

df.show(2)

[Stage 29:>                                                         (0 + 1) / 1]

+---+-----------+--------------------+
| ID|      Label|      processed_text|
+---+-----------+--------------------+
|  2|    suicide|[Ex, Wife, Threat...|
|  3|non-suicide|[weird, n't, get,...|
+---+-----------+--------------------+
only showing top 2 rows



                                                                                

In [38]:
from pyspark.sql.functions import col

# Reorder the columns
df = df.select("ID", "processed_text", "Label")

df.show(5)

[Stage 30:>                                                         (0 + 1) / 1]

+---+--------------------+-----------+
| ID|      processed_text|      Label|
+---+--------------------+-----------+
|  2|[Ex, Wife, Threat...|    suicide|
|  3|[weird, n't, get,...|non-suicide|
|  4|[``, Finally, 202...|non-suicide|
|  8|[need, helpjust, ...|    suicide|
| 18|[life, 20, year, ...|    suicide|
+---+--------------------+-----------+
only showing top 5 rows



                                                                                

In [39]:
from pyspark.sql.functions import lower, regexp_replace
from functools import reduce

# List of preprocessing functions
preprocessing_functions = [
    lambda text: lower(text),                                     # Convert text to lowercase
    lambda text: regexp_replace(text, r'\S+@\S+', ''),           # Remove email addresses
    lambda text: regexp_replace(text, r'<.*?>', ''),             # Remove HTML tags
    lambda text: regexp_replace(text, r'[^a-zA-Z0-9\s,.]', ' '), # Remove special characters except , and .
    lambda text: regexp_replace(text, r'[^\x00-\x7F]+', ''),     # Remove accented characters
    lambda text: regexp_replace(text, r'\s+', ' ')              # Remove extra spaces
]

# Apply preprocessing functions to the "processed_text" column
for func in preprocessing_functions:
    df = df.withColumn("processed_text", func(df["processed_text"]))

In [40]:
# Show the updated DataFrame
df.show(20, truncate=True)

[Stage 31:>                                                         (0 + 1) / 1]

+---+--------------------+-----------+
| ID|      processed_text|      Label|
+---+--------------------+-----------+
|  2| ex, wife, threat...|    suicide|
|  3| weird, n t, get,...|non-suicide|
|  4|   , finally, 202...|non-suicide|
|  8| need, helpjust, ...|    suicide|
| 18| life, 20, year, ...|    suicide|
| 19| took, rest, slee...|    suicide|
| 21| think, getting, ...|    suicide|
| 23| arrested,  , fee...|    suicide|
| 24| fuck, verizon, s...|non-suicide|
| 31|  , know, really,...|non-suicide|
| 37| guys, want, frie...|non-suicide|
| 39|  , trashlol, nor...|    suicide|
| 41| best, way,  ,  ,...|    suicide|
| 43| today,  s, fact,...|non-suicide|
| 44| feel, like, drow...|    suicide|
| 45| worth,  , troubl...|    suicide|
| 47| triple, nipple, ...|non-suicide|
| 54| hey, ,, im, gon,...|non-suicide|
| 57| learnt, new, ski...|non-suicide|
| 62| one, use, email,...|non-suicide|
+---+--------------------+-----------+
only showing top 20 rows



                                                                                

In [41]:
from pyspark.sql.functions import col

# Filter the labels
df = df.filter(col('Label') == 'suicide')

# Show the resulting DataFrame
df.show(5)

[Stage 32:>                                                         (0 + 1) / 1]

+---+--------------------+-------+
| ID|      processed_text|  Label|
+---+--------------------+-------+
|  2| ex, wife, threat...|suicide|
|  8| need, helpjust, ...|suicide|
| 18| life, 20, year, ...|suicide|
| 19| took, rest, slee...|suicide|
| 21| think, getting, ...|suicide|
+---+--------------------+-------+
only showing top 5 rows



                                                                                

In [45]:
from pyspark.ml.feature import Tokenizer

# Create a Tokenizer instance
tokenizer = Tokenizer(inputCol="processed_text", outputCol="tokens")

# Tokenize the "processed_text" column
tokenized_df = tokenizer.transform(df)

# Show the resulting DataFrame
tokenized_df.select("Label", "tokens").show(truncate=True)



+-------+--------------------+
|  Label|              tokens|
+-------+--------------------+
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
|suicide|[, ex,, wife,, th...|
+-------+--------------------+
only showing top 20 rows



                                                                                

In [43]:
from pyspark.sql.functions import split, explode, col

# Split the processed_text column into individual words
df = df.withColumn("words", split(col("processed_text"), " "))

# Explode the words column to create a row for each word
df = df.withColumn("word", explode(col("words")))

# Group by words and count their occurrences
word_counts = df.groupBy("word").count()

# Order the word counts in descending order
word_counts = word_counts.orderBy(col("count").desc())

# Show the top 20 words and their counts
word_counts.show(20)



+------+---------+
|  word|    count|
+------+---------+
|      |621274847|
|    .,| 50609365|
|     ,| 24103133|
|    ,,| 21640827|
|  get,| 14320406|
| head,| 11650145|
|     n|  8481705|
|    t,|  7577626|
|    m,|  5363490|
| want,|  4832710|
| like,|  4607995|
| feel,|  4305347|
| life,|  3961358|
| know,|  3700578|
|    s,|  2748871|
| time,|  2672449|
| even,|  2556040|
|would,|  2493871|
|  one,|  2401013|
|     .|  2382773|
+------+---------+
only showing top 20 rows



                                                                                

In [None]:
X = df['processed_text']
y = df['Label']

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, Word2Vec, CountVectorizer
from pyspark.ml import Pipeline

In [None]:
# Step 1: Tokenize the text data
tokenizer = Tokenizer(inputCol="processed_text", outputCol="tokens")

In [None]:
# Step 2: Create a vocabulary from the tokens
count_vectorizer = CountVectorizer(inputCol="tokens", outputCol="features")

In [None]:
# Define a pipeline to execute these steps
pipeline = Pipeline(stages=[tokenizer, count_vectorizer])

In [None]:
# Fit and transform the data using the pipeline
model = pipeline.fit(df)
transformed_df = model.transform(df)

In [None]:
word2Vec = Word2Vec(vectorSize=100, minCount=5, inputCol="words", outputCol="word_vectors")
model = word2Vec.fit(wordsData)
result = model.transform(wordsData)

In [None]:
# Assuming you have a label column named "label" and features column named "word_vectors"
labeledData = result.select("ID", "word_vectors","Label")

In [None]:
labeledData.show(5, truncate=True)

In [None]:
word_vectors = labeledData.select("word_vectors").rdd.map(lambda x: x[0]).collect()

In [None]:
num_words = len(word_vectors[0])

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score

In [None]:
labeledData.show(2, truncate=True)

In [None]:
X = labeledData.select("word_vectors")

In [None]:
# Define the split ratios (80% for training, 10% for testing, 10% for validation)
train_ratio = 0.8
test_ratio = 0.1
validation_ratio = 0.1

# Split the data into training, testing, and validation
train_data, test_data, validation_data = df.randomSplit([train_ratio, test_ratio, validation_ratio], seed=42)

In [None]:
#  The number of rows in each split
print("Train Data Count: ", train_data.count())
print("Test Data Count: ", test_data.count())
print("Validation Data Count: ", validation_data.count())

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, SimpleRNN, Dense


In [None]:
# Assuming num_words is the number of unique words in your vocabulary
num_columns = len(X.columns)

EMBEDDING_DIM = 100

model.add(Embedding(input_dim=num_words,
                    output_dim=EMBEDDING_DIM,
                    input_length=num_columns,
                    weights=[gensim_weight_matrix],
                    trainable=False))
model.add(SimpleRNN(100, return_sequences=True, input_shape=(None, num_words)))  # Input shape adjusted
model.add(Dropout(0.2))
model.add(SimpleRNN(200, return_sequences=True))
model.add(Dropout(0.4))
model.add(SimpleRNN(100, return_sequences=False))
model.add(Dense(2, activation='softmax'))  # Output dimension adjusted for binary classification

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

#EarlyStopping and ModelCheckpoint
from keras.callbacks import EarlyStopping, ModelCheckpoint
es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 5)
mc = ModelCheckpoint('./model.h5', monitor = 'val_accuracy', mode = 'max', verbose = 1, save_best_only = True)

In [None]:
history_embedding = model.fit(train_data, train_labels, 
                                epochs = 25, batch_size = 128, 
                                validation_data=(validation_data, validation_labels),
                                verbose = 1, callbacks= [es, mc]  )