In [1]:
# sc master running locally
sc.master

'local[*]'

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# spark is from the previous example.
sc = spark.sparkContext

In [4]:
path = "/user1/Suicide_Detection.csv"
df = spark.read.csv(path)

                                                                                

In [5]:
# The inferred schema can be visualized using the printSchema() method
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)



In [6]:
# Rename the columns
df = df.withColumnRenamed('_c0', 'ID')      # Rename _c0 to ID
df = df.withColumnRenamed('_c1', 'Text')    # Rename _c1 to Text
df = df.withColumnRenamed('_c2', 'Label')   # Rename _c2 to Label

# Show the DataFrame with renamed columns
df.show(5)

+----+--------------------+-----------+
|  ID|                Text|      Label|
+----+--------------------+-----------+
|null|                text|      class|
|   2|Ex Wife Threateni...|    suicide|
|   3|Am I weird I don'...|non-suicide|
|   4|"Finally 2020 is ...|non-suicide|
|   8|i need helpjust h...|    suicide|
+----+--------------------+-----------+
only showing top 5 rows



In [7]:
# Count Rows
row_count = df.count()
print("Total Rows:", row_count)

# Get Schema
df.printSchema()



Total Rows: 664905
root
 |-- ID: string (nullable = true)
 |-- Text: string (nullable = true)
 |-- Label: string (nullable = true)



                                                                                

In [8]:
df = df[df['Label'] == 'suicide']

row_count = df.count()
print("Total Rows:", row_count)

[Stage 5:>                                                          (0 + 4) / 4]

Total Rows: 65758




In [9]:
from pyspark.sql.functions import col

# "ID" column datatype changing
df = df.withColumn("ID", col("ID").cast("integer"))

In [10]:
# Remove rows with null values in any column
df = df.na.drop()

In [11]:
# Count Rows
row_count = df.count()
print("Total Rows:", row_count)

# Get Schema
df.printSchema()



Total Rows: 53612
root
 |-- ID: integer (nullable = true)
 |-- Text: string (nullable = true)
 |-- Label: string (nullable = true)



                                                                                

In [12]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
lists = stopwords.words('english')

[nltk_data] Downloading package wordnet to /home/hduser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/hduser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/hduser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
def tokenize(text):
    return word_tokenize(text)

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word.lower() not in stop_words]

def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]

# Register UDFs with Spark
tokenize_udf = udf(tokenize, StringType())
remove_stopwords_udf = udf(remove_stopwords, StringType())
lemmatize_udf = udf(lemmatize, StringType())

# Tokenize, remove stopwords, and lemmatize the "text" column
df = df.withColumn("tokens", tokenize_udf(df["text"]))
df = df.withColumn("filtered_tokens", remove_stopwords_udf(df["tokens"]))
df = df.withColumn("processed_text", lemmatize_udf(df["filtered_tokens"]))

In [14]:
df.select("processed_text").show(1, truncate=False)



+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|processed_text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+-----------

                                                                                

First row original Text:

Ex Wife Threatening SuicideRecently I left my wife for good because she has cheated on me twice and lied to me so much that I have decided to refuse to go back to her. As of a few days ago, she began threatening suicide. I have tirelessly spent these paat few days talking her out of it and she keeps hesitating because she wants to believe I'll come back. I know a lot of people will threaten this in order to get their way, but what happens if she really does? What do I do and how am I supposed to handle her death on my hands? I still love my wife but I cannot deal with getting cheated on again and constantly feeling insecure. I'm worried today may be the day she does it and I hope so much it doesn't happen.

First row after Tokenization, Lemmatization and Removing Stopwords:

Ex, Wife, Threatening, SuicideRecently, left, wife, good, cheated, twice, lied, much, decided, refuse, go, back, ., day, ago, ,, began, threatening, suicide, ., tirelessly, spent, paat, day, talking, keep, hesitating, want, believe, 'll, come, back, ., know, lot, people, threaten, order, get, way, ,, happens, really, ?, supposed, handle, death, hand, ?, still, love, wife, deal, getting, cheated, constantly, feeling, insecure, ., 'm, worried, today, may, day, hope, much, n't, happen, .

In [15]:
df.show(20)



+---+--------------------+-------+--------------------+--------------------+--------------------+
| ID|                Text|  Label|              tokens|     filtered_tokens|      processed_text|
+---+--------------------+-------+--------------------+--------------------+--------------------+
|  2|Ex Wife Threateni...|suicide|[Ex, Wife, Threat...|[Ex, Wife, Threat...|[Ex, Wife, Threat...|
|  8|i need helpjust h...|suicide|[i, need, helpjus...|[need, helpjust, ...|[need, helpjust, ...|
| 18|My life is over a...|suicide|[My, life, is, ov...|[life, 20, years,...|[life, 20, year, ...|
| 19|I took the rest o...|suicide|[I, took, the, re...|[took, rest, slee...|[took, rest, slee...|
| 21|Do you think gett...|suicide|[Do, you, think, ...|[think, getting, ...|[think, getting, ...|
| 23|Been arrested - f...|suicide|[Been, arrested, ...|[arrested, -, fee...|[arrested, -, fee...|
| 39|I’m trashLol I no...|suicide|[I, ’, m, trashLo...|[’, trashLol, nor...|[’, trashLol, nor...|
| 41|What is the bes

                                                                                

In [16]:
# List of columns to remove
columns_to_remove = ["Text", "tokens", "filtered_tokens", "lemmatized_tokens"]

# Remove the specified columns
df = df.drop(*columns_to_remove)

df.show(2)

[Stage 13:>                                                         (0 + 1) / 1]

+---+-------+--------------------+
| ID|  Label|      processed_text|
+---+-------+--------------------+
|  2|suicide|[Ex, Wife, Threat...|
|  8|suicide|[need, helpjust, ...|
+---+-------+--------------------+
only showing top 2 rows



                                                                                

In [17]:
from pyspark.sql.functions import col

# Reorder the columns
df = df.select("ID", "processed_text", "Label")

df.show(2)

[Stage 14:>                                                         (0 + 1) / 1]

+---+--------------------+-------+
| ID|      processed_text|  Label|
+---+--------------------+-------+
|  2|[Ex, Wife, Threat...|suicide|
|  8|[need, helpjust, ...|suicide|
+---+--------------------+-------+
only showing top 2 rows



                                                                                

In [18]:
from pyspark.sql.functions import lower, regexp_replace
from functools import reduce

# List of preprocessing functions
preprocessing_functions = [
    lambda text: lower(text),                                     # Convert text to lowercase
    lambda text: regexp_replace(text, r'\S+@\S+', ''),           # Remove email addresses
    lambda text: regexp_replace(text, r'<.*?>', ''),             # Remove HTML tags
    lambda text: regexp_replace(text, r'[^a-zA-Z0-9\s]', ' '),   # Remove special characters
    lambda text: regexp_replace(text, r'[^\x00-\x7F]+', '')     # Remove accented characters
]

# Apply preprocessing functions to the "processed_text" column
for func in preprocessing_functions:
    df = df.withColumn("processed_text", func(df["processed_text"]))

In [19]:
# Show the updated DataFrame
df.show(20, truncate=True)

[Stage 15:>                                                         (0 + 1) / 1]

+---+--------------------+-------+
| ID|      processed_text|  Label|
+---+--------------------+-------+
|  2| ex  wife  threat...|suicide|
|  8| need  helpjust  ...|suicide|
| 18| life  20  year  ...|suicide|
| 19| took  rest  slee...|suicide|
| 21| think  getting  ...|suicide|
| 23| arrested     fee...|suicide|
| 39|    trashlol  nor...|suicide|
| 41| best  way       ...|suicide|
| 44| feel  like  drow...|suicide|
| 45| worth     troubl...|suicide|
| 67|  ve  become  acc...|suicide|
| 69| after   ukif  di...|suicide|
| 74| cant  stop  feel...|suicide|
| 79| since  aunt   s ...|suicide|
| 86| owthe  past  unf...|suicide|
| 87| ive  suicidal  l...|suicide|
| 90| n t  know  goi  ...|suicide|
|113|      s  point  l...|suicide|
|120| want  people  kn...|suicide|
|127| suicidenote info...|suicide|
+---+--------------------+-------+
only showing top 20 rows



                                                                                

In [20]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, Word2Vec

In [21]:
tokenizer = Tokenizer(inputCol="processed_text", outputCol="words")
wordsData = tokenizer.transform(df)

In [22]:
wordsData.show(2)

[Stage 16:>                                                         (0 + 1) / 1]

+---+--------------------+-------+--------------------+
| ID|      processed_text|  Label|               words|
+---+--------------------+-------+--------------------+
|  2| ex  wife  threat...|suicide|[, ex, , wife, , ...|
|  8| need  helpjust  ...|suicide|[, need, , helpju...|
+---+--------------------+-------+--------------------+
only showing top 2 rows



                                                                                

In [23]:
word2Vec = Word2Vec(vectorSize=100, minCount=5, inputCol="words", outputCol="word_vectors")
model = word2Vec.fit(wordsData)
result = model.transform(wordsData)

                                                                                

In [24]:
# Assuming you have a label column named "label" and features column named "word_vectors"
labeledData = result.select("Label", "word_vectors")

In [29]:
labeledData.show(2, truncate=False)

[Stage 23:>                                                         (0 + 1) / 1]

+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [30]:
pip install numpy

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score

# Assuming you have imported and preprocessed your data appropriately
X_train, y_train, X_test, y_test = ...  # Prepare your train and test data

# Create an LSTM model (customize this according to your needs)
model = Sequential()
model.add(LSTM(units=128, input_shape=(maxlen, vector_size)))  # Adjust units and input_shape as needed
model.add(Dense(num_classes, activation='softmax'))  # Adjust num_classes and activation as needed

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, to_categorical(y_train), epochs=10, batch_size=32)  # Adjust epochs and batch_size as needed

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, np.argmax(y_pred, axis=1))
print("Test accuracy:", accuracy)


2023-09-30 14:25:41.995115: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-30 14:25:47.196290: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-09-30 14:25:47.196336: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-09-30 14:25:47.224991: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-09-30 14:25:49.390277: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.


KeyboardInterrupt: 