In [1]:
# sc master running locally
sc.master

'local[*]'

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# spark is from the previous example.
sc = spark.sparkContext

In [4]:
path = "/user1/Suicide_Detection.csv"
df = spark.read.csv(path)

                                                                                

In [5]:
# The inferred schema can be visualized using the printSchema() method
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)



In [6]:
# Use take() to get the first 2 rows
result = df.take(3)

# Display the result
for row in result:
    print(row)

Row(_c0=None, _c1='text', _c2='class')
Row(_c0='2', _c1="Ex Wife Threatening SuicideRecently I left my wife for good because she has cheated on me twice and lied to me so much that I have decided to refuse to go back to her. As of a few days ago, she began threatening suicide. I have tirelessly spent these paat few days talking her out of it and she keeps hesitating because she wants to believe I'll come back. I know a lot of people will threaten this in order to get their way, but what happens if she really does? What do I do and how am I supposed to handle her death on my hands? I still love my wife but I cannot deal with getting cheated on again and constantly feeling insecure. I'm worried today may be the day she does it and I hope so much it doesn't happen.", _c2='suicide')
Row(_c0='3', _c1="Am I weird I don't get affected by compliments if it's coming from someone I know irl but I feel really good when internet strangers do it", _c2='non-suicide')


In [7]:
# Rename the columns
df = df.withColumnRenamed('_c0', 'ID')      # Rename _c0 to ID
df = df.withColumnRenamed('_c1', 'Text')    # Rename _c1 to Text
df = df.withColumnRenamed('_c2', 'Label')   # Rename _c2 to Label

# Show the DataFrame with renamed columns
df.show(5)

+----+--------------------+-----------+
|  ID|                Text|      Label|
+----+--------------------+-----------+
|null|                text|      class|
|   2|Ex Wife Threateni...|    suicide|
|   3|Am I weird I don'...|non-suicide|
|   4|"Finally 2020 is ...|non-suicide|
|   8|i need helpjust h...|    suicide|
+----+--------------------+-----------+
only showing top 5 rows



In [8]:
# Remove rows with null values in any column
df = df.na.drop()

In [9]:
# Count Rows
row_count = df.count()
print("Total Rows:", row_count)

# Get Schema
df.printSchema()

[Stage 3:>                                                          (0 + 4) / 4]

Total Rows: 281474
root
 |-- ID: string (nullable = true)
 |-- Text: string (nullable = true)
 |-- Label: string (nullable = true)



                                                                                

In [10]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
lists = stopwords.words('english')

[nltk_data] Downloading package wordnet to /home/hduser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/hduser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/hduser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
def tokenize(text):
    return word_tokenize(text)

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word.lower() not in stop_words]

def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]

# Register UDFs with Spark
tokenize_udf = udf(tokenize, StringType())
remove_stopwords_udf = udf(remove_stopwords, StringType())
lemmatize_udf = udf(lemmatize, StringType())

# Tokenize, remove stopwords, and lemmatize the "text" column
df = df.withColumn("tokens", tokenize_udf(df["text"]))
df = df.withColumn("filtered_tokens", remove_stopwords_udf(df["tokens"]))
df = df.withColumn("processed_text", lemmatize_udf(df["filtered_tokens"]))

In [16]:
df.select("processed_text").show(1, truncate=False)

[Stage 8:>                                                          (0 + 1) / 1]

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|processed_text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+-----------

                                                                                

First row original Text:

Ex Wife Threatening SuicideRecently I left my wife for good because she has cheated on me twice and lied to me so much that I have decided to refuse to go back to her. As of a few days ago, she began threatening suicide. I have tirelessly spent these paat few days talking her out of it and she keeps hesitating because she wants to believe I'll come back. I know a lot of people will threaten this in order to get their way, but what happens if she really does? What do I do and how am I supposed to handle her death on my hands? I still love my wife but I cannot deal with getting cheated on again and constantly feeling insecure. I'm worried today may be the day she does it and I hope so much it doesn't happen.

First row after Tokenization, Lemmatization and Removing Stopwords:

Ex, Wife, Threatening, SuicideRecently, left, wife, good, cheated, twice, lied, much, decided, refuse, go, back, ., day, ago, ,, began, threatening, suicide, ., tirelessly, spent, paat, day, talking, keep, hesitating, want, believe, 'll, come, back, ., know, lot, people, threaten, order, get, way, ,, happens, really, ?, supposed, handle, death, hand, ?, still, love, wife, deal, getting, cheated, constantly, feeling, insecure, ., 'm, worried, today, may, day, hope, much, n't, happen, .

In [18]:
df.show(2)

[Stage 9:>                                                          (0 + 1) / 1]

+---+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+
| ID|                Text|      Label|              tokens|     filtered_tokens|   lemmatized_tokens|      processed_text|
+---+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+
|  2|Ex Wife Threateni...|    suicide|[Ex, Wife, Threat...|[Ex, Wife, Threat...|[Ex, Wife, Threat...|[Ex, Wife, Threat...|
|  3|Am I weird I don'...|non-suicide|[Am, I, weird, I,...|[weird, n't, get,...|[weird, n't, get,...|[weird, n't, get,...|
+---+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows



                                                                                

In [19]:
# List of columns to remove
columns_to_remove = ["Text", "tokens", "filtered_tokens", "lemmatized_tokens"]

# Remove the specified columns
df = df.drop(*columns_to_remove)

df.show(2)

[Stage 10:>                                                         (0 + 1) / 1]

+---+-----------+--------------------+
| ID|      Label|      processed_text|
+---+-----------+--------------------+
|  2|    suicide|[Ex, Wife, Threat...|
|  3|non-suicide|[weird, n't, get,...|
+---+-----------+--------------------+
only showing top 2 rows



                                                                                

In [20]:
from pyspark.sql.functions import col

# Reorder the columns
df = df.select("ID", "processed_text", "Label")

df.show(2)

[Stage 11:>                                                         (0 + 1) / 1]

+---+--------------------+-----------+
| ID|      processed_text|      Label|
+---+--------------------+-----------+
|  2|[Ex, Wife, Threat...|    suicide|
|  3|[weird, n't, get,...|non-suicide|
+---+--------------------+-----------+
only showing top 2 rows



                                                                                

In [21]:
from pyspark.sql.functions import lower, regexp_replace
from functools import reduce

# List of preprocessing functions
preprocessing_functions = [
    lambda text: lower(text),                                     # Convert text to lowercase
    lambda text: regexp_replace(text, r'\S+@\S+', ''),           # Remove email addresses
    lambda text: regexp_replace(text, r'<.*?>', ''),             # Remove HTML tags
    lambda text: regexp_replace(text, r'[^a-zA-Z0-9\s]', ' '),   # Remove special characters
    lambda text: regexp_replace(text, r'[^\x00-\x7F]+', '')     # Remove accented characters
]

# Apply preprocessing functions to the "processed_text" column
for func in preprocessing_functions:
    df = df.withColumn("processed_text", func(df["processed_text"]))

# Show the updated DataFrame
df.show(2, truncate=False)

[Stage 12:>                                                         (0 + 1) / 1]

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                