In [1]:
import numpy as np 
import pandas as pd
from symspellpy import SymSpell, Verbosity
import pkg_resources
import time
import re
import unicodedata
#import enchant
#import neuspell
import warnings
warnings.filterwarnings("ignore")


In [4]:
def _strip_combining_marks(text):
    """Decompose text with NFD and drop every character whose Unicode category is 'Mn'."""
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')


# Read the dataset and keep only the text and class columns.
df = pd.read_csv("Suicide_Detection.csv")[["text", "class"]]
# text_cleaned starts as the lower-cased, whitespace-stripped text with combining marks removed.
df["text_cleaned"] = df["text"].str.lower().str.strip().apply(_strip_combining_marks)


### Initialize SymSpell with dictionaries and define spell corrector function

In [5]:
# Edit distance and prefix length are left at the values the original author notes as defaults.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Resolve the unigram and bigram frequency dictionaries from inside the symspellpy package.
dictionary_path, dictionary_path_bigram = (
    pkg_resources.resource_filename("symspellpy", resource_name)
    for resource_name in (
        "frequency_dictionary_en_82_765.txt",
        "frequency_bigramdictionary_en_243_342.txt",
    )
)

# Unigram file: term in column 0, count in column 1.
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
# Bigram file: term index 0, count in column 2.
sym_spell.load_bigram_dictionary(dictionary_path_bigram, term_index=0, count_index=2)

def symspell_corrector(input_term):
    """Return the first compound-lookup suggestion for a multi-word string.

    Parameters
    ----------
    input_term : str
        Phrase handed to ``sym_spell.lookup_compound`` as ``phrase``.

    Returns
    -------
    str
        The ``term`` attribute of the first suggestion returned by the lookup.
    """
    # Options forwarded unchanged to the module-level SymSpell instance.
    lookup_options = {
        "max_edit_distance": 2,
        "transfer_casing": True,
        "ignore_term_with_digits": True,
        "ignore_non_words": True,
        "split_by_space": True,
    }
    candidates = sym_spell.lookup_compound(phrase=input_term, **lookup_options)
    top_candidate = candidates[0]
    return top_candidate.term

### Convert Slang Words to Their Meaning

In [6]:
#Slang dictionary retrieved from https://floatcode.wordpress.com/2015/11/28/internet-slang-dataset/
slang = pd.read_csv("slang_dict.csv")
slang = slang.dropna()
#if there are multiple meanings for slang, meanings are separated by |, split and take first meaning
slang["Meaning"] = slang["Meaning"].str.lower().str.split("|").str[0]
slang["Slang"] = slang["Slang"].str.lower()

#Don't want to replace actual words if they double as slang, remove from slang df
#(an exact, edit-distance-0 hit in the SymSpell dictionary means the slang is also a real word)
slang["Slang in Dict"] = slang["Slang"].apply(
    lambda term: len(sym_spell.lookup(term, Verbosity.TOP, max_edit_distance = 0)) > 0
)
#.copy() so the column assignments below write to an independent frame, not a filtered slice
slang = slang[~slang["Slang in Dict"]].copy()

#Decide which entries get word boundaries BEFORE escaping: re.escape puts a backslash in front of
#spaces, so running this check on escaped text would reject every multi-word slang term.
#Entries containing other characters are kept without boundaries, as \b does not behave well
#next to non-alphanumeric characters.
slang["only_alnum_space"] = slang["Slang"].str.contains(r'^[a-zA-Z0-9\s]+$', regex = True)

#Meaning is used as the *replacement* side of a regex substitution, where only the backslash is
#special. re.escape here would leave literal backslashes in the replaced text
#(e.g. "laugh\ out\ loud"), so only backslashes are doubled.
slang["Meaning"] = slang["Meaning"].apply(lambda meaning: meaning.replace("\\", "\\\\"))
#Slang is the *pattern* side, so regex metacharacters must be escaped
slang["Slang"] = slang["Slang"].apply(re.escape)

#Boundary-wrapped patterns come first in the dictionary, followed by the patterns kept as is
add_boundaries = slang[slang["only_alnum_space"]].copy()
add_boundaries["Slang"] = [rf'\b{word}\b' for word in add_boundaries["Slang"]]
no_boundaries = slang[~slang["only_alnum_space"]]
slang = pd.concat([add_boundaries, no_boundaries])
#Maps regex pattern -> replacement template
slang_to_meaning = dict(zip(slang["Slang"], slang["Meaning"]))


In [7]:
# Apply every slang pattern to the cleaned text and report the wall-clock cost in minutes.
# NOTE(review): the recorded run of this cell took 332 minutes; presumably each pattern is
# applied over the whole column in turn - confirm before attempting to speed it up.
time1 = time.time()
df["text_cleaned"] = df["text_cleaned"].replace(to_replace=slang_to_meaning, regex=True)
time2 = time.time()
elapsed_minutes = round((time2 - time1) / 60)
print("replacing slang to meanings:", elapsed_minutes, "minutes")


replacing slang to meanings: 332 minutes


In [8]:
# Character counts of the original text and of the slang-replaced text.
df["len_text"] = df["text"].map(len)
df["len_text_cleaned"] = df["text_cleaned"].map(len)
# A cleaned text at least 1.5 times as long as its original is treated as a bad conversion
# and reset to the original text.
# The main sources of such errors are * being converted to "indicating spelling correction"
# when it was used for emphasis, and website fragments (http / www / .com) being converted.
grew_too_much = df["len_text_cleaned"] / df["len_text"] >= 1.5
df.loc[grew_too_much, "text_cleaned"] = df.loc[grew_too_much, "text"]


In [9]:
#df.to_csv("Suicide Detection Slang Replaced.csv")
#df.to_pickle("Suicide Detection Slang Replaced.pkl")

### Spellcheck Text

In [10]:
# Spell-correct text_cleaned row by row, writing each result back in place so that progress
# survives an interruption. Labels of rows that could not be corrected are collected in
# `uncleaned`, and those rows keep their current text.
time1 = time.time()
uncleaned = []
# Iterate over the actual index labels so the loop does not depend on a default 0..n-1 index;
# `position` is only used for the progress report.
for position, i in enumerate(df.index):
    if position % 10000 == 0:
        time2 = time.time()
        print(f"Cleaned {position} rows in: {round((time2 - time1)/60)} min")
    try:
        df.at[i, "text_cleaned"] = symspell_corrector(df.at[i, "text_cleaned"])
    except Exception:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt / SystemExit
        # still stop this multi-hour loop instead of being logged as an uncleanable row.
        print("Unable to clean text at index:", i)
        uncleaned.append(i)

time2 = time.time()
print("cleaned text time:", (time2 - time1)/60, "minutes")

Cleaned 0 rows in: 0 min
Cleaned 10000 rows in: 9 min
Cleaned 20000 rows in: 18 min
Cleaned 30000 rows in: 28 min
Cleaned 40000 rows in: 37 min
Cleaned 50000 rows in: 47 min
Cleaned 60000 rows in: 56 min
Cleaned 70000 rows in: 66 min
Cleaned 80000 rows in: 76 min
Cleaned 90000 rows in: 86 min
Cleaned 100000 rows in: 95 min
Cleaned 110000 rows in: 104 min
Cleaned 120000 rows in: 114 min
Cleaned 130000 rows in: 123 min
Cleaned 140000 rows in: 133 min
Cleaned 150000 rows in: 142 min
Cleaned 160000 rows in: 152 min
Cleaned 170000 rows in: 161 min
Cleaned 180000 rows in: 172 min
Cleaned 190000 rows in: 185 min
Unable to clean text at index: 197805
Cleaned 200000 rows in: 199 min
Cleaned 210000 rows in: 218 min
Cleaned 220000 rows in: 232 min
Cleaned 230000 rows in: 242 min
cleaned text time: 244.323998717467 minutes


In [31]:
#Index 197805 is the single row the spellcheck loop reported as unable to clean; show its text
df.at[197805, "text_cleaned"]

'5오전5오전\n5오전\n5오전\n5오전\n5오전\n5오전\n5오전\n5오전\n5오전\n5오전\n아\n아\n차라리 살아보지 못한 편이 좋을거야'

The only row that could not be spellchecked is written in Korean rather than English, so it cannot be corrected against the English dictionary and is left as is.

In [37]:
#recompute the character count of text_cleaned now that it has been spellchecked
df["len_text_cleaned"] = df["text_cleaned"].map(len)

In [38]:
# Persist the spellchecked frame twice under the same base name: once as CSV, once as a pickle.
df.to_csv("Spellchecked Suicide Detection.csv")
df.to_pickle("Spellchecked Suicide Detection.pkl")