# Data Cleaning

In [54]:
import pandas as pd
# Location of the raw dataset, relative to the notebook's working directory
path = 'reddit_mental_health.csv'

# Data overview: load the CSV into a DataFrame and display it
df = pd.read_csv(path)
df

Unnamed: 0.1,Unnamed: 0,text,title,target
0,0,Welcome to /r/depression's check-in post - a p...,"Regular check-in post, with information about ...",1
1,1,We understand that most people who reply immed...,Our most-broken and least-understood rules is ...,1
2,2,Anyone else just miss physical touch? I crave ...,"I haven’t been touched, or even hugged, in so ...",1
3,3,I’m just so ashamed. Everyone and everything f...,Being Depressed is Embarrassing,1
4,4,I really need a friend. I don't even have a si...,I'm desperate for a friend and to feel loved b...,1
...,...,...,...,...
5952,1183,I’ve (24M) dealt with depression/anxiety for y...,Nobody takes me seriously,4
5953,1184,"""I don't feel very good, it's like I don't be...",selfishness,4
5954,1185,"I can't sleep most of the nights, meds didn't ...",Is there any way to sleep better?,4
5955,1186,"Hi, all. I have to give a presentation at work...",Public speaking tips?,4


In [55]:
# Keep only complete rows: a row is discarded as soon as any of its
# columns holds a missing value
df1 = df.dropna(axis=0, how='any')
df1

Unnamed: 0.1,Unnamed: 0,text,title,target
0,0,Welcome to /r/depression's check-in post - a p...,"Regular check-in post, with information about ...",1
1,1,We understand that most people who reply immed...,Our most-broken and least-understood rules is ...,1
2,2,Anyone else just miss physical touch? I crave ...,"I haven’t been touched, or even hugged, in so ...",1
3,3,I’m just so ashamed. Everyone and everything f...,Being Depressed is Embarrassing,1
4,4,I really need a friend. I don't even have a si...,I'm desperate for a friend and to feel loved b...,1
...,...,...,...,...
5952,1183,I’ve (24M) dealt with depression/anxiety for y...,Nobody takes me seriously,4
5953,1184,"""I don't feel very good, it's like I don't be...",selfishness,4
5954,1185,"I can't sleep most of the nights, meds didn't ...",Is there any way to sleep better?,4
5955,1186,"Hi, all. I have to give a presentation at work...",Public speaking tips?,4


In [56]:
# Remove the leading 'row number' column (selected by position, not by name)
df2 = df1.drop(columns=[df1.columns[0]])
df2

Unnamed: 0,text,title,target
0,Welcome to /r/depression's check-in post - a p...,"Regular check-in post, with information about ...",1
1,We understand that most people who reply immed...,Our most-broken and least-understood rules is ...,1
2,Anyone else just miss physical touch? I crave ...,"I haven’t been touched, or even hugged, in so ...",1
3,I’m just so ashamed. Everyone and everything f...,Being Depressed is Embarrassing,1
4,I really need a friend. I don't even have a si...,I'm desperate for a friend and to feel loved b...,1
...,...,...,...
5952,I’ve (24M) dealt with depression/anxiety for y...,Nobody takes me seriously,4
5953,"""I don't feel very good, it's like I don't be...",selfishness,4
5954,"I can't sleep most of the nights, meds didn't ...",Is there any way to sleep better?,4
5955,"Hi, all. I have to give a presentation at work...",Public speaking tips?,4


In [57]:
# Check that the datatypes are correct: print each column's dtype and
# non-null count for the cleaned frame
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5607 entries, 0 to 5956
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5607 non-null   object
 1   title   5607 non-null   object
 2   target  5607 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 175.2+ KB


# Tokenization, Lemmatization, POS-tagging

In [58]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import emoji
from nltk.corpus import wordnet

In [64]:
# Shared NLP resources for the preprocessing functions.
# The English stopword list is stored as a set so membership checks are fast;
# it requires the NLTK 'stopwords' corpus to be available locally.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def get_wordnet_pos(nltk_pos_tag):
    """
    Map NLTK POS tags to WordNet POS tags for lemmatization.

    Only the first character of the tag is inspected, so every tag starting
    with 'J' maps to an adjective, 'V' to a verb, 'N' to a noun and 'R' to an
    adverb (e.g. 'NNS' -> wordnet.NOUN, 'VBD' -> wordnet.VERB).

    Args:
        nltk_pos_tag (str): POS tag from NLTK's pos_tag.

    Returns:
        The matching WordNet POS constant (wordnet.ADJ, wordnet.VERB,
        wordnet.NOUN or wordnet.ADV), or None when the tag is empty/None or
        has no WordNet equivalent. Callers must handle the None case
        themselves before passing the result to a lemmatizer.
    """
    # An empty or missing tag has no first character to look up; report
    # "no match" instead of raising IndexError/TypeError.
    if not nltk_pos_tag:
        return None

    tag_mapping = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV
    }
    return tag_mapping.get(nltk_pos_tag[0], None)  # Return None if no match


def preprocess_text(text):
    """
    Clean, tokenize, POS-tag and lemmatize a piece of text.

    Pipeline: lowercase -> strip URLs -> strip digits -> drop every character
    that is not alphanumeric, whitespace or an emoji -> collapse whitespace ->
    tokenize -> remove English stopwords -> POS-tag -> lemmatize.

    Args:
        text (str): The text to preprocess. Any non-string value is treated
            as empty input.

    Returns:
        tuple: (processed_text, pos_tags) where processed_text (str) is the
            space-joined lemmatized tokens and pos_tags (list of tuples) holds
            the (token, tag) pairs produced by nltk.pos_tag. The tokens in
            pos_tags are the stopword-filtered tokens *before* lemmatization.
    """
    if not isinstance(text, str):  # Check if the text is a string
        return "", []

    text = text.lower()  # Lowercase the text
    text = re.sub(r'http\S+|www\.\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Remove special characters (excluding emojis).
    # NOTE(review): characters are deleted rather than replaced by a space, so
    # "check-in" becomes "checkin" and "don't" becomes "dont"; the latter then
    # survives the stopword filter below. Confirm this is intended.
    text = ''.join(char for char in text if char.isalnum() or char.isspace() or emoji.is_emoji(char))
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize excessive whitespace
    tokens = word_tokenize(text)  # Tokenize text
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    pos_tags = nltk.pos_tag(tokens)  # Tag the remaining tokens

    lemmatized_tokens = []
    for word, pos in pos_tags:
        # get_wordnet_pos returns None for tags without a WordNet equivalent
        # (e.g. CD, MD, IN). None is not a valid `pos` for
        # WordNetLemmatizer.lemmatize, so fall back to NOUN, which is the
        # lemmatizer's own default part of speech.
        wordnet_pos = get_wordnet_pos(pos) or wordnet.NOUN
        lemmatized_tokens.append(lemmatizer.lemmatize(word, pos=wordnet_pos))

    processed_text = ' '.join(lemmatized_tokens)

    return processed_text, pos_tags

In [60]:
# Preprocess every post body, then spread the returned
# (processed_text, pos_tags) pairs over two new columns. The result frame
# reuses df2's index so rows stay aligned after the earlier dropna().
df2[['lemmatized_text', 'pos_tags_text']] = pd.DataFrame(
    df2['text'].apply(preprocess_text).tolist(),
    index=df2.index,
)

df2[['text', 'lemmatized_text', 'pos_tags_text']].head(10)

Unnamed: 0,text,lemmatized_text,pos_tags_text
0,Welcome to /r/depression's check-in post - a p...,welcome rdepressions checkin post place take m...,"[(welcome, JJ), (rdepressions, NNS), (checkin,..."
1,We understand that most people who reply immed...,understand people reply immediately op invitat...,"[(understand, JJ), (people, NNS), (reply, VBP)..."
2,Anyone else just miss physical touch? I crave ...,anyone else miss physical touch crave badly,"[(anyone, NN), (else, RB), (miss, JJ), (physic..."
3,I’m just so ashamed. Everyone and everything f...,im ashamed everyone everything feel far away e...,"[(im, NN), (ashamed, VBD), (everyone, NN), (ev..."
4,I really need a friend. I don't even have a si...,really need friend dont even single best frien...,"[(really, RB), (need, VB), (friend, NN), (dont..."
5,Hear me out... life in general sucks. We have ...,hear life general suck work majority time job ...,"[(hear, JJ), (life, NN), (general, JJ), (sucks..."
6,Never in a million years did I think I’d be on...,never million year think id reddit write somet...,"[(never, RB), (million, CD), (years, NNS), (th..."
7,"Hi!! \n\nI want to preface by saying, i’m sorr...",hi want preface say im sorry know completely t...,"[(hi, NN), (want, VBP), (preface, NN), (saying..."
8,I’m 40(M) and I’ve always maintained that I’m ...,im ive always maintain im ugly woman hasnt cha...,"[(im, JJ), (ive, JJ), (always, RB), (maintaine..."
9,I used to get through my life by believing in ...,use get life believe delusion thing go work on...,"[(used, VBN), (get, VB), (life, NN), (believin..."


In [61]:
# Preprocess every post title, then spread the returned
# (processed_text, pos_tags) pairs over two new columns. The result frame
# reuses df2's index so rows stay aligned after the earlier dropna().
df2[['lemmatized_title', 'pos_tags_title']] = pd.DataFrame(
    df2['title'].apply(preprocess_text).tolist(),
    index=df2.index,
)

df2[['title', 'lemmatized_title', 'pos_tags_title']].head(10)

Unnamed: 0,title,lemmatized_title,pos_tags_title
0,"Regular check-in post, with information about ...",regular checkin post information rule wikis,"[(regular, JJ), (checkin, NN), (post, NN), (in..."
1,Our most-broken and least-understood rules is ...,mostbroken leastunderstood rule helper may inv...,"[(mostbroken, VBN), (leastunderstood, NN), (ru..."
2,"I haven’t been touched, or even hugged, in so ...",havent touch even hug long cant even remember ...,"[(havent, NN), (touched, VBD), (even, RB), (hu..."
3,Being Depressed is Embarrassing,depressed embarrassing,"[(depressed, JJ), (embarrassing, NN)]"
4,I'm desperate for a friend and to feel loved b...,im desperate friend feel love someone,"[(im, JJ), (desperate, JJ), (friend, NN), (fee..."
5,Call me crazy but suicide seems rational at th...,call crazy suicide seem rational point,"[(call, NN), (crazy, NN), (suicide, NN), (seem..."
6,Could really use someone to talk to. I’m falli...,could really use someone talk im fall apart,"[(could, MD), (really, RB), (use, VB), (someon..."
7,Can i rant to someone?? You can rant in return!!,rant someone rant return,"[(rant, NN), (someone, NN), (rant, JJ), (retur..."
8,I don’t want to care about being alone,dont want care alone,"[(dont, NN), (want, VBP), (care, NN), (alone, ..."
9,Hope is just a form of self-harm,hope form selfharm,"[(hope, NN), (form, NN), (selfharm, NN)]"


In [62]:
df2

Unnamed: 0,text,title,target,lemmatized_text,pos_tags_text,lemmatized_title,pos_tags_title
0,Welcome to /r/depression's check-in post - a p...,"Regular check-in post, with information about ...",1,welcome rdepressions checkin post place take m...,"[(welcome, JJ), (rdepressions, NNS), (checkin,...",regular checkin post information rule wikis,"[(regular, JJ), (checkin, NN), (post, NN), (in..."
1,We understand that most people who reply immed...,Our most-broken and least-understood rules is ...,1,understand people reply immediately op invitat...,"[(understand, JJ), (people, NNS), (reply, VBP)...",mostbroken leastunderstood rule helper may inv...,"[(mostbroken, VBN), (leastunderstood, NN), (ru..."
2,Anyone else just miss physical touch? I crave ...,"I haven’t been touched, or even hugged, in so ...",1,anyone else miss physical touch crave badly,"[(anyone, NN), (else, RB), (miss, JJ), (physic...",havent touch even hug long cant even remember ...,"[(havent, NN), (touched, VBD), (even, RB), (hu..."
3,I’m just so ashamed. Everyone and everything f...,Being Depressed is Embarrassing,1,im ashamed everyone everything feel far away e...,"[(im, NN), (ashamed, VBD), (everyone, NN), (ev...",depressed embarrassing,"[(depressed, JJ), (embarrassing, NN)]"
4,I really need a friend. I don't even have a si...,I'm desperate for a friend and to feel loved b...,1,really need friend dont even single best frien...,"[(really, RB), (need, VB), (friend, NN), (dont...",im desperate friend feel love someone,"[(im, JJ), (desperate, JJ), (friend, NN), (fee..."
...,...,...,...,...,...,...,...
5952,I’ve (24M) dealt with depression/anxiety for y...,Nobody takes me seriously,4,ive dealt depressionanxiety year use great peo...,"[(ive, JJ), (dealt, NN), (depressionanxiety, N...",nobody take seriously,"[(nobody, NN), (takes, VBZ), (seriously, RB)]"
5953,"""I don't feel very good, it's like I don't be...",selfishness,4,dont feel good like dont belong world dont thi...,"[(dont, NN), (feel, VB), (good, JJ), (like, IN...",selfishness,"[(selfishness, NN)]"
5954,"I can't sleep most of the nights, meds didn't ...",Is there any way to sleep better?,4,cant sleep night med didnt help,"[(cant, JJ), (sleep, JJ), (nights, NNS), (meds...",way sleep good,"[(way, NN), (sleep, VBP), (better, JJR)]"
5955,"Hi, all. I have to give a presentation at work...",Public speaking tips?,4,hi give presentation work next week minute lon...,"[(hi, NN), (give, VB), (presentation, NN), (wo...",public speaking tip,"[(public, JJ), (speaking, NN), (tips, NNS)]"
