In [12]:
# from textblob import TextBlob
from io import StringIO
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [4]:
df = pd.read_csv('suicidal_sentiment_dataset.csv')

In [5]:
df.head()

Unnamed: 0,text,label
0,I’m grateful for my family,not suicidal
1,I'm tired of existing,suicidal
2,Life feels meaningless,suicidal
3,I had a good day,not suicidal
4,I’m proud of myself,not suicidal


In [7]:
emotion_counts = df['label'].value_counts()
print(emotion_counts)

label
not suicidal    2500
suicidal        2500
Name: count, dtype: int64


In [14]:
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())  # lowercase the text and tokenize it
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and word.isalnum()]
    
    # Rejoin tokens back to string
    return ' '.join(tokens)

df['preprocessed_text'] = df['text'].apply(preprocess_text)

In [16]:
X = df['preprocessed_text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [17]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [18]:
param_grids = {
    "Logistic Regression": {
        'C': [0.1, 1, 10],
        'solver': ['liblinear', 'saga'],
        'max_iter': [100, 200]
    },
    "SVC": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    },
    "Multinomial Naive Bayes": {
        'alpha': [0.1, 0.5, 1.0]
    }
}

In [19]:
models = {
    "Logistic Regression": LogisticRegression(),
    "SVC": SVC(),
    "Multinomial Naive Bayes": MultinomialNB()
}
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    
    # GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(model, param_grids[model_name], cv=3, scoring='accuracy', n_jobs=-1)
    
    grid_search.fit(X_train_tfidf, y_train)  # Train with grid search

    # Get the best model and its parameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Predict on training and test sets
    y_train_pred = best_model.predict(X_train_tfidf)
    y_test_pred = best_model.predict(X_test_tfidf)

    # Print the best parameters and accuracies
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    print(f"Best Parameters: {best_params}")
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Classification Report:\n{classification_report(y_test, y_test_pred)}")
    print("="*50)

Evaluating Logistic Regression...
Best Parameters: {'C': 0.1, 'max_iter': 100, 'solver': 'liblinear'}
Training Accuracy: 1.0000
Test Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

not suicidal       1.00      1.00      1.00       500
    suicidal       1.00      1.00      1.00       500

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000

Evaluating SVC...
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Training Accuracy: 1.0000
Test Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

not suicidal       1.00      1.00      1.00       500
    suicidal       1.00      1.00      1.00       500

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000

Evaluating Multino

In [20]:
final_model = SVC(C=1, gamma='scale', kernel='linear')
final_model.fit(X_train_tfidf, y_train)

y_train_pred = final_model.predict(X_train_tfidf)
y_test_pred = final_model.predict(X_test_tfidf)

# Evaluate
from sklearn.metrics import accuracy_score, classification_report

print(f"Training Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_test_pred))

Training Accuracy: 1.0000
Test Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

not suicidal       1.00      1.00      1.00       500
    suicidal       1.00      1.00      1.00       500

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



In [21]:
def predict_emotion(text):
    processed_text = preprocess_text(text)
    text_tfidf = vectorizer.transform([processed_text])
    return final_model.predict(text_tfidf)

# Test with a new entry
new_entry = ["I want to kill myself", "I am so happy", "I dont know what to do with my life anymore", "I did not expect you here", "I feel hopeless", "I wanna kill myself"]
for i in new_entry:
    print(predict_emotion(i))

['suicidal']
['not suicidal']
['suicidal']
['not suicidal']
['not suicidal']
['not suicidal']


In [22]:
binary_y = ['suicidal' if label == 'suicidal' else 'not suicidal' for label in multiemotion_y]

NameError: name 'multiemotion_y' is not defined

In [26]:
import pandas as pd

# Read the TXT file (1 line = 1 entry)
with open('train.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
data = [line.strip().split(';') for line in lines if ';' in line]
df = pd.DataFrame(data, columns=['text', 'label'])

# Save to CSV
df.to_csv('data.csv', index=False)

In [27]:
df_multi= pd.read_csv('data.csv')

In [28]:
df_multi.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [29]:
df_multi['binary_label'] = df_multi['label'].apply(lambda x: 'suicidal' if x == 'suicidal' else 'not suicidal')

In [33]:
# Vectorize the preprocessed text
X_multi_tfidf = vectorizer.transform(df_multi['preprocessed_text'])

# Now you can predict
pred = final_model.predict(X_multi_tfidf)


In [34]:
from sklearn.metrics import accuracy_score

print(f"Test Accuracy: {accuracy_score(df_multi['binary_label'], pred):.4f}")


Test Accuracy: 0.7183


In [31]:
df_multi['preprocessed_text'] = df_multi['text'].apply(preprocess_text)
pred = final_model.predict(df_multi['preprocessed_text'])
print(f"Test Accuracy: {accuracy_score(pred, df_multi['binary_label']):.4f}")

ValueError: could not convert string to float: 'didnt feel humiliated'