In [3]:
# from textblob import TextBlob
from io import StringIO
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [4]:
# Read the labelled posts; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Suicide_Detection.csv')

In [5]:
# Preview the first five rows to check the columns that were read.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [6]:
# Count the rows per label in the 'class' column to check class balance.
emotion_counts = df.loc[:, 'class'].value_counts()
print(emotion_counts)

class
suicide        116037
non-suicide    116037
Name: count, dtype: int64


In [7]:
_STOP_WORDS = None  # lazily-built cache of NLTK's English stop words


def _english_stop_words():
    """Return the English stop-word set, building it from NLTK only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalise one piece of text for vectorisation.

    The text is lower-cased and tokenised with ``word_tokenize``; English stop
    words and every token that is not purely alphanumeric (``str.isalnum``)
    are dropped, and the remaining tokens are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or a NaN read from a
        CSV) is treated as missing text.

    Returns
    -------
    str
        The cleaned, space-separated tokens; ``''`` for non-string input.
    """
    # Missing values would otherwise fail on ``.lower()`` part-way through
    # a column-wide apply.
    if not isinstance(text, str):
        return ''

    # The stop-word set does not depend on the input, so it is fetched from the
    # cache instead of being rebuilt for every row.
    stop_words = _english_stop_words()
    tokens = word_tokenize(text.lower())
    return ' '.join(
        word for word in tokens
        if word not in stop_words and word.isalnum()
    )

# Clean every post element-wise and keep the result in a new column next to the raw text.
df['preprocessed_text'] = df['text'].map(preprocess_text)

In [8]:
# Features are the cleaned posts; targets are the labels in the 'class' column.
X, y = df['preprocessed_text'], df['class']

# Hold out 20% of the rows for testing. Stratifying on y keeps the label ratio
# the same in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [9]:
# Fit the TF-IDF vectorizer on the training posts only, so the held-out posts
# do not influence the learned features.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)

# The test posts are only transformed with the already-fitted vectorizer.
X_test_tfidf = vectorizer.transform(raw_documents=X_test)

In [None]:
# Candidate hyperparameter values per model display name, in {parameter: [values]}
# form. NOTE(review): no visible cell consumes this dict; presumably it was meant
# for a grid search (GridSearchCV is imported) - confirm before relying on it.
param_grids = {
    "Logistic Regression": {"C": [0.1, 1, 10], "solver": ["liblinear", "saga"], "max_iter": [100, 200]},
    "SVC": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"], "gamma": ["scale", "auto"]},
    "Multinomial Naive Bayes": {"alpha": [0.1, 0.5, 1.0]},
}

In [8]:
# Baseline classifiers to compare, keyed by the name shown in the printed report.
models = {
    "Logistic Regression": LogisticRegression(),
    "Multinomial Naive Bayes": MultinomialNB(),
}

for model_name, model in models.items():
    # fit() hands back the fitted estimator, so training chains into the first prediction.
    y_train_pred = model.fit(X_train_tfidf, y_train).predict(X_train_tfidf)
    y_test_pred = model.predict(X_test_tfidf)

    # Accuracy on both splits, so train and test performance can be compared.
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # One print call, one line per field, closed by a separator rule.
    print(
        f"Model: {model_name}",
        f"Training Accuracy: {train_accuracy:.4f}",
        f"Test Accuracy: {test_accuracy:.4f}",
        f"Classification Report:\n{classification_report(y_test, y_test_pred)}",
        "="*50,
        sep="\n",
    )

Model: Logistic Regression
Training Accuracy: 0.9418
Test Accuracy: 0.9368
Classification Report:
              precision    recall  f1-score   support

 non-suicide       0.93      0.95      0.94     23208
     suicide       0.94      0.93      0.94     23207

    accuracy                           0.94     46415
   macro avg       0.94      0.94      0.94     46415
weighted avg       0.94      0.94      0.94     46415

Model: Multinomial Naive Bayes
Training Accuracy: 0.9035
Test Accuracy: 0.8934
Classification Report:
              precision    recall  f1-score   support

 non-suicide       0.97      0.81      0.88     23208
     suicide       0.84      0.97      0.90     23207

    accuracy                           0.89     46415
   macro avg       0.90      0.89      0.89     46415
weighted avg       0.90      0.89      0.89     46415



In [9]:
# Fit a fresh logistic-regression classifier on the TF-IDF training matrix.
# `final_model` is the estimator that later cells pickle and use for predictions.
final_model = LogisticRegression()
final_model.fit(X_train_tfidf, y_train)

y_train_pred = final_model.predict(X_train_tfidf)
y_test_pred = final_model.predict(X_test_tfidf)

# Evaluate on both splits. accuracy_score and classification_report are already
# imported in the notebook's import cell, so they are not re-imported here.
print(f"Training Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_test_pred))

Training Accuracy: 0.9418
Test Accuracy: 0.9368
Classification Report:
              precision    recall  f1-score   support

 non-suicide       0.93      0.95      0.94     23208
     suicide       0.94      0.93      0.94     23207

    accuracy                           0.94     46415
   macro avg       0.94      0.94      0.94     46415
weighted avg       0.94      0.94      0.94     46415



In [12]:
import pickle
def save_model(model, file_path):
  """Pickle ``model`` to ``file_path`` and report the outcome on stdout.

  Parameters
  ----------
  model : object
      Any picklable object (here: the fitted classifier).
  file_path : str or path-like
      Destination file; it is overwritten when saving succeeds.

  Returns
  -------
  None
      Failures are caught and printed rather than raised.
  """
  try:
    # Serialize before opening the destination: opening with 'wb' truncates the
    # file immediately, so a pickling failure would otherwise wipe out a
    # previously saved model.
    payload = pickle.dumps(model)
    with open(file_path, 'wb') as file:
      file.write(payload)
    print(f"Model saved successfully to {file_path}")
  except Exception as e:
    print(f"An error occurred during saving: {e}")

# Persist the fitted classifier next to the notebook.
save_model(model=final_model, file_path='suicidalOrNot.pkl')

Model saved successfully to suicidalOrNot.pkl


In [10]:
def save_object(obj, file_path):
  """Pickle ``obj`` to ``file_path`` and report the outcome on stdout.

  Parameters
  ----------
  obj : object
      Any picklable object (here: the fitted vectorizer).
  file_path : str or path-like
      Destination file; it is overwritten when saving succeeds.

  Returns
  -------
  None
      Failures are caught and printed rather than raised.
  """
  try:
    # Serialize before opening the destination: opening with 'wb' truncates the
    # file immediately, so a pickling failure would otherwise wipe out a
    # previously saved object.
    payload = pickle.dumps(obj)
    with open(file_path, 'wb') as file:
      file.write(payload)
    print(f"Object saved successfully to {file_path}")
  except Exception as e:
    print(f"An error occurred during saving: {e}")

In [13]:
def predict_emotion(text):
    """Classify one raw text with the fitted ``vectorizer`` and ``final_model``.

    The text goes through ``preprocess_text`` and is turned into a one-row
    TF-IDF matrix before prediction.

    Returns whatever ``final_model.predict`` yields for that single row
    (the notebook prints it as e.g. ``['suicide']``).
    """
    features = vectorizer.transform([preprocess_text(text)])
    prediction = final_model.predict(features)
    return prediction

# Spot-check the classifier on a handful of hand-written sentences.
new_entry = [
    "I want to kill myself",
    "I am so happy",
    "I dont know what to do with my life anymore",
    "I did not expect you here",
    "I feel hopeless",
    "I wanna kill myself",
]
for sentence in new_entry:
    print(predict_emotion(sentence))

['suicide']
['non-suicide']
['suicide']
['non-suicide']
['suicide']
['suicide']


In [12]:
import pickle
# Persist the fitted vectorizer; predictions need it together with the saved model.
save_object(obj=vectorizer, file_path='vectorizer.pkl')

Object saved successfully to vectorizer.pkl
