In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Necessary Libraries

In [None]:
from wordcloud import WordCloud
import numpy as np 
import pandas as pd
!pip install neattext
import neattext.functions as nfx
import matplotlib.pyplot as plt
import plotly.express as plx
import keras
from keras.layers import Embedding,Dense,LSTM,Bidirectional,GlobalMaxPooling1D,Input,Dropout
from keras.callbacks import EarlyStopping,ReduceLROnPlateau
from keras.models import Sequential
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
import seaborn as sns
import pickle
import warnings
import string

# Machine Learning Libraries
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest,chi2,f_classif
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,AdaBoostClassifier,GradientBoostingClassifier,BaggingClassifier
from sklearn.metrics import classification_report , confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# Text Processing libraries
import nltk
from nltk.stem import PorterStemmer

# Silence every Python warning for the rest of the session
# (note: this also hides deprecation notices from the libraries used below).
warnings.filterwarnings('ignore')
# Default seaborn colour palette for the plots that follow.
sns.set_palette('pastel')

# Loading the Dataset

In [None]:
# Read the raw CSV into a DataFrame and preview the first rows.
csv_path = '../input/suicide-watch/Suicide_Detection.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index column saved into the CSV).
# errors='ignore' keeps this cell re-runnable: once the column is gone a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

### Information about Dataset
Dataset contains 232074 entries, divided into 2 columns, 'text' and 'class'

In [None]:
# Show column names, non-null counts and dtypes of the loaded frame.
df.info()

### Getting random samples
Randomly choosing 50k samples from the dataset

In [None]:
# Keep a random subset of 50,000 rows; the fixed random_state makes the draw repeatable.
# The sampled rows keep their original index labels (no reset_index here).
df = df.sample(n=50000, random_state=21)

### Visualizing count of each type of data
To prevent bias in the model, there should be an equal number of each category of data.

In [None]:
# Bar chart of how many rows fall into each value of the 'class' column.
class_axes = sns.countplot(data=df, x='class')
plt.show()

### Checking if there is any empty or duplicated data

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

# Text Preprocessing

In [None]:
# Lower-case every post so that tokens differing only in case are treated as the same word.
df['text']= df['text'].str.lower()

In [None]:
# Remove special characters: delete every run of characters that is neither a
# word character (\w) nor whitespace (\s). regex=True makes pandas treat the
# pattern as a regular expression rather than a literal string.
df['text'] = df['text'].str.replace(r'[^\w\s]+', '',regex = True)

### Removing Stopwords
A stop-word list is a collection of frequently occurring words that carry little information for classifying text. We remove them using NLTK's English stop-word corpus, which reduces the amount of background noise in the text and increases the focus on the informative words.

In [None]:
from nltk.corpus import stopwords

# `stop_words` stays a list because the `preprocess` function further down
# reads it; the frozenset copy is used only for fast membership tests here.
stop_words = stopwords.words('english')
stop_word_set = frozenset(stop_words)

# Drop every whitespace-separated token that appears in the stop-word list.
# Checking against a set instead of the list avoids scanning the whole list
# for each word of each document; the resulting text is unchanged.
df['text'] = df['text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_word_set)
)

### Tokenization
Tokenization refers to the process of breaking down a sequence of text into individual units, or tokens, such as words, phrases, or subwords.

In [None]:
# Split each document into a list of tokens with NLTK's word tokenizer.
df['text'] = df['text'].apply(nltk.word_tokenize)

### Stemming
Stemming is the process of reducing words to their root or base form by removing affixes (the Porter stemmer used below strips suffixes). The goal of stemming is to reduce the dimensionality of text data and to normalize words that share a stem but appear in different forms (e.g., run, runs, running); irregular forms such as "ran" are not handled by suffix stripping.

In [None]:
# One shared Porter stemmer; it is reused later when preprocessing new input.
ps = PorterStemmer()
# Replace every token list with the list of its stemmed tokens.
df['text'] = df['text'].apply(lambda tokens: list(map(ps.stem, tokens)))

In [None]:
# Turn each list of stemmed tokens back into one space-separated string.
df['text'] = df['text'].apply(' '.join)

In [None]:
# Processed data: preview the first rows after lower-casing, punctuation
# removal, stop-word removal and stemming.
df.head()

In [None]:
# Re-check row count, non-null counts and dtypes after preprocessing.
df.info()

In [None]:
# Model input is the cleaned text; the target is the 'class' column.
x = df['text']
y = df['class']

### Vectorization
Vectorization in machine learning is the process of converting raw data, such as text or images, into numerical vectors that can be processed by machine learning algorithms. 

TF-IDF is a statistical measure that reflects the importance of a word in a document or a corpus of documents. The TF-IDF vectorizer computes a numerical vector for each document in a corpus based on the frequency of each word in the document and the inverse frequency of the word in the corpus. This approach assigns higher weights to words that are more frequent in the document but less frequent in the corpus, indicating that these words are more discriminative and informative for the document.

In [None]:
# Build TF-IDF features: ignore terms that appear in fewer than 50 documents
# and keep at most 5000 terms.
vectorizer = TfidfVectorizer(min_df=50,max_features=5000)
# Fit on the cleaned text column directly instead of on `x`: the original cell
# overwrote its own input (`x` went from text to a numeric array), so running
# the cell a second time fed the array back into fit_transform.
# .toarray() converts the sparse result to a dense array for the models below.
x = vectorizer.fit_transform(df['text']).toarray()

In [None]:
# Persist the fitted TF-IDF vectorizer to 'tfidf.pkl' so it can be reloaded later.
with open('tfidf.pkl', 'wb') as pickle_file:
    pickle.dump(vectorizer, pickle_file)

### Splitting the Dataset

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=21,
)

In [None]:
# Shapes of the training and test feature matrices.
X_train.shape,X_test.shape

# Trying out different ML Models

#### Naive Bayes



In [None]:
# Three Naive Bayes variants combined in one soft-voting ensemble
# (soft voting averages the predicted class probabilities).
nb, nb2, nb3 = GaussianNB(), BernoulliNB(), MultinomialNB()
nb_members = [
    ('GaussianNB', nb),
    ('BernoulliNB', nb2),
    ('MultinomialNB', nb3),
]
VotingClassifiers = VotingClassifier(estimators=nb_members, voting='soft')
VotingClassifiers.fit(X_train, y_train)

# Accuracy on the data the ensemble was fitted on and on the held-out data.
train_score = VotingClassifiers.score(X_train, y_train)
test_score = VotingClassifiers.score(X_test, y_test)
print('Training score:', train_score)
print('Testing score:', test_score)

In [None]:
# Confusion matrix and per-class metrics for the Naive Bayes ensemble on the test set.
y_act=y_test
y_pred=VotingClassifiers.predict(X_test)
# fmt='d' prints the cell counts as plain integers; without it seaborn formats
# annotations with '.2g', which shows large counts in scientific notation.
ax = sns.heatmap(confusion_matrix(y_act,y_pred),annot=True,fmt='d')
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set(xlabel='Predicted label', ylabel='True label')
print(classification_report(y_act,y_pred))

### Random Forest

In [None]:
# Randomized hyper-parameter search over a small random-forest grid.
rf_param_space = {
    'n_estimators': [4, 5],
    'criterion': ['entropy'],
    'max_depth': range(1, 4),
    'min_samples_split': range(2, 5),
}
# The search's random_state only fixes which parameter combinations are drawn;
# the forest itself needs its own seed, otherwise the scores change on every run.
classifiers = RandomizedSearchCV(
    RandomForestClassifier(random_state=12),
    rf_param_space,
    random_state=12,
)
classifiers.fit(X_train, y_train)
print('Training score:',classifiers.score(X_train, y_train))
print('Testing score:',classifiers.score(X_test,y_test))
print(classifiers.best_estimator_)

In [None]:
# Confusion matrix and per-class metrics for the tuned random forest on the test set.
y_act=y_test
y_pred=classifiers.predict(X_test)
# fmt='d' prints the cell counts as plain integers; without it seaborn formats
# annotations with '.2g', which shows large counts in scientific notation.
ax = sns.heatmap(confusion_matrix(y_act,y_pred),annot=True,fmt='d')
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set(xlabel='Predicted label', ylabel='True label')
print(classification_report(y_act,y_pred))

### Decision Tree

In [None]:
# Shallow decision tree with random split selection and a fixed seed.
model2 = DecisionTreeClassifier(
    criterion='gini',
    splitter='random',
    min_samples_leaf=70,
    max_depth=4,
    random_state=0,
)
model2.fit(X_train, y_train)

# Accuracy on the training split, then on the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(model2.score(features, labels))

In [None]:
# Confusion matrix and per-class metrics for the decision tree on the test set.
y_act=y_test
y_pred=model2.predict(X_test)
# fmt='d' prints the cell counts as plain integers; without it seaborn formats
# annotations with '.2g', which shows large counts in scientific notation.
ax = sns.heatmap(confusion_matrix(y_act,y_pred),annot=True,fmt='d')
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set(xlabel='Predicted label', ylabel='True label')
print(classification_report(y_act,y_pred))

### Gradient Boosting 

In [None]:
# model3 = RandomizedSearchCV(GradientBoostingClassifier(),{"learning_rate": range(3,5),
#                 "max_depth":[100],"max_features":range(6,10,2),
#                  "n_estimators":[10]},random_state=21,n_jobs=-1)
# model3.fit(X_train,y_train)
# print('Training score:',model3.score(X_train,y_train))
# print('Testing score:',model3.score(X_test,y_test))
# model3.best_params_

In [None]:
#confusion matrix and classification report
# y_act=y_test
# y_pred=model3.predict(X_test)
# sns.heatmap(confusion_matrix(y_act,y_pred),annot=True)
# print(classification_report(y_act,y_pred))

### XGB

In [None]:
# XGBoost needs numeric targets, so encode the two class labels once and reuse
# the encoded series. Series.map is used instead of Series.replace because
# replacing strings with ints relies on object-dtype downcasting, which recent
# pandas versions deprecate; map yields an integer series directly.
label_codes = {"non-suicide":0,'suicide':1}
y_train_encoded = y_train.map(label_codes)
y_test_encoded = y_test.map(label_codes)

# NOTE(review): learning_rate=1.99 lies above the [0, 1] range documented for
# XGBoost's eta, and 'map' is a ranking metric - confirm both are intended.
# Values are kept unchanged so the reported scores stay comparable.
modelx = XGBClassifier( eval_metric='map',max_depth=200,n_estimators=70,learning_rate=1.99)
modelx.fit(X_train,y_train_encoded)
print('Training score:',modelx.score(X_train,y_train_encoded))
print('Testing score:',modelx.score(X_test,y_test_encoded))

In [None]:
# Confusion matrix and classification report for the XGBoost model.
# Encode the true labels as 0/1 so they match the model's numeric predictions.
# Series.map is used instead of Series.replace because replacing strings with
# ints relies on object-dtype downcasting that recent pandas versions deprecate.
y_act = y_test.map({"non-suicide":0,'suicide':1})
y_pred = modelx.predict(X_test)
# fmt='d' prints the cell counts as plain integers; without it seaborn formats
# annotations with '.2g', which shows large counts in scientific notation.
ax = sns.heatmap(confusion_matrix(y_act,y_pred),annot=True,fmt='d')
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set(xlabel='Predicted label', ylabel='True label')
print(classification_report(y_act,y_pred))

### KNN

In [None]:
# model = RandomizedSearchCV(KNeighborsClassifier(),{'n_neighbors':[223],'metric':['manhattan','minkowski','cosine','tanimoto'],
#                                                    'p':[1,2]},random_state=42,n_jobs=-1)
# model.fit(X_train, y_train)
# print('Training score:',model.score(X_train, y_train))
# print('Testing score:',model.score(X_test,y_test))
# print(model.best_estimator_)

In [None]:
# y_act = y_test
# y_pred = model.predict(X_test)
# sns.heatmap(confusion_matrix(y_act,y_pred),annot=True,cmap='summer')
# print(classification_report(y_act,y_pred))

# Conclusion
### Naive Bayes has the highest accuracy: training accuracy 90.4%, testing accuracy 89%

In [None]:
def preprocess(inp):
    """Turn a raw sentence into a TF-IDF feature array for the trained models.

    Applies lower-casing, punctuation removal, stop-word removal and Porter
    stemming, then transforms the result with the fitted module-level
    ``vectorizer``.

    Parameters
    ----------
    inp : str
        Raw input sentence.

    Returns
    -------
    array
        Dense array with one row (the single input sentence) produced by
        ``vectorizer.transform(...).toarray()``.
    """
    import re  # local import keeps this function self-contained

    inp = inp.lower() #convert to lower case
    # str.replace() treats its first argument as a literal string, so the
    # previous call never removed any punctuation. re.sub applies the pattern
    # as a regular expression, matching the regex used on the training text.
    inp = re.sub(r'[^\w\s]+', '', inp) #remove punctuations
    inp = [word for word in inp.split() if word not in (stop_words)] #tokenize and drop stop words
    inp = ' '.join([ps.stem(i) for i in inp]) #stemming
    inputToModel = vectorizer.transform([inp]).toarray() #transform to vector form
    return inputToModel

In [None]:
def test_sentence(input_text):
    """Classify one sentence with the XGBoost model and print the verdict.

    The text is echoed, converted to features by ``preprocess`` and passed to
    ``modelx.predict``; a predicted label of 0 is reported as no self harm,
    any other label as self harm detected.
    """
    print('Given Input : ',input_text)
    features = preprocess(input_text)  # raw text -> TF-IDF feature array
    label = modelx.predict(features)[0]  # prediction for the single input row
    verdict = 'Output: No self harm detected' if label == 0 else 'Output: Self harm detected'
    print(verdict)
    

In [None]:
# Manual check of the classifier on an explicit example sentence.
test_sentence('I want to die')

In [None]:
# Manual check on a short, ambiguous request.
test_sentence('please help')

In [None]:
# Manual check on a sentence that reuses "please help" in an everyday context.
test_sentence('please help me bake a cake')

In [None]:
# Manual check on a sentence that contains punctuation and no explicit keyword.
test_sentence('It is too much, I cant take it anymore')

In [None]:
# Switch the seaborn colour palette to 'Paired' for the plots that follow.
sns.set_palette('Paired')

In [None]:

 
# Evaluation metrics for each model
# NOTE(review): these scores are typed in by hand rather than computed from the
# fitted models, and the KNN cell in this notebook is commented out - confirm
# the numbers still match the latest run.
model_names = ['Naive Bayes', 'Random Forest', 'Decision Tree', 'XGBoost', 'KNN']
accuracy = [0.89, 0.76, 0.76, 0.88, 0.82]
precision = [0.89, 0.72, 0.74, 0.88, 0.80]
recall = [0.89, 0.85, 0.79, 0.89, 0.81]
f1_score = [0.89, 0.78, 0.76, 0.89, 0.82]

# One (legend label, scores) pair per bar within each model's group
metric_series = [
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1_score),
]

# Set the width of the bars
barWidth = 0.2

# Left-most bar position of every model group on the x-axis
group_start = np.arange(len(model_names))

# Create the bar graph: each metric is shifted one bar width to the right
plt.figure(figsize=(10, 6))
for slot, (metric_label, scores) in enumerate(metric_series):
    plt.bar(group_start + slot * barWidth, scores, width=barWidth, label=metric_label)

# Add xticks on the middle of the group bars. Bars are centred on their
# positions, so the centre of a group of k bars is (k - 1) / 2 bar widths to
# the right of the first bar (1.5 widths for four bars, not 1 width).
group_centre = group_start + barWidth * (len(metric_series) - 1) / 2
plt.xlabel('Model', fontweight='bold')
plt.xticks(group_centre, model_names)

# Add labels and title
plt.ylabel('Score', fontweight='bold')
plt.title('Evaluation Metrics of Different Machine Learning Models', fontweight='bold')
plt.legend()

# Show the graph
plt.show()
