In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
data = pd.read_csv('stress_detection.csv')
print(data.head(5))
data["label"] = data["label"].map({0: "No Stress", 1: "Stress"})
df = data[["text", "label"]]

          subreddit post_id sentence_range  \
0              ptsd  8601tu       (15, 20)   
1        assistance  8lbrx9         (0, 5)   
2              ptsd  9ch1zh       (15, 20)   
3     relationships  7rorpp        [5, 10]   
4  survivorsofabuse  9p2gbc         [0, 5]   

                                                text     id  label  \
0  He said he had not felt that way before, sugge...  33181      1   
1  Hey there r/assistance, Not sure if this is th...   2606      0   
2  My mom then hit me with the newspaper and it s...  38816      1   
3  until i met my new boyfriend, he is amazing, h...    239      1   
4  October is Domestic Violence Awareness Month a...   1421      1   

   confidence  social_timestamp  social_karma  syntax_ari  ...  \
0         0.8        1521614353             5    1.806818  ...   
1         1.0        1527009817             4    9.429737  ...   
2         0.8        1535935605             2    7.769821  ...   
3         0.6        1516429555       

In [2]:
import nltk
import re
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string
stopword=set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SHAURYA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text
df["text"] = df["text"].apply(clean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = df["text"].apply(clean)


In [4]:
#I want the model to completely classify a sentence as "stress" if the words : stressed,abuse,rape etc are used. Here's an additional code to the model
keywords = ['stress', 'abuse', 'rape']

for keyword in keywords:
    df[keyword] = df["text"].str.contains(keyword, case=False).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[keyword] = df["text"].str.contains(keyword, case=False).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[keyword] = df["text"].str.contains(keyword, case=False).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[keyword] = df["text"].str.contains(keyword, case=False).asty

In [5]:

Vector = CountVectorizer(stop_words='english', max_features=50000)
XT = Vector.fit_transform(np.array(df["text"]))
#Implementing the filter of keywords
X = pd.concat([pd.DataFrame(XT.toarray(), columns=Vector.get_feature_names_out()), df[keywords]], axis=1)
# Split data into training and testing sets (ratio = 1:3)
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.25, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

In [6]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
#Prints the accuracy of our model
print(f"Accuracy: {accuracy}\nClassification Report:\n{report}")


Accuracy: 0.7281690140845071
Classification Report:
              precision    recall  f1-score   support

   No Stress       0.73      0.68      0.70       335
      Stress       0.73      0.77      0.75       375

    accuracy                           0.73       710
   macro avg       0.73      0.73      0.73       710
weighted avg       0.73      0.73      0.73       710



In [8]:
keywords = ['stress', 'abuse', 'rape']
#Here is an attempt in including keywords to automatically detect "stress" in a sentence
# For User Input
user_input = input("Enter a sentence: ")

# Create a DataFrame for user input
user_df = pd.DataFrame({'text': [user_input]})

# Using the same method to predict the data
for keyword in keywords:
    user_df[keyword] = user_df['text'].str.contains(keyword, case=False).astype(int)

# Vectorize user input using the trained vectorizer
user_text_vectorized = Vector.transform(user_df['text'])

# Combine CountVectorizer features with keyword features
user_input_features = pd.concat([pd.DataFrame(user_text_vectorized.toarray(), columns=Vector.get_feature_names_out()), user_df[keywords]], axis=1)

# Use the trained model to predict the input
user_prediction = model.predict(user_input_features)

print("Predicted Label:", user_prediction[0])

Predicted Label: No Stress
