In [211]:
# libraries
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
import time

# metrics evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

## Data + combining

In [171]:
depression_df = pd.read_csv('depression_dataset_reddit_twitter.csv')
depression_df.head()

Unnamed: 0,clean_text,is_depression
0,we understand that most people who reply immed...,1
1,welcome to r depression s check in post a plac...,1
2,anyone else instead of sleeping more when depr...,1
3,i ve kind of stuffed around a lot in my life d...,1
4,sleep is my greatest and most comforting escap...,1


In [172]:
depression_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7731 entries, 0 to 7730
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_text     7731 non-null   object
 1   is_depression  7731 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 120.9+ KB


In [None]:
# Mapping for df {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'})

In [173]:
emotions_df = pd.read_csv('emotions_dataset.csv')
emotions_df.head()

Unnamed: 0,text,label
0,i feel awful about it too because it s my job ...,0
1,im alone i feel awful,0
2,ive probably mentioned this before but i reall...,1
3,i was feeling a little low few days back,0
4,i beleive that i am much more sensitive to oth...,2


In [174]:
emotions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416809 entries, 0 to 416808
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    416809 non-null  object
 1   label   416809 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 6.4+ MB


In [175]:
emotions_count = emotions_df['label'].value_counts()
emotions_count.sort_index(inplace=True)
emotions_count

0    121187
1    141067
2     34554
3     57317
4     47712
5     14972
Name: label, dtype: int64

In [176]:
depression_only_df = depression_df[depression_df['is_depression'] == 1].copy()
depression_only_df.reset_index()
depression_only_df.shape

(3831, 2)

In [177]:
depression_only_df.rename({'clean_text': 'text', 'is_depression': 'label'}, axis=1, inplace=True)
depression_only_df.loc[:, 'label'] = 0
depression_only_df

Unnamed: 0,text,label
0,we understand that most people who reply immed...,0
1,welcome to r depression s check in post a plac...,0
2,anyone else instead of sleeping more when depr...,0
3,i ve kind of stuffed around a lot in my life d...,0
4,sleep is my greatest and most comforting escap...,0
...,...,...
3826,thlolo march eh it s because i don t want stre...,0
3827,i hate it when i m having depression day and t...,0
3828,educational depression,0
3829,dmt powder helping with depression amp anxiety...,0


The below dataset comes from https://www.kaggle.com/datasets/ritresearch/happydb

In [178]:
happy_df = pd.read_csv('happy_dataset.csv')
happy_df.head()

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,affection
1,27674,2,24h,I was happy when my son got 90% marks in his e...,I was happy when my son got 90% marks in his e...,True,1,,affection
2,27675,1936,24h,I went to the gym this morning and did yoga.,I went to the gym this morning and did yoga.,True,1,,exercise
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,bonding
4,27677,6227,24h,I went with grandchildren to butterfly display...,I went with grandchildren to butterfly display...,True,1,,affection


In [179]:
happy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100535 entries, 0 to 100534
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   hmid                   100535 non-null  int64 
 1   wid                    100535 non-null  int64 
 2   reflection_period      100535 non-null  object
 3   original_hm            100535 non-null  object
 4   cleaned_hm             100535 non-null  object
 5   modified               100535 non-null  bool  
 6   num_sentence           100535 non-null  int64 
 7   ground_truth_category  14125 non-null   object
 8   predicted_category     100535 non-null  object
dtypes: bool(1), int64(3), object(5)
memory usage: 6.2+ MB


In [180]:
happy_df['predicted_category'].unique()

array(['affection', 'exercise', 'bonding', 'leisure', 'achievement',
       'enjoy_the_moment', 'nature'], dtype=object)

In [181]:
happy_df.predicted_category = happy_df.predicted_category.replace({'affection': 'love', 'exercise': 'joy', 'bonding': 'joy', 'leisure': 'joy', 'achievement': 'joy', 'enjoy_the_moment': 'joy', 'nature': 'joy'})


In [182]:
happy_df['predicted_category'].unique()

array(['love', 'joy'], dtype=object)

In [183]:
happy_df2 = happy_df[['cleaned_hm', 'predicted_category']].copy()
happy_df2.reset_index()
happy_df2.shape

(100535, 2)

In [184]:
happy_df2.rename({'cleaned_hm': 'text', 'predicted_category': 'label'}, axis=1, inplace=True)
happy_df2.label = happy_df2.label.replace({'love': 2, 'joy': 1})

In [185]:
happy_df2.head()

Unnamed: 0,text,label
0,I went on a successful date with someone I fel...,2
1,I was happy when my son got 90% marks in his e...,2
2,I went to the gym this morning and did yoga.,1
3,We had a serious talk with some friends of our...,1
4,I went with grandchildren to butterfly display...,2


In [186]:
new_emotions_df = pd.concat([emotions_df, depression_only_df, happy_df2])
new_emotions_df.reset_index(inplace=True)
new_emotions_df.drop('index', axis=1, inplace=True)
new_emotions_df

Unnamed: 0,text,label
0,i feel awful about it too because it s my job ...,0
1,im alone i feel awful,0
2,ive probably mentioned this before but i reall...,1
3,i was feeling a little low few days back,0
4,i beleive that i am much more sensitive to oth...,2
...,...,...
521170,My husband announced he is getting a decent bo...,2
521171,Had a can of Pepsi to drink.,1
521172,Cuddling with my girlfriend last night.,2
521173,I had a great meeting yesterday at work with m...,1


In [187]:
new_emotions_count = new_emotions_df['label'].value_counts()
new_emotions_count.sort_index(inplace=True)
new_emotions_count

0    125018
1    207434
2     68722
3     57317
4     47712
5     14972
Name: label, dtype: int64

In [188]:
new_emotions_df.to_csv('all_emotions.csv',index=False)

## Data Pre-processing

In [189]:
df = pd.read_csv('all_emotions.csv')
df.head()

Unnamed: 0,text,label
0,i feel awful about it too because it s my job ...,0
1,im alone i feel awful,0
2,ive probably mentioned this before but i reall...,1
3,i was feeling a little low few days back,0
4,i beleive that i am much more sensitive to oth...,2


In [241]:
df['label'].unique()

array([0, 1, 2, 3, 4, 5])

In [190]:
# import sys  
# !{sys.executable} -m pip install contractions

In [191]:
df['text'] = df['text'].apply(lambda x: x.lower())

In [192]:
import contractions
df['text'] = [contractions.fix(text) for text in df['text']]

In [193]:
df['text'] = df['text'].str.replace('[^ a-zA-Z]', ' ', regex=True)

Test if lemmatisation makes accuracy better or not. Stemming already shown to have lower accuracy. Compare unigram and bigram also

Lemmatisation

In [194]:
df['text_tokenized'] = df['text'].apply(lambda x: x.split())
df['text_tokenized']

0         [i, feel, awful, about, it, too, because, it, ...
1                            [i, am, alone, i, feel, awful]
2         [i, have, probably, mentioned, this, before, b...
3         [i, was, feeling, a, little, low, few, days, b...
4         [i, beleive, that, i, am, much, more, sensitiv...
                                ...                        
521170    [my, husband, announced, he, is, getting, a, d...
521171                  [had, a, can, of, pepsi, to, drink]
521172        [cuddling, with, my, girlfriend, last, night]
521173    [i, had, a, great, meeting, yesterday, at, wor...
521174             [i, had, a, great, workout, last, night]
Name: text_tokenized, Length: 521175, dtype: object

In [195]:
stop_words = stopwords.words('english')

df['text_tokenized'] = df['text_tokenized'].apply(lambda x: [word for word in x if word not in stop_words])
df['text_tokenized']

0         [feel, awful, job, get, position, succeed, hap...
1                                      [alone, feel, awful]
2         [probably, mentioned, really, feel, proud, act...
3                        [feeling, little, low, days, back]
4         [beleive, much, sensitive, peoples, feelings, ...
                                ...                        
521170    [husband, announced, getting, decent, bonus, q...
521171                                       [pepsi, drink]
521172                  [cuddling, girlfriend, last, night]
521173    [great, meeting, yesterday, work, boss, collea...
521174                        [great, workout, last, night]
Name: text_tokenized, Length: 521175, dtype: object

In [196]:
lemmatizer = WordNetLemmatizer()

df['text_lemma'] = df['text_tokenized'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
df['text_lemma']

0         [feel, awful, job, get, position, succeed, hap...
1                                      [alone, feel, awful]
2         [probably, mentioned, really, feel, proud, act...
3                         [feeling, little, low, day, back]
4         [beleive, much, sensitive, people, feeling, te...
                                ...                        
521170    [husband, announced, getting, decent, bonus, q...
521171                                       [pepsi, drink]
521172                  [cuddling, girlfriend, last, night]
521173    [great, meeting, yesterday, work, bos, colleag...
521174                        [great, workout, last, night]
Name: text_lemma, Length: 521175, dtype: object

In [197]:
df['text_lemma'] = df['text_lemma'].apply(lambda x: " ".join(x))
df['text_lemma']

0                feel awful job get position succeed happen
1                                          alone feel awful
2         probably mentioned really feel proud actually ...
3                               feeling little low day back
4         beleive much sensitive people feeling tend com...
                                ...                        
521170       husband announced getting decent bonus quarter
521171                                          pepsi drink
521172                       cuddling girlfriend last night
521173    great meeting yesterday work bos colleague wen...
521174                             great workout last night
Name: text_lemma, Length: 521175, dtype: object

In [198]:
X_lemma_train, X_lemma_test, y_lemma_train, y_lemma_test = train_test_split(df['text_lemma'], df['label'], test_size=0.25, random_state=11)

In [199]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.25, random_state=11)

In [200]:
print("Train: ",X_lemma_train.shape, y_lemma_train.shape,"Test: ",(X_lemma_test.shape,y_lemma_test.shape))
print("Train: ",X_train.shape, y_train.shape,"Test: ",(X_test.shape,y_test.shape))

Train:  (390881,) (390881,) Test:  ((130294,), (130294,))
Train:  (390881,) (390881,) Test:  ((130294,), (130294,))


Unigram

In [202]:
lemma_vectorizer= TfidfVectorizer(lowercase=False, stop_words=stop_words, ngram_range=(1,1))
tf_x_lemma_train_1 = lemma_vectorizer.fit_transform(X_lemma_train)
tf_x_lemma_test_1 = lemma_vectorizer.transform(X_lemma_test)

In [203]:
text_vectorizer= TfidfVectorizer(lowercase=False, stop_words=stop_words, ngram_range=(1,1))
tf_x_train_1 = text_vectorizer.fit_transform(X_train)
tf_x_test_1 = text_vectorizer.transform(X_test)

In [204]:
text_vectorizer= TfidfVectorizer(lowercase=False, stop_words='english', ngram_range=(1,1))
tf_x_train_1a = text_vectorizer.fit_transform(X_train)
tf_x_test_1a = text_vectorizer.transform(X_test)

Bigram

In [205]:
lemma_vectorizer= TfidfVectorizer(lowercase=False, stop_words='english', ngram_range=(2,2))
tf_x_lemma_train_2 = lemma_vectorizer.fit_transform(X_lemma_train)
tf_x_lemma_test_2 = lemma_vectorizer.transform(X_lemma_test)

In [206]:
text_vectorizer= TfidfVectorizer(lowercase=False, stop_words=stop_words, ngram_range=(2,2))
tf_x_train_2 = text_vectorizer.fit_transform(X_train)
tf_x_test_2 = text_vectorizer.transform(X_test)

In [207]:
text_vectorizer= TfidfVectorizer(lowercase=False, stop_words='english', ngram_range=(2,2))
tf_x_train_2a = text_vectorizer.fit_transform(X_train)
tf_x_test_2a = text_vectorizer.transform(X_test)

## Model

In [214]:
# compare the various types of logistic regression models
solver = ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']

In [213]:
for sol in solver:
    startTime = time.time()

    lr_lemma_1 = LogisticRegression(class_weight='balanced', random_state=11, solver=sol)
    lr_lemma_1.fit(tf_x_lemma_train_1, y_lemma_train)

    y_lemma_pred_1 = lr_lemma_1.predict(tf_x_lemma_test_1)

    print(sol)
    print(f'Accuracy Lemma: {accuracy_score(y_lemma_test, y_lemma_pred_1)}')
    print(f'F1 Score Lemma: {f1_score(y_lemma_test, y_lemma_pred_1, average="weighted")}')
    print('Running time: {0}'.format(time.time() - startTime))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


lbfgs
Accuracy Lemma: 0.8912075767111303
F1 Score Lemma: 0.8933947147614666
Running time: 13.266667127609253
liblinear
Accuracy Lemma: 0.8931570141372588
F1 Score Lemma: 0.8945049995079697
Running time: 21.36852478981018
newton-cg
Accuracy Lemma: 0.8900102844336654
F1 Score Lemma: 0.8921661728330171
Running time: 27.35059881210327




sag
Accuracy Lemma: 0.8897646860177751
F1 Score Lemma: 0.8919424799787268
Running time: 36.762819051742554
saga
Accuracy Lemma: 0.8894960627503953
F1 Score Lemma: 0.8908484817845681
Running time: 38.34941077232361




In [215]:
for sol in solver:
    startTime = time.time()

    lr_text_1 = LogisticRegression(class_weight='balanced', random_state=11, solver=sol)
    lr_text_1.fit(tf_x_train_1, y_train)

    y_text_pred_1 = lr_text_1.predict(tf_x_test_1)
    
    print(sol)
    print(f'Accuracy Lemma: {accuracy_score(y_test, y_text_pred_1)}')
    print(f'F1 Score Lemma: {f1_score(y_test, y_text_pred_1, average="weighted")}')
    print('Running time: {0}'.format(time.time() - startTime))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


lbfgs
Accuracy Lemma: 0.8935177368105975
F1 Score Lemma: 0.8955389078631671
Running time: 14.198921918869019
liblinear
Accuracy Lemma: 0.8932798133452039
F1 Score Lemma: 0.8946314834414637
Running time: 21.571985721588135
newton-cg
Accuracy Lemma: 0.8905014812654458
F1 Score Lemma: 0.8926299877866202
Running time: 31.116802215576172




sag
Accuracy Lemma: 0.890800804334812
F1 Score Lemma: 0.8929108883026867
Running time: 38.07388639450073
saga
Accuracy Lemma: 0.889304188987981
F1 Score Lemma: 0.891571620356086
Running time: 39.94084095954895




In [218]:
for sol in solver:
    startTime = time.time()

    lr_text_1a = LogisticRegression(class_weight='balanced', random_state=11, solver=sol)
    lr_text_1a.fit(tf_x_train_1a, y_train)

    y_text_pred_1a = lr_text_1a.predict(tf_x_test_1a)
    
    print(sol)
    print(f'Accuracy Lemma: {accuracy_score(y_test, y_text_pred_1a)}')
    print(f'F1 Score Lemma: {f1_score(y_test, y_text_pred_1a, average="weighted")}')
    print('Running time: {0}'.format(time.time() - startTime))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


lbfgs
Accuracy Lemma: 0.8935177368105975
F1 Score Lemma: 0.8955389078631671
Running time: 14.334487199783325
liblinear
Accuracy Lemma: 0.8932798133452039
F1 Score Lemma: 0.8946314834414637
Running time: 26.609421968460083
newton-cg
Accuracy Lemma: 0.8905014812654458
F1 Score Lemma: 0.8926299877866202
Running time: 31.52431893348694




sag
Accuracy Lemma: 0.890800804334812
F1 Score Lemma: 0.8929108883026867
Running time: 37.3901629447937
saga
Accuracy Lemma: 0.889304188987981
F1 Score Lemma: 0.891571620356086
Running time: 39.841620206832886




In [219]:
for sol in solver:
    startTime = time.time()

    lr_lemma_2 = LogisticRegression(class_weight='balanced', random_state=11, solver=sol)
    lr_lemma_2.fit(tf_x_lemma_train_2, y_lemma_train)

    y_lemma_pred_2 = lr_lemma_2.predict(tf_x_lemma_test_2)

    print(sol)
    print(f'Accuracy Lemma: {accuracy_score(y_lemma_test, y_lemma_pred_2)}')
    print(f'F1 Score Lemma: {f1_score(y_lemma_test, y_lemma_pred_2, average="weighted")}')
    print('Running time: {0}'.format(time.time() - startTime))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


lbfgs
Accuracy Lemma: 0.803805240456199
F1 Score Lemma: 0.8058005168829081
Running time: 141.64585208892822
liblinear
Accuracy Lemma: 0.8117564891706448
F1 Score Lemma: 0.8107689907270532
Running time: 28.361366987228394




newton-cg
Accuracy Lemma: 0.8134910279828695
F1 Score Lemma: 0.8144254759903472
Running time: 90.0956518650055




sag
Accuracy Lemma: 0.8133145041214485
F1 Score Lemma: 0.8142583276779258
Running time: 46.54056978225708
saga
Accuracy Lemma: 0.8128079573886748
F1 Score Lemma: 0.813774445186686
Running time: 48.73991394042969




In [220]:
for sol in solver:
    startTime = time.time()

    lr_text_2 = LogisticRegression(class_weight='balanced', random_state=11, solver=sol)
    lr_text_2.fit(tf_x_train_2, y_train)

    y_text_pred_2 = lr_text_2.predict(tf_x_test_2)
    
    print(sol)
    print(f'Accuracy Lemma: {accuracy_score(y_test, y_text_pred_2)}')
    print(f'F1 Score Lemma: {f1_score(y_test, y_text_pred_2, average="weighted")}')
    print('Running time: {0}'.format(time.time() - startTime))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


lbfgs
Accuracy Lemma: 0.7935591815432791
F1 Score Lemma: 0.7945361512783702
Running time: 132.30526494979858
liblinear
Accuracy Lemma: 0.8062842494665909
F1 Score Lemma: 0.8050936269067712
Running time: 27.955080032348633
newton-cg
Accuracy Lemma: 0.8084869602591064
F1 Score Lemma: 0.8093253448695314
Running time: 92.4678750038147




sag
Accuracy Lemma: 0.8089321073879073
F1 Score Lemma: 0.8097929187870062
Running time: 48.4337100982666
saga
Accuracy Lemma: 0.7855081584723779
F1 Score Lemma: 0.7996688907924324
Running time: 50.51747488975525




In [221]:
for sol in solver:
    startTime = time.time()

    lr_text_2a = LogisticRegression(class_weight='balanced', random_state=11, solver=sol)
    lr_text_2a.fit(tf_x_train_2a, y_train)

    y_text_pred_2a = lr_text_2a.predict(tf_x_test_2a)
    
    print(sol)
    print(f'Accuracy Lemma: {accuracy_score(y_test, y_text_pred_2a)}')
    print(f'F1 Score Lemma: {f1_score(y_test, y_text_pred_2a, average="weighted")}')
    print('Running time: {0}'.format(time.time() - startTime))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


lbfgs
Accuracy Lemma: 0.7935591815432791
F1 Score Lemma: 0.7945361512783702
Running time: 139.50385189056396
liblinear
Accuracy Lemma: 0.8062842494665909
F1 Score Lemma: 0.8050936269067712
Running time: 28.24203085899353
newton-cg
Accuracy Lemma: 0.8084869602591064
F1 Score Lemma: 0.8093253448695314
Running time: 94.29196691513062




sag
Accuracy Lemma: 0.8089321073879073
F1 Score Lemma: 0.8097929187870062
Running time: 49.073023319244385
saga
Accuracy Lemma: 0.7855081584723779
F1 Score Lemma: 0.7996688907924324
Running time: 50.08756709098816




## best result from lr_text_1 - non-lemmatised text, unigram + lbfgs logistic regression model

In [243]:
text_vectorizer= TfidfVectorizer(lowercase=False, stop_words='english', ngram_range=(1,1))
tf_x_train_1 = text_vectorizer.fit_transform(X_train)
tf_x_test_1 = text_vectorizer.transform(X_test)

In [242]:
lr_text_1 = LogisticRegression(class_weight='balanced', random_state=11, solver=sol)
lr_text_1.fit(tf_x_train_1, y_train)

y_text_pred_1 = lr_text_1.predict(tf_x_test_1)



In [246]:
import joblib

# Save models to disk
joblib.dump(text_vectorizer, 'tfidf_vectorizer.pkl', compress=True)
joblib.dump(lr_text_1, 'lr_model.pkl')

['lr_model.pkl']

use simpler ways to process data? https://textblob.readthedocs.io/en/dev/

In [227]:
#%pip install -U textblob

In [291]:
para = TextBlob("Today, my frend visited me and suprised me with my favorite cofee. I want to kill myself. My pet just died. It had been with me for my entire childhood. I miss it a lot. I want to die. The driver who ran over it shouldn't die.")
para2 = para.correct().lower()
sentences = para2.sentences
print(sentences)

[Sentence("today, my friend visited me and surprised me with my favorite coffee."), Sentence("i want to kill myself."), Sentence("by pet just died."), Sentence("it had been with me for my entire childhood."), Sentence("i miss it a lot."), Sentence("i want to die."), Sentence("the driver who ran over it shouldn't die.")]


In [292]:
for sent in sentences:
    print(sent)

today, my friend visited me and surprised me with my favorite coffee.
i want to kill myself.
by pet just died.
it had been with me for my entire childhood.
i miss it a lot.
i want to die.
the driver who ran over it shouldn't die.


In [309]:
'''
1) Expand contractions
2) Remove punctuations (except for full stop, comma, and apostrophe) and special characters
3) Turn into TextBlob to use the library's functions
4) Correct spelling mistakes
5) Lower text
6) Paragraph to Sentence
7) Tokenize Sentence  
8) Join Words
9) Parse the Sentences into the TfidfVectorizer
10) Parse the Feature into LogisticRegression Model
11) Update the emotion dictionary based on predicted count
12) Return emotion dictionary
'''

from collections import defaultdict
import re
from textblob import TextBlob
import contractions


def analyse_text(para):
    para = contractions.fix(para)
    para = re.sub(r"[^ a-zA-Z\.,']+", "", para)


    para = TextBlob(para)
    para = para.correct()
    para = para.lower()
    print(para)

    sentence_list = para.sentences
    
    tfidfvectorizer = joblib.load('tfidf_vectorizer.pkl')
    logreg = joblib.load('lr_model.pkl')

    emotions_results = defaultdict(int)
    sentiment_map = {
        0: "sadness/depression",
        1: "joy",
        2: "love",
        3: "anger",
        4: "fear",
        5: "surprise"
    }

    for sentence in sentence_list:
        print(f'Sentence: {sentence}')
        print(f'Processed Sentence: {sentence.words}')
        processed_sentence = " ".join(sentence.words)
        word_embedding = tfidfvectorizer.transform([processed_sentence])
        pred_emotion = logreg.predict(word_embedding)
        print(f'Sentiment Probability: {logreg.predict_proba(word_embedding)}')
        sentiment = sentiment_map[pred_emotion[0]]
        print(f'Predicted Sentiment: {sentiment}')
        emotions_results[sentiment] += 1

    return emotions_results

In [312]:
paragraph = "Today, my friend% visited me! and surprised me2 with my f-avorite coffee. I want to kill myself. My pet just died. It had been with me for my entire childhood. I miss it a lot. I want to die. The driver who ran over it should die."

analyse_text(paragraph)

today, my friend visited me and surprised me with my favorite coffee. i want to kill myself. by pet just died. it had been with me for my entire childhood. i miss it a lot. i want to die. the driver who ran over it should die.
Sentence: today, my friend visited me and surprised me with my favorite coffee.
Processed Sentence: ['today', 'my', 'friend', 'visited', 'me', 'and', 'surprised', 'me', 'with', 'my', 'favorite', 'coffee']
Sentiment Probability: [[4.73854381e-04 2.15245231e-01 1.60190246e-01 5.16238385e-04
  8.19174070e-05 6.23492514e-01]]
Predicted Sentiment: surprise
Sentence: i want to kill myself.
Processed Sentence: ['i', 'want', 'to', 'kill', 'myself']
Sentiment Probability: [[0.51989562 0.13898658 0.07444536 0.21734987 0.04512935 0.00419322]]
Predicted Sentiment: sadness/depression
Sentence: by pet just died.
Processed Sentence: ['by', 'pet', 'just', 'died']
Sentiment Probability: [[0.68799786 0.06300891 0.17602325 0.02286662 0.0483501  0.00175327]]
Predicted Sentiment: sad

defaultdict(int, {'surprise': 1, 'sadness/depression': 3, 'joy': 2, 'fear': 1})