In [1]:
# Carrying out the imports.
import numpy as np
import pandas as pd
import re

from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, plot_roc_curve, roc_auc_score, \
                            accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

import time

In [2]:
# Read df.
df = pd.read_csv('../datasets/final_df.csv')
print(df.shape)
df.head()

(38716, 5)


Unnamed: 0,emotion,author,post,post_length,cleaned_post
0,Anger,aboowwabooww,I need help I have undiagnosed and untreated a...,1239,i need help i have undiagnosed and untreated a...
1,Anger,69andeverything,How can someone who doesn't get angry help my ...,772,how can someone who doesnt get angry help my b...
2,Anger,mailception,how I make it stop ? anyone please ? All I can...,1053,how i make it stop anyone please all i can e...
3,Anger,lemonsandrosemary,Shattered a Window Today Just like the title s...,376,shattered a window today just like the title s...
4,Anger,sadtimesecondary,I need help with my irritable depression \nBas...,820,i need help with my irritable depression \nbas...


In [3]:
# Checking for null cells.
df.isna().sum()

emotion         0
author          0
post            0
post_length     0
cleaned_post    7
dtype: int64

In [4]:
# Since it's only 7 rows, I drop them.
df.dropna(axis=0, inplace=True)

In [5]:
# Instantiating.
lemmatizer = WordNetLemmatizer()

In [6]:
# Defining my X.
df['lem_post'] = [lemmatizer.lemmatize(w) for w in df['cleaned_post']]
X = df['lem_post']

In [7]:
# Assigning numerical values to my emotional classes for classification.
df['emotion_cat'] = df['emotion'].map({'Anger' : 1, 'Disgust': 2, 'Fear': 3, 'Joy': 4, 'Neutral': 5, 'Sadness': 6, 'Surprise': 7})

In [8]:
# Defining my y.
y = df['emotion_cat']

In [9]:
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state=42)

In [10]:
cvec = CountVectorizer(max_features = 500, stop_words = 'english')

In [11]:
X_train_cvec = pd.DataFrame(
    cvec.fit_transform(X_train).todense(),
    columns = cvec.get_feature_names()
)

In [12]:
X_test_cvec = pd.DataFrame(
    cvec.transform(X_test).todense(),
    columns = cvec.get_feature_names()
)

In [13]:
X_train_cvec.head()

Unnamed: 0,10,15,20,able,absolutely,actually,advice,afraid,age,ago,...,wouldnt,writing,wrong,x200b,year,years,yesterday,youre,youtube,zen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
tvec = TfidfVectorizer()

In [15]:
X_train_tvec = pd.DataFrame(
    tvec.fit_transform(X_train).todense(),
    columns = tvec.get_feature_names()
)


In [16]:
X_test_tvec = pd.DataFrame(
    tvec.transform(X_test).todense(),
    columns = tvec.get_feature_names()
)

In [17]:
X_train_tvec.head()

Unnamed: 0,00,000,0000,00009,001,0030,005,00s,01,010,...,zs,zshrc,zuckerberg,zuhause,zuigan,zunsu,zuowang,zurich,zutang,zyprexa
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
y_test

15089    3
26625    5
27616    5
16993    4
28080    5
        ..
14185    3
24262    5
15869    3
38540    7
8364     2
Name: emotion_cat, Length: 9678, dtype: int64

In [15]:
# checking distribution of y_train
y_train.value_counts(normalize=True)

5    0.194275
3    0.174365
1    0.164652
4    0.139575
6    0.125969
7    0.116944
2    0.084220
Name: emotion_cat, dtype: float64

In [16]:
# checking distribution of y_test
y_test.value_counts(normalize=True)

5    0.194255
3    0.174416
1    0.164703
4    0.139492
6    0.125956
7    0.116966
2    0.084212
Name: emotion_cat, dtype: float64

In [17]:
baseline_model = DummyClassifier(random_state=42)
baseline_model.fit(X_train_cvec, y_train)
baseline_predictions = baseline_model.predict(X_test_cvec)

In [18]:
accuracy_score(y_test, baseline_predictions)

0.19425501136598472

In [19]:
cross_val_score(baseline_model,X_train_cvec,y_train,cv=5).mean()

0.19427508617564618

In [19]:
from sklearn.tree import DecisionTreeClassifier

In [20]:
dtree_model = DecisionTreeClassifier(max_depth = 2).fit(X_train_cvec, y_train)
dtree_predictions = dtree_model.predict(X_test_cvec)

In [21]:
accuracy_score(y_test, dtree_predictions)

0.2429220913411862

In [24]:
f1_score(y_test, dtree_predictions, average='weighted')

0.17825290240735703

In [210]:
cross_val_score(dtree_model,X_train_cvec,y_train,cv=5).mean()

0.24394605890084434

In [21]:
dtree_model2 = DecisionTreeClassifier(max_depth = 2).fit(X_train_tvec, y_train)
dtree_predictions2 = dtree_model2.predict(X_test_tvec)
print(accuracy_score(y_test, dtree_predictions2))
print(f1_score(y_test, dtree_predictions2, average='weighted'))
print(cross_val_score(dtree_model2,X_train_tvec,y_train,cv=5).mean())

0.2420954742715437
0.17724879369330307
0.24363604665185762


In [23]:
from sklearn.svm import SVC

In [25]:
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_train_cvec, y_train)
svm_predictions = svm_model_linear.predict(X_test_cvec)

In [26]:
accuracy_score(y_test, svm_predictions)

0.5645794585658194

In [28]:
f1_score(y_test, svm_predictions, average='weighted')

0.5757793867132268

In [29]:
cross_val_score(svm_model_linear,X_train_cvec,y_train,cv=5).mean()

0.565016771840037

In [None]:
svm_model_linear2 = SVC(kernel = 'linear', C = 1).fit(X_train_tvec, y_train)
svm_predictions2 = svm_model_linear2.predict(X_test_tvec)
print(accuracy_score(y_test, svm_predictions2))
print(f1_score(y_test, svm_predictions2, average='weighted'))
print(cross_val_score(svm_model_linear2,X_train_tvec,y_train,cv=5).mean())

In [30]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB().fit(X_train_cvec, y_train)
gnb_predictions = gnb.predict(X_test_cvec)

In [31]:
accuracy_score(y_test, gnb_predictions)

0.43552386856788594

In [33]:
f1_score(y_test, gnb_predictions, average='weighted')

0.45696437212933705

In [None]:
cross_val_score(gnb, X_train_cvec,y_train,cv=5).mean()

**Bert**

In [1]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [4]:
import tensorflow as tf
import pandas as pd


In [None]:
# Copied but don't know how to use this.
dataset = tf.keras.utils.get_file(fname="aclImdb_v1.tar.gz", 
                                  origin=URL,
                                  untar=True,
                                  cache_dir='.',
                                  cache_subdir='')

In [None]:
train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN)

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)

In [None]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

model.fit(train_data, epochs=2, validation_data=validation_data)

In [None]:
pred_sentences 

In [None]:
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = ['Negative','Positive']
label = tf.argmax(tf_predictions, axis=1)
label = label.numpy()
for i in range(len(pred_sentences)):
  print(pred_sentences[i], ": \n", labels[label[i]])

**Hugging Face**

In [24]:
from transformers import pipeline

In [27]:
classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)

Downloading:   0%|          | 0.00/768 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at bhadresh-savani/distilbert-base-uncased-emotion.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Downloading:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [35]:
prediction = classifier("i need help")
print(prediction)

[[{'label': 'sadness', 'score': 0.21333540976047516}, {'label': 'joy', 'score': 0.2224060595035553}, {'label': 'love', 'score': 0.0075500537641346455}, {'label': 'anger', 'score': 0.16987456381320953}, {'label': 'fear', 'score': 0.38217902183532715}, {'label': 'surprise', 'score': 0.004654841031879187}]]


In [34]:
classifier = pipeline("text-classification",model='bhadresh-savani/bert-base-uncased-emotion', return_all_scores=True)


Downloading:   0%|          | 0.00/935 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Some layers from the model checkpoint at bhadresh-savani/bert-base-uncased-emotion were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at bhadresh-savani/bert-base-uncased-emotion.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


Downloading:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [36]:
prediction = classifier("i need help")
print(prediction)

[[{'label': 'sadness', 'score': 0.21333540976047516}, {'label': 'joy', 'score': 0.2224060595035553}, {'label': 'love', 'score': 0.0075500537641346455}, {'label': 'anger', 'score': 0.16987456381320953}, {'label': 'fear', 'score': 0.38217902183532715}, {'label': 'surprise', 'score': 0.004654841031879187}]]
