# Poem Sentiment Analysis

In [185]:
# Importing library
import numpy as np
import pandas as pd
import time 
import re

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, PoemTokenizer
english_stop_words = stopwords.words('english')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


#preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# machine learning

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from tensorflow.keras.utils import to_categorical

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, GlobalMaxPooling1D, SpatialDropout1D



from sklearn.metrics import classification_report

import seaborn as sns

In [186]:
#Reading the file
df = pd.read_csv("C:/Users/chira/OneDrive - fsm.ac.in/Desktop/Poem dataset.csv",sep=",", encoding='cp1252')

In [148]:
# information of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 749 entries, 0 to 748
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  749 non-null    int64 
 1   Context     749 non-null    object
 2   Review      749 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 17.7+ KB


In [149]:
df.head()

Unnamed: 0.1,Unnamed: 0,Context,Review
0,0,with pale blue berries. in these peaceful shad...,1
1,1,"it flows so long as falls the rain,",0
2,2,"and that is why, the lonesome day,",-1
3,3,"when i peruse the conquered fame of heroes, an...",2
4,4,of inward strife for truth and liberty.,2


In [150]:
df.Review.unique()

array([ 1,  0, -1,  2], dtype=int64)

In [151]:
target_train=df.Review

In [152]:
#Here -1 is negative, 2 is no impact, 1 is positive and 3 is mixed

{-1: 0, 0: 2, 1: 1, 2: 3}

{-1: 0, 0: 2, 1: 1, 2: 3}

In [153]:
df['Review'] = df['Review'].replace(['2'], '3')
df['Review'] = df['Review'].replace(['0'], '2')
df['Review'] = df['Review'].replace(['-1'], '0')

In [154]:
df['Review']

0      1
1      0
2     -1
3      2
4      2
      ..
744    0
745    1
746   -1
747    0
748   -1
Name: Review, Length: 749, dtype: int64

In [155]:
# Checking null values in dataset
df.isnull().sum()

Unnamed: 0    0
Context       0
Review        0
dtype: int64

In [156]:
#Splitting the data
X_train, X_valid = train_test_split(df['Context'],
                                  random_state=123,
                                  train_size=0.80    # 99% vs 1%
                                 )

print(X_train.shape)     # (1467709, 12)
print(X_valid.shape)     # (14826, 12) 1% of above

(599,)
(150,)


In [157]:
REPLACE_WITH_SPACE = re.compile("(@)")
SPACE = " "
# preprocess on the poem and removing @ 
def preprocess_reviews(reviews):  
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line.lower()) for line in reviews]
    
    return reviews

# preprocess training data based of poem
reviews_train_clean = preprocess_reviews(df.Context)

In [158]:
# removing all stop words and doing further p;reprocessing on the poem data
def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split()  if word not in english_stop_words]))
    return removed_stop_words

# removing stop words from training data
no_stop_words_train = remove_stop_words(reviews_train_clean)


In [159]:
# created a function for stemming words
def get_stemmed_text(corpus):
    stemmer = PorterStemmer()
    return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

# stemming text on train data
stemmed_reviews_train = get_stemmed_text(no_stop_words_train)


In [160]:
# using Tokenizer method to filter out special character and initializing its method
tokenizer = Tokenizer(
    num_words = 8000,
    filters = '"#$%&()*+-/:;<=>@[\]^_`{|}~'
)

In [161]:
# using Tokenizer method converting training poem data to 3000 features
tokenizer.fit_on_texts(stemmed_reviews_train)
X_train = tokenizer.texts_to_sequences(stemmed_reviews_train)
X_train = pad_sequences(X_train, maxlen = 3000)

In [162]:
#checking shape
print("Training features:",X_train.shape)
print("Training labels:", target_train.shape)

Training features: (749, 3000)
Training labels: (749,)


# Preprocessing and cleaning data


In [163]:
# training data feature
poem_train = df["Context"]


In [164]:
# using Tokenizer method to filter out special character and initializing its method
tokenizer = Tokenizer(
    num_words = 8000,
    filters = '"#$%&()*+-/:;<=>@[\]^_`{|}~'
)

In [165]:
# using Tokenizer method converting training peom data to 3000 features
tokenizer.fit_on_texts(poem_train)
x_train = tokenizer.texts_to_sequences(poem_train)
x_train = pad_sequences(x_train, maxlen = 3000)

In [167]:
# creating a sequential model with dense layers, dropouts and all activation function as relu to remove 
#negative value and last activation function as softamx for getting final value and initialized weights in first two layer
model = Sequential()
model.add(Dense(128, input_dim = x_train.shape[1], activation='relu', kernel_initializer='he_normal'))
model.add(Dense(256, activation='relu', kernel_initializer='he_normal'))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(4, activation='softmax'))
# summary of the model
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_22 (Dense)            (None, 128)               384128    
                                                                 
 dense_23 (Dense)            (None, 256)               33024     
                                                                 
 dropout_352 (Dropout)       (None, 256)               0         
                                                                 
 dense_24 (Dense)            (None, 512)               131584    
                                                                 
 dense_25 (Dense)            (None, 512)               262656    
                                                                 
 dropout_353 (Dropout)       (None, 512)               0         
                                                                 
 dense_26 (Dense)            (None, 256)              

In [168]:
# compiling the custom DL based model with loss calculated on the basis of categorical_crossentropy, with adam optimizer and used accuracy as metrics
model.compile(loss='categorical_crossentropy',
                 optimizer='adam', metrics=["accuracy"])

In [169]:
# shape
print("Training features:",x_train.shape)
print("Training labels:", y_train.shape)


Training features: (749, 3000)
Training labels: (749, 3)


In [170]:
!pip install transformers



In [171]:
# Importing hugging face transformer
from transformers import pipeline

In [172]:
data=df["Context"]

In [173]:
#importing "siebert/sentiment-roberta-large-english" from hugging face
classifier1 = pipeline("sentiment-analysis",
                         model="siebert/sentiment-roberta-large-english"
                        )

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at siebert/sentiment-roberta-large-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [174]:
# Extracting poem  from the dataset
df1=df['Context'].astype(str)

In [175]:
summarizer = pipeline("summarization")

No model was supplied, defaulted to t5-small and revision d769bba (https://huggingface.co/t5-small).
Using a pipeline without specifying a model name and revision in production is not recommended.
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [176]:
#summary  from hugging face
summarizer(df1[:5].values.tolist(),max_length=100)

Your max_length is set to 100, but you input_length is only 15. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Your max_length is set to 100, but you input_length is only 12. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_length is set to 100, but you input_length is only 15. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Your max_length is set to 100, but you input_length is only 35. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)
Your max_length is set to 100, but you input_length is only 14. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)


[{'summary_text': 'pale blue berries in these peaceful shades-- with pale blueberries . in these hues-- the berries are ripe and berry-like .'},
 {'summary_text': 'it flows so long as falls the rain . it flows . so long . as falls . the rain, the rain and the sand .'},
 {'summary_text': "the lonesome day, a day of london's worst ever, is a year of mourning . it's the first time we've had a great day ."},
 {'summary_text': 'when i peruse the conquered fame of heroes and the victories of mighty generals, i do not envy the generals .'},
 {'summary_text': 'of inward strife for truth and liberty . of truth and freedom . if you are a savior, please contact us on 0800 555 111 .'}]

In [177]:
# Text Generation from hugging face
generator = pipeline('text-generation', model = 'gpt2')

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [178]:
# Text generating for first 5 poem extracts
generator(df1[:5].values.tolist(), max_length = 100, num_return_sequences=3)


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


[[{'generated_text': 'with pale blue berries. in these peaceful shades--as in the woods. There are two types of berries on each side of the stem--the white "brysey" and the red "bryseye." The white "bryseye" is almost a black-and-white kind of berries. The red "bryseye" is the most unusual kind in the picture. To be perfectly clear of color all berries are blue.\n\n\nThe white "brysey'},
  {'generated_text': 'with pale blue berries. in these peaceful shades--these are the first fruits of our love to flowers, the first time we really began to love these flowers.\n\n\nHAPPENING TO A GOOD LOSS OF TIME\n\nIt is the good lost sleep which brings a new good.\n\n\nHUMAN WASHING IN COLD\n\nThe night before breakfast, your skin and clothes dry to the skin. This will bring a new feeling. And the new feeling will come from'},
  {'generated_text': 'with pale blue berries. in these peaceful shades--the first hint of light, which had faded by night; a blueish grayness. Then the last of the green, the

In [179]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [180]:
# importing vader library
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\chira\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [181]:
df

Unnamed: 0.1,Unnamed: 0,Context,Review
0,0,with pale blue berries. in these peaceful shad...,1
1,1,"it flows so long as falls the rain,",0
2,2,"and that is why, the lonesome day,",-1
3,3,"when i peruse the conquered fame of heroes, an...",2
4,4,of inward strife for truth and liberty.,2
...,...,...,...
744,887,to his ears there came a murmur of far seas be...,0
745,888,"the one good man in the world who knows me, --",1
746,889,faint voices lifted shrill with pain,-1
747,890,"an', fust you knowed on, back come charles the...",0


df1

In [182]:
#measuring polarity and sentiment analysis
analyzer = SentimentIntensityAnalyzer()

df['compound'] = [analyzer.polarity_scores(x)['compound'] for x in df['Context']]
df['neg'] = [analyzer.polarity_scores(x)['neg'] for x in df['Context']]
df['neu'] = [analyzer.polarity_scores(x)['neu'] for x in df['Context']]
df['pos'] = [analyzer.polarity_scores(x)['pos'] for x in df['Context']]

In [183]:
df

Unnamed: 0.1,Unnamed: 0,Context,Review,compound,neg,neu,pos
0,0,with pale blue berries. in these peaceful shad...,1,0.4939,0.000,0.686,0.314
1,1,"it flows so long as falls the rain,",0,0.0000,0.000,1.000,0.000
2,2,"and that is why, the lonesome day,",-1,-0.3612,0.294,0.706,0.000
3,3,"when i peruse the conquered fame of heroes, an...",2,0.7914,0.000,0.652,0.348
4,4,of inward strife for truth and liberty.,2,0.6908,0.000,0.467,0.533
...,...,...,...,...,...,...,...
744,887,to his ears there came a murmur of far seas be...,0,0.0000,0.000,1.000,0.000
745,888,"the one good man in the world who knows me, --",1,0.4404,0.000,0.775,0.225
746,889,faint voices lifted shrill with pain,-1,-0.5106,0.398,0.602,0.000
747,890,"an', fust you knowed on, back come charles the...",0,0.0000,0.000,1.000,0.000
