In [2]:
#https://www.kaggle.com/gferna/notes-on-sentiment-analysis-with-tensorflow-keras

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## data preprocessing on tensorflow

In [3]:
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?'
]

In [4]:
tokenizer=Tokenizer(num_words=100)

In [5]:
tokenizer.fit_on_texts(sentences)

In [6]:
word_to_index=tokenizer.word_index
print(word_to_index)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}


In [7]:
# Tokenizer turn the sentences into a set of sequences.
sequences=tokenizer.texts_to_sequences(sentences)
print(sequences)

[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


- Tokenizer encodes text, with his dictionary build by fit_on_text method, in integer lists, where tokens replace words. If there are words not learned by tokenizer it remove them out, because for it they are meaningless

In [8]:
test_text=['I really love my dog','My dog loves my manatee']
test_seq=tokenizer.texts_to_sequences(test_text)
print(test_text)
print(test_seq)

['I really love my dog', 'My dog loves my manatee']
[[4, 2, 1, 3], [1, 3, 1]]


- To avoid removing some unseen words in the sequence:

-  take a lot of training data (large corpus text) to get a broad vocabulary
put a special value in the vocabulary to use when an unseen word is encountered with the property oov_token in the tokenizer constructor.

In [9]:
tokenizer = Tokenizer(num_words=100,oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_to_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)
test_seq = tokenizer.texts_to_sequences(test_text)

In [10]:
print(word_to_index)
print(test_seq)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]


## padding

-  When we fed some texts into the neuronal network, we need them to be uniform in size. This is done with pad_sequences once the tokenizer has created the sequence. Sentences get padded to the length of the longest sequence by adding zeros to the beginning of shorter ones by default.

In [12]:
padded = pad_sequences(sequences)

In [13]:
print('\nDictionary: ',word_to_index)
print('\n',sequences)
print('\n',padded)


Dictionary:  {'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}

 [[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

 [[ 0  0  0  5  3  2  4]
 [ 0  0  0  5  3  2  7]
 [ 0  0  0  6  3  2  4]
 [ 8  6  9  2  4 10 11]]


-  Now the list of sentences is a Matrix, with the column width equal the longest sentence. You can set padding after the sentences and set a maximum length for the sentences with maxlen parameter. If the sentence is longer than the maxlen it gets truncated, so you have to set from where with truncating parameter (default is pre)

In [14]:
padded = pad_sequences(sequences, padding='post',maxlen=5,truncating='post')
print(padded)

[[5 3 2 4 0]
 [5 3 2 7 0]
 [6 3 2 4 0]
 [8 6 9 2 4]]


# DataRead

In [15]:
import pandas as pd
import numpy as np

In [16]:
df_train = pd.read_csv('D:/Tensorflow2_nlp/sentimentAnalysis/train.tsv', sep='\t')
df_test = pd.read_csv('D:/Tensorflow2_nlp/sentimentAnalysis/test.tsv', sep='\t')

In [17]:
df_train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [18]:
sentences = df_train['Phrase']
label =df_train['SentenceId']
print('number of sentences: ',len(sentences))
print(sentences[3])
print(label[3])

number of sentences:  156060
A
1


##  We split the sentences in training and testing data shuffling them.

In [19]:
idx = np.arange(len(sentences))

In [20]:
np.random.shuffle(idx)

In [21]:
sentences = sentences.iloc[idx]

In [22]:
label = label.iloc[idx]

In [23]:
sentences.reset_index(drop=True,inplace=True)

In [24]:
label.reset_index(drop=True,inplace=True)

In [25]:
print(sentences[3])
print(label[3])

artistically inept
3330


- this should help you to decide whether to use STOP WORDS or not.
- This part of code is just great analytical tool

In [26]:
stop_word = set(stopwords.words('english')) 
word_vectorizer = CountVectorizer(ngram_range=(1,1), analyzer='word', min_df=0.001)
sparse_matrix = word_vectorizer.fit_transform(df_test['Phrase'])
frequencies = sum(sparse_matrix).toarray()[0]
freq = pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency'])
freq.sort_values('frequency', ascending=False)
print("ok")

NameError: name 'stopwords' is not defined

## Visualization of data set

In [None]:
a = df_train.Sentiment.value_counts()
a = pd.DataFrame(a)
a['Rating'] = a.index
sns.set_style("darkgrid", {"axes.facecolor": ".9"})
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(y='Sentiment', x='Rating', data=a)

# Data Preprocessing 

In [None]:
#we make text lower case and leave only letters from a-z and digits
df_train['Phrase'] = df_train['Phrase'].str.lower()
df_train['Phrase'] = df_train['Phrase'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
df_test['Phrase'] = df_test['Phrase'].str.lower()
df_test['Phrase'] = df_test['Phrase'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

In [None]:
train_feature=df_train['Phrase']
label=df_train['Sentiment']
test_data=df_test['Phrase']
testdt=test_data


In [None]:
type(train_feature)

In [None]:
tokenize = Tokenizer()
X_train = tokenize.texts_to_sequences(df_train['Phrase'].values)
X_test = tokenize.texts_to_sequences(df_test['Phrase'].values)

In [None]:
import tensorflow as tf
train_data1=tf.keras.preprocessing.sequence.pad_sequences(
    X_train.split(),
    maxlen=None,
    dtype='int32',
    padding='pre',
    truncating='pre',
    value=0.0
)


In [None]:
train_data1[1]