In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
reviews = ['nice food', 'amazing restaurant', 
           'too good', 'just loved it', 'will go again', 
           'horrible food', 'never go there', 
           'poor service', 'poor quality', 'needs improvement']
sentiment = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

In [2]:
one_hot("amazing restaurant amazing", 30) #one hot numbers less than 30

[26, 9, 26]

In [3]:
one_hot("amazing restaurant and amazing food", 3)

[2, 2, 1, 2, 1]

In [4]:
one_hot("amazing restaurant", 300) #one hot numbers less than 300

[286, 44]

In [5]:
vocab_size = 30 #one hot numbers less than 30
encoded_reviews = [one_hot(d, vocab_size) for d in reviews]
encoded_reviews

[[26, 26],
 [26, 9],
 [3, 17],
 [17, 4, 7],
 [7, 12, 13],
 [17, 26],
 [22, 12, 23],
 [12, 3],
 [12, 5],
 [9, 6]]

In [6]:
max_length = 4
padded_reviews = pad_sequences(encoded_reviews, maxlen = max_length, padding = 'post') #post - pad zeros to the end
padded_reviews

array([[26, 26,  0,  0],
       [26,  9,  0,  0],
       [ 3, 17,  0,  0],
       [17,  4,  7,  0],
       [ 7, 12, 13,  0],
       [17, 26,  0,  0],
       [22, 12, 23,  0],
       [12,  3,  0,  0],
       [12,  5,  0,  0],
       [ 9,  6,  0,  0]], dtype=int32)

In [7]:
embedded_vector_size = 5 #output dimension
model = Sequential()
model.add(Embedding(vocab_size, embedded_vector_size,  
                    input_length=max_length, name='embedding')) #vocab size - input size
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [8]:
X = padded_reviews
y = sentiment

In [9]:
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics = ['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 5)              150       
                                                                 
 flatten (Flatten)           (None, 20)                0         
                                                                 
 dense (Dense)               (None, 1)                 21        
                                                                 
Total params: 171
Trainable params: 171
Non-trainable params: 0
_________________________________________________________________


In [10]:
model.fit(X, y, epochs=50, verbose=0)

<keras.callbacks.History at 0x7f89d22d88d0>

In [11]:
loss, accuracy = model.evaluate(X, y)
accuracy



0.8999999761581421

In [12]:
loss

0.6268743276596069

In [13]:
weights = model.get_layer('embedding').get_weights()[0]
len(weights) #total words in the corpus

30

In [14]:
weights #5*30 matrix

array([[-0.0868435 , -0.0998238 ,  0.08845211,  0.03542681,  0.03517019],
       [ 0.0131984 , -0.04672587, -0.026042  ,  0.01290312,  0.02676788],
       [-0.01046201,  0.04826674, -0.00739669,  0.0458935 , -0.02784307],
       [ 0.02325351, -0.0189191 ,  0.07207242,  0.07978566, -0.09734825],
       [ 0.08007216,  0.08027931, -0.04450344, -0.03276031,  0.04552829],
       [-0.02127624, -0.06127708,  0.10022007,  0.06151345, -0.05877593],
       [-0.09353881, -0.03128747,  0.09942093,  0.0824604 , -0.05059745],
       [ 0.09316539,  0.07869239, -0.06988513, -0.00731696, -0.05857187],
       [ 0.02429447,  0.03064456,  0.0152407 ,  0.0332881 ,  0.01918376],
       [ 0.023996  ,  0.0884195 , -0.05518988, -0.00040303,  0.03302315],
       [ 0.02559003,  0.01348828,  0.01643496, -0.0160083 ,  0.00678672],
       [-0.0299291 ,  0.04839721,  0.01192177, -0.02288866, -0.03487744],
       [-0.05907712, -0.06621875,  0.08762579,  0.0834493 , -0.00673153],
       [ 0.06446045,  0.07058205, -0.0

In [15]:
weights[8]

array([0.02429447, 0.03064456, 0.0152407 , 0.0332881 , 0.01918376],
      dtype=float32)

## Keras-tokenizer

In [16]:
from keras.preprocessing.text import Tokenizer

In [17]:
t = Tokenizer()
fit_text = ['Machine Learning Knowledge', 'Machine Learning', 'Deep Learning', 'Artificial Intelligence']
t.fit_on_texts(fit_text)

In [18]:
print("The document count", t.document_count) #prints no. of documents present in the corpus

The document count 4


In [19]:
print("The count of words", t.word_counts) #prints no. of times words in the corpus

The count of words OrderedDict([('machine', 2), ('learning', 3), ('knowledge', 1), ('deep', 1), ('artificial', 1), ('intelligence', 1)])


In [20]:
print("The word index", t.word_index) #assigns a unique index to each word present in the corpus

The word index {'learning': 1, 'machine': 2, 'knowledge': 3, 'deep': 4, 'artificial': 5, 'intelligence': 6}


In [21]:
print("The word docs", t.word_docs) #tells how in many documents each of the words appear

The word docs defaultdict(<class 'int'>, {'learning': 3, 'knowledge': 1, 'machine': 2, 'deep': 1, 'intelligence': 1, 'artificial': 1})


In [22]:
t = Tokenizer()
fit_text = 'Machine Learning'
t.fit_on_texts(fit_text)
print("Count of characters: ", t.word_counts)
print("Length of text: ", t.document_count)
print("Character index: ", t.word_index)
print("Frequency of characters: ", t.word_docs)

Count of characters:  OrderedDict([('m', 1), ('a', 2), ('c', 1), ('h', 1), ('i', 2), ('n', 3), ('e', 2), ('l', 1), ('r', 1), ('g', 1)])
Length of text:  16
Character index:  {'n': 1, 'a': 2, 'i': 3, 'e': 4, 'm': 5, 'c': 6, 'h': 7, 'l': 8, 'r': 9, 'g': 10}
Frequency of characters:  defaultdict(<class 'int'>, {'m': 1, 'a': 2, 'c': 1, 'h': 1, 'i': 2, 'n': 3, 'e': 2, 'l': 1, 'r': 1, 'g': 1})


### text_to_sequences



In [23]:
#assigns integers to words
t = Tokenizer()
test_text = ["Machine Learning Knowledge", "Machine Learning", "Deep Learning", "Artificial Intelligence"] #document list
t.fit_on_texts(test_text)
sequences = t.texts_to_sequences(test_text)
print("The sequences generated from text are: ", sequences)

The sequences generated from text are:  [[2, 1, 3], [2, 1], [4, 1], [5, 6]]


In [24]:
#assigns integer to characters; 'e' --> 4
t = Tokenizer()
test_text = "Machine Learning" #string
t.fit_on_texts(test_text)
sequences = t.texts_to_sequences(test_text)
print("The sequences generated from text are: ", sequences)

The sequences generated from text are:  [[5], [2], [6], [7], [3], [1], [4], [], [8], [4], [2], [9], [1], [3], [1], [10]]


In [25]:
docs = ['Marvellous Machine Learning Marvellous Machine Learning',
        'Artificial Amazing Intelligence',
        'Dazzling Deep Learning',
        'Champion Computer Vision',
        'Notorious Natural Language Processing Notorious Natural Language Processing']
t = Tokenizer()
t.fit_on_texts(docs)
encoded_docs = t.texts_to_matrix(docs, mode = 'freq') #binary - 0s and 1s, count - count of words, tfidf - tf*idf value, 
                                                      #freq - ratio of the word with all other words in the corpus
print(encoded_docs) #matrix of unique words in the document

[[0.         0.33333333 0.33333333 0.33333333 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.33333333 0.33333333 0.33333333 0.
  0.         0.         0.         0.        ]
 [0.         0.33333333 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.33333333
  0.33333333 0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.33333333 0.33333333 0.33333333]
 [0.         0.         0.         0.         0.25       0.25
  0.25       0.25       0.         0.         0.         0.
  0.         0.         0.         0.        ]]


## Keras_tokenizer_text_classification

In [60]:
import keras
import numpy as np
from keras.datasets import reuters
import tensorflow as tf

In [61]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words = None, test_split = 0.2)

In [62]:
x_test

array([list([1, 4, 1378, 2025, 9, 697, 4622, 111, 8, 25, 109, 29, 3650, 11, 150, 244, 364, 33, 30, 30, 1398, 333, 6, 18292, 159, 9, 1084, 363, 13, 19231, 71, 9, 16273, 71, 117, 4, 225, 78, 206, 10, 9, 1214, 8, 4, 270, 5, 16273, 7, 748, 48, 9, 19231, 7, 207, 1451, 966, 1864, 793, 97, 133, 336, 7, 4, 493, 98, 273, 104, 284, 25, 39, 338, 22, 905, 220, 3465, 644, 59, 20, 6, 119, 61, 11, 15, 58, 579, 26, 10, 67, 7, 4, 738, 98, 43, 88, 333, 722, 12, 20, 6, 19, 746, 35, 15, 10, 9, 1214, 855, 129, 783, 21, 4, 2280, 244, 364, 51, 16, 299, 452, 16, 515, 4, 99, 29, 5, 4, 364, 281, 48, 10, 9, 1214, 23, 644, 47, 20, 324, 27, 56, 23406, 28185, 5, 192, 510, 17, 12]),
       list([1, 2768, 283, 122, 7, 4, 89, 544, 463, 29, 798, 748, 40, 85, 306, 28, 19, 59, 11, 82, 84, 22, 10, 1315, 19, 12, 11, 82, 52, 29, 283, 1135, 558, 13086, 265, 27151, 6607, 8, 6607, 118, 371, 10, 1503, 281, 4, 143, 4811, 760, 50, 2088, 225, 139, 683, 4, 48, 193, 862, 41, 967, 1999, 30, 1086, 36, 8, 28, 602, 19, 32, 11, 82, 5, 4,

In [63]:
print('# of Training Samples: {}'.format(len(x_train)))
print('# of Test Samples: {}'.format(len(x_test)))

# of Training Samples: 8982
# of Test Samples: 2246


In [64]:
num_classes = max(y_train)+1
print('# of classes: {}'.format(num_classes))

# of classes: 46


In [65]:
from keras.preprocessing.text import Tokenizer
max_words = 10000

In [66]:
tokenizer = Tokenizer(num_words = max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode = 'binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode = 'binary')

In [67]:
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)

In [68]:
from  keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

In [69]:
model = Sequential()
model.add(Dense(512, input_shape = (max_words, )))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [70]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [71]:
batch_size = 32
epochs = 2

In [72]:
history = model.fit(x_train, y_train, batch_size = batch_size, epochs = 10, verbose = 1, validation_split = 0.1)
score = model.evaluate(x_test, y_test, batch_size = batch_size, verbose = 1)
print("Test loss - ", score[0])
print("Accuracy - ", score[1])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss -  1.054833173751831
Accuracy -  0.8005343079566956


###count

In [73]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words = None, test_split = 0.2)

In [74]:
from keras.preprocessing.text import Tokenizer
max_words = 10000

In [75]:
tokenizer = Tokenizer(num_words = max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode = 'count')
x_test = tokenizer.sequences_to_matrix(x_test, mode = 'count')

In [76]:
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)

In [77]:
model = Sequential()
model.add(Dense(512, input_shape = (max_words, )))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [78]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [79]:
batch_size = 32
epochs = 2

In [80]:
history = model.fit(x_train, y_train, batch_size = batch_size, epochs = 10, verbose = 1, validation_split = 0.1)
score = model.evaluate(x_test, y_test, batch_size = batch_size, verbose = 1)
print("Test loss - ", score[0])
print("Accuracy - ", score[1])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss -  1.1508889198303223
Accuracy -  0.7987533211708069


###tf-idf

In [88]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words = None, test_split = 0.2)

In [89]:
from keras.preprocessing.text import Tokenizer
max_words = 10000

In [93]:
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_sequences(x_train)
x_train = tokenizer.sequences_to_matrix(x_train, mode = 'tfidf')
x_test = tokenizer.sequences_to_matrix(x_test, mode = 'tfidf')

In [94]:
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)

In [95]:
model = Sequential()
model.add(Dense(512, input_shape = (max_words, )))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [96]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [97]:
batch_size = 32
epochs = 2

In [98]:
history = model.fit(x_train, y_train, batch_size = batch_size, epochs = 10, verbose = 1, validation_split = 0.1)
score = model.evaluate(x_test, y_test, batch_size = batch_size, verbose = 1)
print("Test loss - ", score[0])
print("Accuracy - ", score[1])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss -  1.3600172996520996
Accuracy -  0.7960819005966187


## Twitter sentiment analysis

In [114]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import re
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [115]:
import pandas as pd
train = pd.read_csv("/content/drive/My Drive/NLP/train_E6oV3lV.csv")
test = pd.read_csv("/content/drive/My Drive/NLP/test_tweets_anuFYb8.csv")

In [116]:
stopwords = set(stopwords.words('english'))
stop = [x.lower() for x in stopwords]
lemma = WordNetLemmatizer()

In [117]:
shortcuts = {'u': 'you', 'y': 'why', 'r': 'are', 'doin': 'doing', 'hw': 'how', 'k': 'okay', 'm': 'am', 'b4': 'before',
'idc': "i do not care", 'ty': 'thankyou', 'wlcm': 'welcome', 'bc': 'because', '<3': 'love', 'xoxo': 'love',
'ttyl': 'talk to you later', 'gr8': 'great', 'bday': 'birthday', 'awsm': 'awesome', 'gud': 'good', 'h8': 'hate',
'lv': 'love', 'dm': 'direct message', 'rt': 'retweet', 'wtf': 'hate', 'idgaf': 'hate',
'irl': 'in real life', 'yolo': 'you only live once'}

In [118]:
def clean(text):
  text = text.lower()
  # keep alphanumeric characters only
  text = re.sub('\W+', ' ', text).strip()
  text = text.replace('user', '')
  # tokenize
  text_token = word_tokenize(text)
  # replace shortcuts using dict
  full_words = []
  for token in text_token:
    if token in shortcuts.keys():
      token = shortcuts[token]
    full_words.append(token)
  words_alpha = [re.sub(r'\d+', '', word) for word in full_words]
  words_big = [word for word in words_alpha if len(word)>2]
  stemmed_words = [lemma.lemmatize(word) for word in words_big]
  # join list elements to string
  clean_text = " ".join(stemmed_words)
  clean_text = clean_text.replace(' ', ' ')
  clean_text = clean_text.replace(' ', ' ')
  return clean_text

In [119]:
X_train = train.tweet
y = train.label
X_test = test.tweet

In [120]:
clean_Xtrain = X_train.apply(lambda x: clean(x))
clean_Xtest = X_test.apply(lambda x: clean(x))

In [122]:
tokenizer = Tokenizer(num_words = max_words)
X_train = tokenizer.sequences_to_matrix(X_train, mode = 'binary')
X_test = tokenizer.sequences_to_matrix(X_test, mode = 'binary')

TypeError: ignored

In [None]:
s