In [1]:
! git clone https://github.com/laxmimerit/twitter-data.git

Cloning into 'twitter-data'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 12 (delta 2), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (12/12), done.


In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Activation, Dropout, Conv1D, GlobalMaxPooling1D, MaxPooling1D
import numpy as np
import pandas as pd
from numpy import array
from sklearn.model_selection import train_test_split
import itertools 

In [3]:
# Load the tweet CSV from the repository cloned in the first cell.
# NOTE(review): the absolute /content/... path presumably assumes the notebook
# runs in Google Colab's default working directory — confirm before running elsewhere.
df = pd.read_csv('/content/twitter-data/twitter4000.csv')

In [4]:
df.head()

Unnamed: 0,twitts,sentiment
0,is bored and wants to watch a movie any sugge...,0
1,back in miami. waiting to unboard ship,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0
3,ughhh i am so tired blahhhhhhhhh,0
4,@mandagoforth me bad! It's funny though. Zacha...,0


In [5]:
df['sentiment'].value_counts()

1    2000
0    2000
Name: sentiment, dtype: int64

In [7]:
# Pull the raw tweet strings out of the frame as a plain Python list and
# preview the first ten entries.
text = df['twitts'].tolist()
text[:10]

['is bored and wants to watch a movie  any suggestions?',
 'back in miami.  waiting to unboard ship ',
 "@misskpey awwww dnt dis brng bak memoriessss,  I thnk I'm sad. LoL",
 'ughhh i am so tired  blahhhhhhhhh',
 "@mandagoforth me bad! It's funny though. Zachary Quinto is only there for a few though.  &amp; to reply just put the @ symbol before the name!",
 "brr, i'm so cold. at the moment doing my assignment on Huntington's Disease, which is really depressing ",
 "@kevinmarquis haha yep but i really need to sleep, i feel like crap lol cant sleep when he's away  god i'm pathetic!",
 "eating some ice-cream while I try to see @peterfacinelli's followers numbre raise...not working sadly ",
 '@phatty84 just hella bored at work  lol',
 'Food poisoning blowssss ']

In [8]:
# Target labels: the 0/1 values of the 'sentiment' column.
y = df['sentiment']

Now we will use the Tokenizer() class to convert the data from text to numbers. This class lets us vectorize a text corpus by turning each text into either a sequence of integers (each integer being the index of a token in a dictionary) or a vector where the coefficient for each token can be binary, based on word count, or based on tf-idf.

In [11]:
# Build the word <-> integer vocabulary from the tweets.
# NOTE(review): the tokenizer is fitted on the whole corpus, i.e. before the
# train/test split performed later, so words that only occur in test tweets
# also end up in the vocabulary.
token  = Tokenizer()
token.fit_on_texts(text)
token

<keras_preprocessing.text.Tokenizer at 0x7f72feef9690>

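index_word is an index -> word dictionary (word_index is the reverse, word -> index, mapping), so every word gets a unique integer value. The indices start from 1 because 0 is reserved for padding, so we add 1 to the number of words to get the vocab_size. vocab_size is therefore the total number of unique words in our dataset plus one for the padding index.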

In [13]:
# index_word maps integer index -> word; the displayed dictionary starts at key 1.
# NOTE(review): this displays the entire vocabulary; consider previewing only
# a slice to keep the notebook output short.
vocab = token.index_word
vocab

{1: 'i',
 2: 'to',
 3: 'the',
 4: 'a',
 5: 'my',
 6: 'and',
 7: 'you',
 8: 'is',
 9: 'it',
 10: 'in',
 11: 'for',
 12: 'of',
 13: 'me',
 14: 'on',
 15: 'so',
 16: 'that',
 17: "i'm",
 18: 'have',
 19: 'at',
 20: 'but',
 21: 'just',
 22: 'was',
 23: 'with',
 24: 'not',
 25: 'be',
 26: 'this',
 27: 'day',
 28: 'up',
 29: 'now',
 30: 'good',
 31: 'all',
 32: 'get',
 33: 'out',
 34: 'go',
 35: 'no',
 36: 'http',
 37: 'today',
 38: 'like',
 39: 'are',
 40: 'love',
 41: 'your',
 42: 'quot',
 43: 'too',
 44: 'lol',
 45: 'work',
 46: 'got',
 47: "it's",
 48: 'amp',
 49: 'do',
 50: 'com',
 51: 'u',
 52: 'back',
 53: 'going',
 54: 'what',
 55: 'time',
 56: 'from',
 57: 'had',
 58: 'will',
 59: 'know',
 60: 'about',
 61: 'im',
 62: 'am',
 63: "don't",
 64: 'can',
 65: 'one',
 66: 'really',
 67: "can't",
 68: 'we',
 69: 'oh',
 70: 'well',
 71: 'still',
 72: '2',
 73: 'some',
 74: 'its',
 75: 'miss',
 76: 'want',
 77: 'see',
 78: 'when',
 79: 'home',
 80: 'think',
 81: 'an',
 82: 'as',
 83: 'if',
 

In [25]:
vocab_size = len(token.word_index) + 1 # word indices start at 1; the extra slot is index 0, the value used for padding

In [26]:
vocab_size

10135

In [16]:
x = ['i to the a and']

In [17]:
token.texts_to_sequences(x)

[[1, 2, 3, 4, 6]]

In [18]:
# Replace every tweet by the list of integer indices of its words.
encoded_text = token.texts_to_sequences(text)

In [19]:
encoded_text

[[8, 304, 6, 345, 2, 191, 4, 236, 254, 3079],
 [52, 10, 1019, 206, 2, 3080, 3081],
 [3082, 1197, 668, 1955, 3083, 1956, 3084, 1, 3085, 17, 115, 44],
 [1957, 1, 62, 15, 192, 3086],
 [3087,
  13,
  113,
  47,
  328,
  136,
  3088,
  3089,
  8,
  101,
  88,
  11,
  4,
  285,
  136,
  48,
  2,
  448,
  21,
  277,
  3,
  3090,
  218,
  3,
  449],
 [3091,
  17,
  15,
  315,
  19,
  3,
  892,
  164,
  5,
  1459,
  14,
  3092,
  3093,
  386,
  8,
  66,
  1460],
 [3094,
  110,
  366,
  20,
  1,
  66,
  85,
  2,
  108,
  1,
  117,
  38,
  536,
  44,
  182,
  108,
  78,
  346,
  207,
  305,
  17,
  3095],
 [450, 73, 537, 569, 295, 1, 316, 2, 77, 3096, 367, 3097, 1461, 24, 187, 893],
 [3098, 21, 1958, 304, 19, 45, 44],
 [409, 3099, 3100],
 [3101, 132, 609, 79, 3, 193, 368, 17, 131, 3, 158, 199],
 [3102, 127, 1, 139, 226, 2, 1020, 9, 29, 1, 222, 74, 55, 2, 3103, 16, 3104],
 [67, 894, 423],
 [1959,
  119,
  52,
  56,
  211,
  159,
  387,
  669,
  48,
  68,
  255,
  1462,
  3,
  3105,
  71,
  570,
  

In [20]:
print(encoded_text[:30])

[[8, 304, 6, 345, 2, 191, 4, 236, 254, 3079], [52, 10, 1019, 206, 2, 3080, 3081], [3082, 1197, 668, 1955, 3083, 1956, 3084, 1, 3085, 17, 115, 44], [1957, 1, 62, 15, 192, 3086], [3087, 13, 113, 47, 328, 136, 3088, 3089, 8, 101, 88, 11, 4, 285, 136, 48, 2, 448, 21, 277, 3, 3090, 218, 3, 449], [3091, 17, 15, 315, 19, 3, 892, 164, 5, 1459, 14, 3092, 3093, 386, 8, 66, 1460], [3094, 110, 366, 20, 1, 66, 85, 2, 108, 1, 117, 38, 536, 44, 182, 108, 78, 346, 207, 305, 17, 3095], [450, 73, 537, 569, 295, 1, 316, 2, 77, 3096, 367, 3097, 1461, 24, 187, 893], [3098, 21, 1958, 304, 19, 45, 44], [409, 3099, 3100], [3101, 132, 609, 79, 3, 193, 368, 17, 131, 3, 158, 199], [3102, 127, 1, 139, 226, 2, 1020, 9, 29, 1, 222, 74, 55, 2, 3103, 16, 3104], [67, 894, 423], [1959, 119, 52, 56, 211, 159, 387, 669, 48, 68, 255, 1462, 3, 3105, 71, 570, 5, 1959, 329], [1960, 3106, 3107, 46, 3108, 3109], [3110, 1463, 70, 19, 227, 17, 28, 2], [3111, 1, 245, 212, 1961, 51, 72, 36, 146, 246, 3112, 1, 538, 20, 74, 507, 196

We can see that the encoded tweets have different lengths. All encoded tweets must have the same length before they are fed to the neural network. Hence we use pad_sequences, which appends zeros to tweets shorter than 120 tokens (longer ones are truncated to 120).

In [21]:
# Common length that every encoded tweet is brought to.
max_length = 120
# padding='post' puts the zeros after the word indices, as the printed rows show.
X = pad_sequences(encoded_text, maxlen=max_length, padding='post')
print(X)

[[    8   304     6 ...     0     0     0]
 [   52    10  1019 ...     0     0     0]
 [ 3082  1197   668 ...     0     0     0]
 ...
 [ 1033    21  1021 ...     0     0     0]
 [10134   134     7 ...     0     0     0]
 [   94    11   226 ...     0     0     0]]


In [22]:
X.shape

(4000, 120)

In [23]:
# Hold out 20% of the tweets for evaluation; stratifying on the labels keeps
# the class proportions the same in both partitions, and the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

GlobalMaxPooling1D() downsamples the input representation by taking the maximum value over the time dimension.

In [29]:
vec_size = 300  # dimensionality of each learned word vector

# 1D-CNN sentiment classifier, declared as a single layer list:
# embedding -> convolution block -> dense layers -> global max pooling -> sigmoid.
model = Sequential([
    Embedding(vocab_size, vec_size, input_length=max_length),

    # Convolution block: 64 filters with a kernel size of 8.
    Conv1D(64, 8, activation='relu'),
    MaxPooling1D(2),
    Dropout(0.2),

    # Dense layers placed before the global pooling step.
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(16, activation='relu'),

    # Take the maximum over the time dimension.
    GlobalMaxPooling1D(),

    # Single sigmoid unit for the binary sentiment label.
    Dense(1, activation='sigmoid'),
])

In [30]:
# Binary cross-entropy matches the single sigmoid output unit; accuracy is
# tracked as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [31]:
%%time
# Train for 5 epochs and time the whole cell.
# NOTE(review): the held-out test split is also passed as validation_data, so
# the validation metrics reported during training are not an independent
# estimate separate from the test set.
model.fit(X_train, y_train, epochs = 5, validation_data = (X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 2min 5s, sys: 3.28 s, total: 2min 8s
Wall time: 1min 22s


<tensorflow.python.keras.callbacks.History at 0x7f72fa287290>

In [32]:
def get_encoded(x):
    """Convert raw text into the padded integer matrix the model is fed with.

    Parameters
    ----------
    x : str or list of str
        One text or a list of texts. A bare string is wrapped in a list
        first; passed as-is it would be iterated character by character and
        produce one sequence per character instead of one per text.

    Returns
    -------
    2-D integer array with one row per text and ``max_length`` columns,
    zero-padded at the end of each row.
    """
    if isinstance(x, str):
        x = [x]
    # Same fitted tokenizer and padding settings as used for the training data.
    sequences = token.texts_to_sequences(x)
    return pad_sequences(sequences, maxlen=max_length, padding='post')

x = ['worst services. will not come again']
# Sequential.predict_classes was removed in TensorFlow 2.6. Thresholding the
# sigmoid probability at 0.5 gives the same int32 class labels and works on
# both older and newer versions.
(model.predict(get_encoded(x)) > 0.5).astype('int32')



array([[0]], dtype=int32)

In [34]:
x = ['thank you for watching']
# Sequential.predict_classes was removed in TensorFlow 2.6. Thresholding the
# sigmoid probability at 0.5 gives the same int32 class labels and works on
# both older and newer versions.
(model.predict(get_encoded(x)) > 0.5).astype('int32')



array([[1]], dtype=int32)