In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
# keras is a high level API that makes it easy to configure neural networks
from tensorflow import keras

In [None]:
df = pd.read_csv(r"https://raw.githubusercontent.com/JonathanBechtel/DAT-10-19/main/ClassMaterial/Unit4/data/IMDB.csv")

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
df['review'] = df['review'].str.replace('<br />', '')

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
df['sentiment'] = np.where(df['sentiment'] == 'positive', 1, 0)

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size = 0.2, random_state = 42)

In [None]:
tokenizer = keras.preprocessing.text.Tokenizer(num_words = 10000)

In [None]:
tokenizer.fit_on_texts(X_train)

In [None]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'in': 7,
 'it': 8,
 'i': 9,
 'this': 10,
 'that': 11,
 'was': 12,
 'as': 13,
 'for': 14,
 'with': 15,
 'movie': 16,
 'but': 17,
 'film': 18,
 'on': 19,
 'not': 20,
 'you': 21,
 'are': 22,
 'his': 23,
 'have': 24,
 'be': 25,
 'one': 26,
 'he': 27,
 'all': 28,
 'at': 29,
 'by': 30,
 'an': 31,
 'they': 32,
 'so': 33,
 'from': 34,
 'who': 35,
 'like': 36,
 'or': 37,
 'just': 38,
 'her': 39,
 'out': 40,
 'about': 41,
 'if': 42,
 "it's": 43,
 'has': 44,
 'there': 45,
 'some': 46,
 'what': 47,
 'good': 48,
 'more': 49,
 'very': 50,
 'when': 51,
 'up': 52,
 'no': 53,
 'time': 54,
 'my': 55,
 'even': 56,
 'would': 57,
 'she': 58,
 'which': 59,
 'only': 60,
 'story': 61,
 'really': 62,
 'see': 63,
 'their': 64,
 'had': 65,
 'can': 66,
 'me': 67,
 'well': 68,
 'were': 69,
 'we': 70,
 'than': 71,
 'much': 72,
 'bad': 73,
 'been': 74,
 'do': 75,
 'get': 76,
 'great': 77,
 'also': 78,
 'will': 79,
 'other': 80,
 'into': 81,
 'people': 82,

In [None]:
X_train = tokenizer.texts_to_sequences(X_train)

In [None]:
X_train

[[196,
  47,
  9,
  799,
  2157,
  533,
  301,
  1,
  106,
  1903,
  2035,
  4321,
  6543,
  2,
  811,
  4841,
  11,
  1,
  226,
  1,
  6186,
  78,
  781,
  52,
  51,
  21,
  100,
  4,
  1,
  26,
  2008,
  101,
  35,
  24,
  33,
  118,
  1128,
  11,
  8,
  6,
  2365,
  1189,
  5,
  453,
  47,
  551,
  5,
  91,
  32,
  22,
  38,
  915,
  398,
  14,
  1,
  165,
  5,
  2992,
  23,
  4356,
  19,
  3,
  2944,
  11,
  44,
  74,
  220,
  72,
  125,
  7,
  80,
  3154,
  194,
  19,
  238,
  2,
  1,
  445,
  9,
  204,
  4979,
  143,
  20,
  62,
  26,
  14,
  73,
  364,
  301,
  3,
  18,
  17,
  8,
  204,
  25,
  305,
  11,
  13,
  1,
  114,
  436,
  2,
  13,
  1,
  1701,
  613,
  69,
  422,
  380,
  9,
  88,
  117,
  47,
  111,
  369,
  32,
  34,
  17,
  42,
  9,
  12,
  91,
  484,
  6824,
  14,
  3,
  363,
  1181,
  60,
  7,
  1,
  470,
  215,
  1015,
  5,
  4156,
  7,
  3,
  173,
  4,
  33,
  437,
  696,
  622,
  11,
  3747,
  236,
  110,
  847,
  34,
  169,
  29,
  218,
  196,
  1,
  427,
  3

In [None]:
X_train[0]

[196,
 47,
 9,
 799,
 2157,
 533,
 301,
 1,
 106,
 1903,
 2035,
 4321,
 6543,
 2,
 811,
 4841,
 11,
 1,
 226,
 1,
 6186,
 78,
 781,
 52,
 51,
 21,
 100,
 4,
 1,
 26,
 2008,
 101,
 35,
 24,
 33,
 118,
 1128,
 11,
 8,
 6,
 2365,
 1189,
 5,
 453,
 47,
 551,
 5,
 91,
 32,
 22,
 38,
 915,
 398,
 14,
 1,
 165,
 5,
 2992,
 23,
 4356,
 19,
 3,
 2944,
 11,
 44,
 74,
 220,
 72,
 125,
 7,
 80,
 3154,
 194,
 19,
 238,
 2,
 1,
 445,
 9,
 204,
 4979,
 143,
 20,
 62,
 26,
 14,
 73,
 364,
 301,
 3,
 18,
 17,
 8,
 204,
 25,
 305,
 11,
 13,
 1,
 114,
 436,
 2,
 13,
 1,
 1701,
 613,
 69,
 422,
 380,
 9,
 88,
 117,
 47,
 111,
 369,
 32,
 34,
 17,
 42,
 9,
 12,
 91,
 484,
 6824,
 14,
 3,
 363,
 1181,
 60,
 7,
 1,
 470,
 215,
 1015,
 5,
 4156,
 7,
 3,
 173,
 4,
 33,
 437,
 696,
 622,
 11,
 3747,
 236,
 110,
 847,
 34,
 169,
 29,
 218,
 196,
 1,
 427,
 366,
 54,
 3761,
 3,
 277,
 156,
 1705,
 186,
 6,
 1,
 726,
 1934,
 1,
 1196,
 4,
 2945,
 3748,
 1833,
 2,
 146,
 143,
 3,
 228,
 4,
 3,
 206,
 322,
 2,
 143,

In [None]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'in': 7,
 'it': 8,
 'i': 9,
 'this': 10,
 'that': 11,
 'was': 12,
 'as': 13,
 'for': 14,
 'with': 15,
 'movie': 16,
 'but': 17,
 'film': 18,
 'on': 19,
 'not': 20,
 'you': 21,
 'are': 22,
 'his': 23,
 'have': 24,
 'be': 25,
 'one': 26,
 'he': 27,
 'all': 28,
 'at': 29,
 'by': 30,
 'an': 31,
 'they': 32,
 'so': 33,
 'from': 34,
 'who': 35,
 'like': 36,
 'or': 37,
 'just': 38,
 'her': 39,
 'out': 40,
 'about': 41,
 'if': 42,
 "it's": 43,
 'has': 44,
 'there': 45,
 'some': 46,
 'what': 47,
 'good': 48,
 'more': 49,
 'very': 50,
 'when': 51,
 'up': 52,
 'no': 53,
 'time': 54,
 'my': 55,
 'even': 56,
 'would': 57,
 'she': 58,
 'which': 59,
 'only': 60,
 'story': 61,
 'really': 62,
 'see': 63,
 'their': 64,
 'had': 65,
 'can': 66,
 'me': 67,
 'well': 68,
 'were': 69,
 'we': 70,
 'than': 71,
 'much': 72,
 'bad': 73,
 'been': 74,
 'do': 75,
 'get': 76,
 'great': 77,
 'also': 78,
 'will': 79,
 'other': 80,
 'into': 81,
 'people': 82,

In [None]:
X_test = tokenizer.texts_to_sequences(X_test)

In [None]:
len(X_train[0])

369

In [None]:
len(X_train[1])

257

In [None]:
len(X_train[2])

88

In [None]:
len(X_train[3])

155

In [None]:
X_train = keras.preprocessing.sequence.pad_sequences(X_train, maxlen = 300)
X_test  = keras.preprocessing.sequence.pad_sequences(X_test, maxlen = 300)

In [None]:
X_train.shape

(40000, 300)

In [None]:
X_train[0]

array([   7,   80, 3154,  194,   19,  238,    2,    1,  445,    9,  204,
       4979,  143,   20,   62,   26,   14,   73,  364,  301,    3,   18,
         17,    8,  204,   25,  305,   11,   13,    1,  114,  436,    2,
         13,    1, 1701,  613,   69,  422,  380,    9,   88,  117,   47,
        111,  369,   32,   34,   17,   42,    9,   12,   91,  484, 6824,
         14,    3,  363, 1181,   60,    7,    1,  470,  215, 1015,    5,
       4156,    7,    3,  173,    4,   33,  437,  696,  622,   11, 3747,
        236,  110,  847,   34,  169,   29,  218,  196,    1,  427,  366,
         54, 3761,    3,  277,  156, 1705,  186,    6,    1,  726, 1934,
          1, 1196,    4, 2945, 3748, 1833,    2,  146,  143,    3,  228,
          4,    3,  206,  322,    2,  143, 1081,   15,   87,    4,  131,
       2867, 8738,   17,    9,  152, 9250,   98,    4,    1, 4018,  301,
         10,   16,  999,   34,    1,  495,  491, 2618,  248,   70,   76,
        106,  106,  697,   59,   85, 1045, 1361,   

In [None]:
X_train[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    9,
        115,   20,  102,    1,  438,   16,    9,   95,   20,  102,    1,
        438,   16,    9, 2302,    1,  265,   99,  145,   14,  303,   31,
        526,    2,    9, 1457,  248,  537,    4,  145,  519,    8,  541,
        519,  159,  647,    1, 3670,   40,    4,    1,  412,    9,   36,
       4003,   96,  194, 1651,    2,  756,   17,   66,   60,   25, 2111,
         13,    3, 1651,  594,   29,    3, 4003,  207,    1,  442,    6,
        333,  507,   97,  246,    5,   76,    1,  309,    5,  424,    1,
        109,  183,    5,   25,   31, 1383,   14,  708, 1338,   34,   26,
        127,    5,  156,  310,    6,  334,    2, 1506, 4116,   37,   28,
         23,  674, 6544,  212,  620,   13, 2093, 13

In [None]:
mod = keras.models.Sequential([
      # creates set of weights for each word
      keras.layers.Embedding(10000, 64, input_length = 300),
      # this turns the output data from 3d to 2d
      # (40000, 300, 64) -> (40000, 300 * 64)
      keras.layers.Flatten(),
      # these layers define a standard matrix multiplication
      keras.layers.Dense(64, activation = 'relu'),
      keras.layers.Dense(64, activation = 'relu'),
      keras.layers.Dense(1, activation = 'sigmoid')
])

In [None]:
mod.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 300, 64)           640000    
                                                                 
 flatten_2 (Flatten)         (None, 19200)             0         
                                                                 
 dense_6 (Dense)             (None, 640)               12288640  
                                                                 
 dense_7 (Dense)             (None, 640)               410240    
                                                                 
 dense_8 (Dense)             (None, 2)                 1282      
                                                                 
Total params: 13,340,162
Trainable params: 13,340,162
Non-trainable params: 0
_________________________________________________________________


In [None]:
mod.get_weights()

[array([[ 0.04291744,  0.04349399,  0.01040478, ..., -0.04623259,
          0.04485423,  0.02574397],
        [-0.03482485, -0.04079473, -0.00063542, ...,  0.02588374,
         -0.04079611, -0.04003321],
        [-0.02681776, -0.03409628,  0.04029787, ...,  0.02146183,
          0.02365097,  0.04049074],
        ...,
        [-0.01428535,  0.04499148,  0.04724013, ...,  0.0038721 ,
         -0.04148148,  0.00138398],
        [-0.02375745, -0.04427614,  0.03842926, ...,  0.0254203 ,
          0.0066247 ,  0.02471249],
        [-0.01132051,  0.0277896 ,  0.00420143, ..., -0.01000632,
          0.0123726 ,  0.00872688]], dtype=float32),
 array([[-0.00792474,  0.01245961,  0.0056677 , ..., -0.01093495,
          0.01435689,  0.01471265],
        [ 0.00861932, -0.00664628,  0.01544944, ..., -0.0061135 ,
         -0.01552988,  0.00661263],
        [-0.01258559, -0.00641319, -0.00908876, ...,  0.01702936,
         -0.01229722,  0.00564835],
        ...,
        [ 0.00921158, -0.01475048, -0.0

In [None]:
mod.fit(X_train, y_train)

RuntimeError: ignored

In [None]:
mod.compile(loss = 'binary_crossentropy', metrics = ['acc'])

In [None]:
mod.fit(X_train, y_train.values, epochs = 10, validation_split = 0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f515b73e4d0>

In [None]:
y_train.values

array([0, 0, 1, ..., 0, 1, 1])

In [None]:
mod.evaluate(X_test, y_test)



[2.2664287090301514, 0.8547000288963318]

In [None]:
mod.predict(X_test)[0]

array([0.0003995], dtype=float32)