Yap Yoon ICP 12

In [1]:
import pandas as pd
import numpy as np
import re
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import load_model
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

1) Load Sentiment.csv and process data

In [3]:
# Load the sentiment data
data = pd.read_csv('/content/Sentiment.csv')

# Keep the necessary columns - sentiment, text
data = data[['text', 'sentiment']]

In [4]:
# Convert all string to lowercase
data['text'] = data['text'].apply(lambda x:x.lower())

# Preprocess the text by removing everything that is not [a-zA-z0-9\s]
data['text'] = data['text'].apply((lambda x:re.sub('[^a-zA-Z0-9\s]'," ",x)))

In [5]:
# Interate over dataframe rows and remove 'rt' in the start of the text
for idx, row in data.iterrows():
  row[0] = row[0].replace('rt', ' ')

In [6]:
# Remove the stopwords from the text
stopwords = stopwords.words('english')
data["text"] = data["text"].apply(lambda x: " ".join([word for word in x.split() if word not in stopwords]))

In [7]:
# Fit tokenizer with the text values
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)

In [8]:
# Convert text into sequences
X = tokenizer.texts_to_sequences(data['text'].values)

# Pads sequences to the same length.
X = pad_sequences(X)

2) Build Keras model 

In [9]:
# Intialize embedding and lstm layer variables
embed_dim = 128
lstm_out = 196
length = X.shape[1]

In [10]:
# Build the model
model = Sequential()
model.add(layers.Embedding(max_features, embed_dim, input_length = length))
model.add(layers.LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(layers.Dense(3, activation='softmax'))

In [11]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

3) Train and save the model

In [12]:
# Encode and one-hot encode the sentiment labels
le = LabelEncoder()
encode = le.fit_transform(data['sentiment'])
y = to_categorical(encode)

In [13]:
# Generate the training and testing datasets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [14]:
# Fit the model
batch_size = 32
model.fit(x_train, y_train, epochs = 7, batch_size = batch_size, verbose = 2)

Epoch 1/7
291/291 - 48s - loss: 0.8216 - accuracy: 0.6468 - 48s/epoch - 164ms/step
Epoch 2/7
291/291 - 33s - loss: 0.6701 - accuracy: 0.7114 - 33s/epoch - 113ms/step
Epoch 3/7
291/291 - 33s - loss: 0.6081 - accuracy: 0.7413 - 33s/epoch - 113ms/step
Epoch 4/7
291/291 - 34s - loss: 0.5616 - accuracy: 0.7591 - 34s/epoch - 115ms/step
Epoch 5/7
291/291 - 34s - loss: 0.5219 - accuracy: 0.7813 - 34s/epoch - 117ms/step
Epoch 6/7
291/291 - 33s - loss: 0.4808 - accuracy: 0.7966 - 33s/epoch - 113ms/step
Epoch 7/7
291/291 - 33s - loss: 0.4471 - accuracy: 0.8162 - 33s/epoch - 113ms/step


<keras.callbacks.History at 0x7f955eed54d0>

In [15]:
# Evaluate the model
score, acc = model.evaluate(x_test, y_test, verbose = 2, batch_size = batch_size)

144/144 - 2s - loss: 0.9709 - accuracy: 0.6551 - 2s/epoch - 17ms/step


In [16]:
# Display loss and accuracy scores
print("score: %.2f" %(score))
print("acc: %.2f" %(acc))

score: 0.97
acc: 0.66


In [17]:
# Save model
model.save('./ICP12model'+'.h5')

In [18]:
# Load the model
model_reload = load_model('/content/ICP12model.h5')

In [19]:
# Preprocess new text
text = [["A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump "]]
df = pd.DataFrame(text, index = range(0, 1, 1), columns =list('t'))
df['t'] = df['t'].apply(lambda x : x.lower())
df['t'] = df['t'].apply((lambda x : re.sub('[^a-zA-z0-9\s]','',x)))

In [20]:
# Fit tokenizer with the text values
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split = ' ')
tokenizer.fit_on_texts(df['t'].values)

# Convert text into sequences
X = tokenizer.texts_to_sequences(df['t'].values)

# Pads sequences to the same length.
X = pad_sequences(X, maxlen = length)

In [21]:
# Use the model to predict the sentiment of the text
output = model_reload.predict(X)
print('Raw prediction:', output)
print("Model identified sentiment to be class", np.argmax(output))

# Identify the corresponding label and class
print(data['sentiment'])
print(y)

Raw prediction: [[0.13235769 0.3287208  0.53892154]]
Model identified sentiment to be class 2
0         Neutral
1        Positive
2         Neutral
3        Positive
4        Positive
           ...   
13866    Negative
13867    Positive
13868    Positive
13869    Negative
13870    Positive
Name: sentiment, Length: 13871, dtype: object
[[0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]


4) Apply code on spam dataset

In [22]:
# Load spam dataset
spam = pd.read_csv('/content/spam.csv',encoding='latin-1')

In [23]:
# Keep necessary columns
spam = spam[['v1','v2']]

# Preprocess the textual data
spam['v2'] = spam['v2'].apply(lambda x: x.lower())
spam['v2'] = spam['v2'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '',x)))
spam["v2"] = spam["v2"].apply(lambda x: " ".join([word for word in x.split() if word not in stopwords]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [24]:
# Fit tokenizer with the text values
max_features = 2000
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(spam['v2'].values)

# Convert text into sequences
X = tokenizer.texts_to_sequences(spam['v2'].values)

# Pads sequences to the same length.
X = pad_sequences(X)

In [25]:
# Intialize embedding and lstm layer variables
embed_dim = 128
lstm_out = 196

In [26]:
# Build the model
model2 = Sequential()

model2.add(layers.Embedding(max_features, embed_dim, input_length = X.shape[1]))
model2.add(layers.LSTM(lstm_out, dropout=0.2, recurrent_dropout =0.2))
model2.add(layers.Dense(2, activation = 'softmax'))

# Compile the model
model2.compile(loss='categorical_crossentropy', optimizer = 'adam', metrics=['accuracy'])

model2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 62, 128)           256000    
                                                                 
 lstm_1 (LSTM)               (None, 196)               254800    
                                                                 
 dense_1 (Dense)             (None, 2)                 394       
                                                                 
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Encode and one-hot encode the sentiment labels
le = LabelEncoder()
integer_encoded = le.fit_transform(spam['v1'])
y = to_categorical(integer_encoded)

# Generate the training and testing datasets
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.33, random_state=42)

In [28]:
# Fit the model
batch_size = 32
model2.fit(X_train, Y_train, epochs = 7, batch_size = batch_size, verbose = 2)

Epoch 1/7
117/117 - 36s - loss: 0.1743 - accuracy: 0.9421 - 36s/epoch - 309ms/step
Epoch 2/7
117/117 - 34s - loss: 0.0395 - accuracy: 0.9877 - 34s/epoch - 288ms/step
Epoch 3/7
117/117 - 34s - loss: 0.0194 - accuracy: 0.9952 - 34s/epoch - 288ms/step
Epoch 4/7
117/117 - 34s - loss: 0.0107 - accuracy: 0.9973 - 34s/epoch - 288ms/step
Epoch 5/7
117/117 - 34s - loss: 0.0058 - accuracy: 0.9984 - 34s/epoch - 289ms/step
Epoch 6/7
117/117 - 34s - loss: 0.0028 - accuracy: 0.9989 - 34s/epoch - 290ms/step
Epoch 7/7
117/117 - 34s - loss: 0.0016 - accuracy: 0.9992 - 34s/epoch - 288ms/step


<keras.callbacks.History at 0x7f955ab30d50>

In [29]:
# Evaluate the model
score , acc = model2.evaluate(X_test, Y_test, batch_size = batch_size, verbose = 2)

58/58 - 3s - loss: 0.1281 - accuracy: 0.9744 - 3s/epoch - 44ms/step


In [30]:
# Display loss and accuracy scores
print("score: %.2f" %(score))
print("acc: %.2f" %(acc))

score: 0.13
acc: 0.97


Transfer Learning

In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
# Define the shape for the images and the paths for the datasets
SHAPE = (32, 32)
test_path = '/content/drive/MyDrive/5590ICP10dataset/Test'
train_path = '/content/drive/MyDrive/5590ICP10dataset/Train'

In [33]:
# Define parameters for the image data generator
datagen_kwargs = dict(rescale=1./255)

# Obtain the testing dataset
valid_datagen = keras.preprocessing.image.ImageDataGenerator(**datagen_kwargs)
valid_gen = valid_datagen.flow_from_directory(test_path, shuffle=True, target_size=SHAPE, class_mode='categorical')

Found 20 images belonging to 10 classes.


In [34]:
# Split the generator into image and label variables
img_val, label_val = next(iter(valid_gen))

# Display the shapes of the image and label variables
print("Image shape:", img_val.shape)
print("Label shape:", label_val.shape)

Image shape: (20, 32, 32, 3)
Label shape: (20, 10)


In [35]:
# Obtain the training dataset
train_datagen = keras.preprocessing.image.ImageDataGenerator(**datagen_kwargs)
train_gen = train_datagen.flow_from_directory(train_path, shuffle=True, target_size=SHAPE, class_mode='categorical')

Found 80 images belonging to 10 classes.


In [36]:
# Split the generator into image and label variables
img_train, label_train = next(iter(train_gen))

# Display the shapes of the image and label variables
print("Image shape:", img_train.shape)
print("Label shape:", label_train.shape)
dataset_labels = sorted(train_gen.class_indices.items(), key=lambda pair:pair[1])

# Extract the labels of the images
dataset_labels = np.array([key.title() for key, value in dataset_labels])
print(dataset_labels)

  "Palette images with Transparency expressed in bytes should be "


Image shape: (32, 32, 32, 3)
Label shape: (32, 10)
['Bike' 'Boat' 'Bus' 'Car' 'Helicopter' 'Locomotive' 'Motorcycle' 'Plane'
 'Scooter' 'Truck']


In [37]:
# Load the model
model_custom = keras.models.load_model('/content/model2_withCallbacks.h5')

In [38]:
# Remodel the output of the old model
output = model_custom.layers[-1].output
output = layers.Flatten()(output)
model_custom = keras.Model(model_custom.input, output)

In [39]:
# Freeze the model layers
model_custom.trainable = False
for layer in model_custom.layers:
  layer.trainable = False

In [40]:
# Check that the layers are frozen
import pandas as pd
pd.set_option('max_colwidth', -1)
m_layers = [(layer, layer.name, layer.trainable) for layer in model_custom.layers]
pd.DataFrame(m_layers, columns=['Layer Type', 'Layer Name', 'Layer Trainable'])  

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Layer Type,Layer Name,Layer Trainable
0,<keras.engine.input_layer.InputLayer object at 0x7f9557e18e50>,input_7,False
1,<keras.layers.convolutional.Conv2D object at 0x7f9557e1b310>,conv2d_30,False
2,<keras.layers.core.dropout.Dropout object at 0x7f9557e1b810>,dropout_18,False
3,<keras.layers.convolutional.Conv2D object at 0x7f9557e1bb10>,conv2d_31,False
4,<keras.layers.pooling.MaxPooling2D object at 0x7f9557d9f150>,max_pooling2d_15,False
5,<keras.layers.convolutional.Conv2D object at 0x7f9557d9f710>,conv2d_32,False
6,<keras.layers.core.dropout.Dropout object at 0x7f955688e1d0>,dropout_19,False
7,<keras.layers.convolutional.Conv2D object at 0x7f9557d9fb50>,conv2d_33,False
8,<keras.layers.pooling.MaxPooling2D object at 0x7f9557da3410>,max_pooling2d_16,False
9,<keras.layers.convolutional.Conv2D object at 0x7f9557da39d0>,conv2d_34,False


In [41]:
# Build new model that works with the loaded model
input = keras.Input(shape=(32,32,3))

origin = model_custom(input, training=False)
den1 = layers.Dense(32, activation='relu')(origin)
drop1 = layers.Dropout(0.2)(den1)
den2 = layers.Dense(500, activation='relu')(drop1)
drop2 = layers.Dropout(0.2)(den2)
den3 = layers.Dense(250, activation='relu')(drop2)
out = layers.Dense(10, activation='softmax')(den3)

In [42]:
transfer_model = keras.Model(input, out)

In [43]:
# Compile model
transfer_model.compile(optimizer='SGD', loss='categorical_crossentropy', metrics=['accuracy'])

In [44]:
# Fit the training data to the model
history = transfer_model.fit(img_train, label_train, batch_size=10, epochs=50, validation_data=(img_val, label_val), shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [45]:
# Predict the first 5 images
pred = transfer_model.predict(img_val[0:5, :])
classes = np.argmax(pred, axis = 1)
actual = np.argmax(label_val[0:5], axis = 1)

# Show what the model predicted
print("The model identified the images to belong to classes {}.".format(classes))
# Show actual labels
print("The model identified the images to belong to classes {}.".format(actual))

The model identified the images to belong to classes [0 6 0 0 6].
The model identified the images to belong to classes [1 5 8 5 2].


In [46]:
model_custom.trainable = True
set_trainable = False 

# Unfreeze some of the layers of the loaded model
for layer in model_custom.layers:
  if layer.name in ['conv2d_34', 'conv2d_35	']:
    set_trainable = True
  if set_trainable:
    layer.trainable = True
  else:
    layer.trainable = False

In [47]:
# Check that some of the layers are unfrozen
transfer_model2 = keras.Model(input, out)
m_layers = [(layer, layer.name, layer.trainable) for layer in model_custom.layers]
pd.DataFrame(m_layers, columns=['Layer Type', 'Layer Name', 'Layer Trainable'])  

Unnamed: 0,Layer Type,Layer Name,Layer Trainable
0,<keras.engine.input_layer.InputLayer object at 0x7f9557e18e50>,input_7,False
1,<keras.layers.convolutional.Conv2D object at 0x7f9557e1b310>,conv2d_30,False
2,<keras.layers.core.dropout.Dropout object at 0x7f9557e1b810>,dropout_18,False
3,<keras.layers.convolutional.Conv2D object at 0x7f9557e1bb10>,conv2d_31,False
4,<keras.layers.pooling.MaxPooling2D object at 0x7f9557d9f150>,max_pooling2d_15,False
5,<keras.layers.convolutional.Conv2D object at 0x7f9557d9f710>,conv2d_32,False
6,<keras.layers.core.dropout.Dropout object at 0x7f955688e1d0>,dropout_19,False
7,<keras.layers.convolutional.Conv2D object at 0x7f9557d9fb50>,conv2d_33,False
8,<keras.layers.pooling.MaxPooling2D object at 0x7f9557da3410>,max_pooling2d_16,False
9,<keras.layers.convolutional.Conv2D object at 0x7f9557da39d0>,conv2d_34,True


In [48]:
# Build a model with the unfrozen layers
input = keras.Input(shape=(32,32,3))

origin = model_custom(input)
den1 = layers.Dense(32, activation='relu')(origin)
drop1 = layers.Dropout(0.2)(den1)
den2 = layers.Dense(500, activation='relu')(drop1)
drop2 = layers.Dropout(0.2)(den2)
den3 = layers.Dense(250, activation='relu')(drop2)
out = layers.Dense(10, activation='softmax')(den3)

In [49]:
# Compile model
transfer_model2.compile(optimizer='SGD', loss='categorical_crossentropy', metrics=['accuracy'])

In [50]:
# Fit the training data to the model
transfer_model2.fit(img_train, label_train, batch_size=10, epochs=50, verbose=1,
                            validation_data=(img_val, label_val),
                            shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f9557a9f250>

In [51]:
# Predict the first 5 images
pred = transfer_model2.predict(img_val[0:5, :])
classes2 = np.argmax(pred, axis = 1)
actual2 = np.argmax(label_val[0:5], axis = 1)

# Show what the model predicted
print("The model identified the images to belong to classes {}.".format(classes2))
# Show actual labels
print("The model identified the images to belong to classes {}.".format(actual2))

The model identified the images to belong to classes [0 6 0 0 6].
The model identified the images to belong to classes [1 5 8 5 2].
