<a href="https://colab.research.google.com/github/iyline-sigey/PREDICTIVE-ANALYSIS-ON-REMOTE-LEARNING/blob/Modelling/ANN_and_RNN_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
#Libraries for modelling
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers
from keras import layers

**Load the data**

In [2]:
df=pd.read_csv('modeling_data.csv')

In [3]:
df.columns

Index(['Unnamed: 0', 'clean_tweet', 'class'], dtype='object')

In [4]:
df.drop(['Unnamed: 0'], axis =1, inplace=True)

In [5]:
#Checking the shape of the minority class
a= df[df['class']==0]
a.shape

(118, 2)

Dealing with the class imbalance

In [6]:
#We will use up-sampling to solve the clase imbalance
#Up-sampling is the process of randomly duplicating observations from the minority class in order to reinforce its signal

from sklearn.utils import resample

# Separate majority and minority classes
df_majority = df[df['class']==1]
df_minority = df[df['class']==0]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=130,    # to match majority class
                                 random_state=0) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled['class'].value_counts()

1    192
0    130
Name: class, dtype: int64

# Modelling

**Splitting the Dataset into Train and Test**

In [7]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)

print('df_train shape: {}'.format(df_train.shape))
print('df_test shape: {}'.format(df_test.shape))

print('df_train: {:.2f}% positive reviews'.format(df_train['class'].mean()*100))
print('df_test: {:.2f}% positive reviews'.format(df_test['class'].mean()*100))

df_train shape: (248, 2)
df_test shape: (62, 2)
df_train: 60.89% positive reviews
df_test: 66.13% positive reviews


**Further Splitting the Train dataset into Train and Validation**

In [8]:
# Split the data into train and validation set.
df0_train, df0_val = train_test_split(df_train, test_size=0.2)

In [9]:
#set the target and features
X_train = df0_train['clean_tweet'].values
y_train = df0_train['class'].values

X_val = df0_val['clean_tweet'].values
y_val = df0_val['class'].values

**Preprocessing the Text: Tokenization and Conversion to Sequences**

In [10]:
# set a vocabulary size. This is the maximum number of words that can be used.
vocabulary_size = 10000

# create the tokenizer that comes with Keras.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(X_train)

# convert the texts to sequences.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

In [11]:
l = [len(i) for i in X_train_seq]
l = np.array(l)
print('minimum number of words: {}'.format(l.min()))
print('median number of words: {}'.format(np.median(l)))
print('average number of words: {}'.format(l.mean()))
print('maximum number of words: {}'.format(l.max()))

minimum number of words: 4
median number of words: 22.0
average number of words: 22.252525252525253
maximum number of words: 46


In [12]:
X_train_seq_padded = pad_sequences(X_train_seq, maxlen=200)
X_val_seq_padded  = pad_sequences(X_val_seq, maxlen=200)

**ANN**

In [13]:
import tensorflow as tf

#Create a tensorflow model
embedding_vector_length = 32
vocab_size = len(tokenizer.word_index) + 1
model = tf.keras.models.Sequential() 
model.add(tf.keras.layers.Embedding(vocab_size, embedding_vector_length, input_length=200) )
model.add(tf.keras.layers.SpatialDropout1D(0.25))
model.add(tf.keras.layers.LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(1, activation='sigmoid')) 
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])  
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 32)           37216     
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 200, 32)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 50)                16600     
_________________________________________________________________
dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 51        
Total params: 53,867
Trainable params: 53,867
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
#fit the model
history = model.fit(X_train_seq_padded,y_train,validation_split=0.2, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


**RNN**

In [15]:
from keras.models import Sequential
from keras import layers
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint

max_words = 5000
max_len = 200

X_train_seq_padded = pad_sequences(X_train_seq, maxlen=200)
X_val_seq_padded  = pad_sequences(X_val_seq, maxlen=200)

model1 = Sequential()
model1.add(layers.Embedding(max_words, 20)) #The embedding layer
model1.add(layers.LSTM(15,dropout=0.5)) #Our LSTM layer
model1.add(layers.Dense(1,activation='sigmoid'))
print(model1.summary())

model1.compile(optimizer='rmsprop',loss='binary_crossentropy', metrics=['accuracy'])

checkpoint1 = ModelCheckpoint("best_model1.hdf5", monitor='val_accuracy', verbose=1,
                              save_best_only=True,mode='auto', period=1,save_weights_only=False)
#model.fit(X_train, Y_train,validation_data = (X_test,y_test),epochs = 10, batch_size=32)

history = model1.fit(X_train_seq_padded, y_train, epochs=10,batch_size=32,
                     validation_data=(X_val_seq_padded, y_val),callbacks=[checkpoint1])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 20)          100000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 15)                2160      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 16        
Total params: 102,176
Trainable params: 102,176
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.66000, saving model to best_model1.hdf5
Epoch 2/10

Epoch 00002: val_accuracy did not improve from 0.66000
Epoch 3/10

Epoch 00003: val_accuracy did not improve from 0.66000
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.66000
Epoch 5/10

Epoch 00005: val_accuracy improved from 0.66000 to 0.88000

In [16]:
#Evaluating the model using the test data
# predict on the test dataset.

# transform text to sequences.
X_test_seq = tokenizer.texts_to_sequences(df_test['clean_tweet'].values)
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=200)
y_test = df_test['class'].values

y_test_pred = model1.predict(X_test_seq_padded)
y_test_pred = y_test_pred.reshape(y_test_pred.shape[0],)
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

auc_lstm = roc_auc_score(y_test, y_test_pred)
auc_lstm

1.0

**Bidirectional layers**

In [17]:
#Intialize the model
model2 = Sequential()
model2.add(layers.Embedding(max_words, 40, input_length=max_len))
model2.add(layers.Bidirectional(layers.LSTM(20,dropout=0.6)))
model2.add(layers.Dense(1,activation='sigmoid'))
#Call comipiler ab=nd the checkpoints

model2.compile(optimizer='rmsprop',loss='binary_crossentropy', metrics=['accuracy'])
checkpoint2 = ModelCheckpoint("best_model2.hdf5", monitor='val_accuracy', 
                              verbose=1,save_best_only=True, mode='auto', 
                              period=1,save_weights_only=False)

#fit the model

history = model2.fit(X_train_seq_padded, y_train, epochs=10,
                     validation_data=(X_val_seq_padded, y_val),callbacks=[checkpoint2])

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.66000, saving model to best_model2.hdf5
Epoch 2/10

Epoch 00002: val_accuracy did not improve from 0.66000
Epoch 3/10

Epoch 00003: val_accuracy did not improve from 0.66000
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.66000
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.66000
Epoch 6/10

Epoch 00006: val_accuracy improved from 0.66000 to 0.70000, saving model to best_model2.hdf5
Epoch 7/10

Epoch 00007: val_accuracy improved from 0.70000 to 1.00000, saving model to best_model2.hdf5
Epoch 8/10

Epoch 00008: val_accuracy did not improve from 1.00000
Epoch 9/10

Epoch 00009: val_accuracy did not improve from 1.00000
Epoch 10/10

Epoch 00010: val_accuracy did not improve from 1.00000


In [18]:
#Evaluating the model using the test data
# predict on the test dataset.

# transform text to sequences.
X_test_seq = tokenizer.texts_to_sequences(df_test['clean_tweet'].values)
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=200)
y_test = df_test['class'].values

y_test_pred = model2.predict(X_test_seq_padded)
y_test_pred = y_test_pred.reshape(y_test_pred.shape[0],)
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

auc_lstm = roc_auc_score(y_test, y_test_pred)
auc_lstm

1.0