# Modelling

In [None]:
!pip install wandb



In [None]:
from tensorflow.keras.callbacks import Callback
from wandb.keras import WandbCallback

In [None]:
import pandas as pd
df=pd.read_csv('modeling_data.csv')

#### Dealing with the class imbalance

In [None]:
#We will use up-sampling to solve the clase imbalance
#Up-sampling is the process of randomly duplicating observations from the minority class in order to reinforce its signal

from sklearn.utils import resample

# Separate majority and minority classes
df_majority = df[df['class']==1]
df_minority = df[df['class']==0]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=130,    # to match majority class
                                 random_state=0) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled['class'].value_counts()

0    130
1    115
Name: class, dtype: int64

In [None]:
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mjack-analyst[0m (use `wandb login --relogin` to force relogin)


True

In [None]:
#config
config = wandb.config

## **Random Forest**

In [None]:
#Importing libraries
import pandas as pd
import numpy as np
#for metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import Embedding
df_clean = pd.read_csv('/content/modeling_data.csv')

In [None]:
# Separating the target from our data
ft = (df_clean.drop(['class'],axis=1)).columns
X = df_clean['clean_tweet'].values
y = df_clean['class'].values

#  Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
# set a vocabulary size. This is the maximum number of words that can be used.
vocabulary_size = 10000

# create the tokenizer that comes with Keras.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(X_train)

# convert the texts to sequences.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_test)

max_words = 5000
max_len = 200

X_train_seq_padded = pad_sequences(X_train_seq, maxlen=200)
X_val_seq_padded  = pad_sequences(X_val_seq, maxlen=200)

In [None]:
#instantiate the model 
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error,mean_squared_error, accuracy_score, classification_report, f1_score,r2_score

forest = RandomForestClassifier(random_state=0)

# Train it on our training set.
forest.fit(X_train_seq_padded , y_train)

# Predict based on the model we've trained
y_pred_ftrain = forest.predict(X_train_seq_padded)
y_pred = forest.predict(X_val_seq_padded)


print("Accuracy Score test: ",accuracy_score(y_test, y_pred))
print('R-squared test:',r2_score(y_test, y_pred))  
print('Mean Squared Error test:', mean_squared_error(y_test, y_pred))
print('\n')
print(classification_report(y_test,y_pred))


Accuracy Score test:  0.8571428571428571
R-squared test: 0.02608695652173887
Mean Squared Error test: 0.14285714285714285


              precision    recall  f1-score   support

           0       1.00      0.20      0.33         5
           1       0.85      1.00      0.92        23

    accuracy                           0.86        28
   macro avg       0.93      0.60      0.63        28
weighted avg       0.88      0.86      0.82        28



## **KNN**

In [None]:
#we are going to use k of 5
from sklearn.neighbors import KNeighborsClassifier
cl=KNeighborsClassifier(n_neighbors=5)
cl.fit(X_train_seq_padded,y_train)
#we will predict our model
y_pred =cl.predict(X_val_seq_padded)

In [None]:
#checking metrics for accuracy
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 1  4]
 [ 2 21]]
              precision    recall  f1-score   support

           0       0.33      0.20      0.25         5
           1       0.84      0.91      0.87        23

    accuracy                           0.79        28
   macro avg       0.59      0.56      0.56        28
weighted avg       0.75      0.79      0.76        28



In [None]:
#we will tune our parameter using gridsearch
from sklearn.model_selection import GridSearchCV
from sklearn import neighbors
params = {'n_neighbors':[2,3,4,5,6,7,8,9]}

knn = neighbors.KNeighborsClassifier()

cl_g = GridSearchCV(knn, params, cv=5)
cl_g.fit(X_train_seq_padded,y_train)
cl_g.best_params_
#the best parameter for our model is 9

{'n_neighbors': 9}

In [None]:
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [None]:
#training our model using k as 9

from sklearn.neighbors import KNeighborsClassifier
cl_gs=KNeighborsClassifier(n_neighbors=9)
cl_gs.fit(X_train_seq_padded,y_train)

#predicting our model
y_pred_gs=cl_gs.predict(X_val_seq_padded)

In [None]:
#checking metrics for our models accuracy
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred_gs))
print(classification_report(y_test, y_pred_gs))

[[ 0  5]
 [ 1 22]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.81      0.96      0.88        23

    accuracy                           0.79        28
   macro avg       0.41      0.48      0.44        28
weighted avg       0.67      0.79      0.72        28



## **Gradient Boost Classifier**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier


gbr = GradientBoostingClassifier(n_estimators=200, learning_rate=0.4, max_depth=2)

#training our model.
gbr = gbr.fit(X_train_seq_padded,y_train)

#predicting.
y_pred_gbr = gbr.predict(X_val_seq_padded)

In [None]:
#checking metrics for our models accuracy
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 1  4]
 [ 2 21]]
              precision    recall  f1-score   support

           0       0.33      0.20      0.25         5
           1       0.84      0.91      0.87        23

    accuracy                           0.79        28
   macro avg       0.59      0.56      0.56        28
weighted avg       0.75      0.79      0.76        28



##Naive bayes

In [None]:
#Declaring our X and Y variables
X = df_clean.clean_tweet.values
y = df_clean['class'].values

In [None]:
# Train Test Split 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=10)

In [None]:
# set a vocabulary size. This is the maximum number of words that can be used.
vocabulary_size = 10000

# create the tokenizer that comes with Keras.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(X_train)

# convert the texts to sequences.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_test)

In [None]:
max_words = 5000
max_len = 5000

X_train_seq_padded = pad_sequences(X_train_seq, maxlen=200)
X_val_seq_padded  = pad_sequences(X_val_seq, maxlen=200)

In [None]:
# Training the Model
# We will start by splitting our data into training and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fitting our model 
# Then, all that we have to do is initialize the Naive Bayes Classifier and fit the data. 
# For text classification problems, the Multinomial Naive Bayes Classifier is well-suited
# 
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB().fit(X_train_seq_padded, y_train)

# Evaluating the Model
# Once we have put together our classifier, we can evaluate its performance in the testing set
# 
predicted = model.predict(X_val_seq_padded)
print(np.mean(predicted == y_test))
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))

0.6785714285714286
[[ 1  4]
 [ 5 18]]
              precision    recall  f1-score   support

           0       0.17      0.20      0.18         5
           1       0.82      0.78      0.80        23

    accuracy                           0.68        28
   macro avg       0.49      0.49      0.49        28
weighted avg       0.70      0.68      0.69        28



Hyperparameter Tuning

In [None]:
#Tuning hyperparameters and transforming features to a normal distribution
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

alphas = [0.1, 0.001, 0.2, 0.3, 0.4, 0.5]
p_grid_NB = {'alpha': alphas}

NB_cls= MultinomialNB()

grid = GridSearchCV(estimator = NB_cls, param_grid = p_grid_NB, cv = 5)
model = MultinomialNB().fit(X_train_seq_padded, y_train)
grid.fit(X_train_seq_padded, y_train)
grid.best_params_

{'alpha': 0.1}

In [None]:
y_true, y_pred = y_test , model.predict(X_val_seq_padded)

from sklearn.metrics import classification_report
print('Results on the test set:')
print(classification_report(y_true, y_pred))

Results on the test set:
              precision    recall  f1-score   support

           0       0.17      0.20      0.18         5
           1       0.82      0.78      0.80        23

    accuracy                           0.68        28
   macro avg       0.49      0.49      0.49        28
weighted avg       0.70      0.68      0.69        28



**Trying Gaussian Naive Bayes**

In [None]:
#  Training our model and printing out metrics
 
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()  
model = clf.fit(X_train_seq_padded, y_train) 
predicted = model.predict(X_val_seq_padded)
print(np.mean(predicted == y_test))
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))

0.17857142857142858
[[ 5  0]
 [23  0]]
              precision    recall  f1-score   support

           0       0.18      1.00      0.30         5
           1       0.00      0.00      0.00        23

    accuracy                           0.18        28
   macro avg       0.09      0.50      0.15        28
weighted avg       0.03      0.18      0.05        28



  _warn_prf(average, modifier, msg_start, len(result))


##Neural Networks MLP

In [None]:
import pandas as pd
import numpy as np
#for metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import Embedding

from sklearn.neural_network import MLPClassifier

from sklearn.metrics import classification_report,confusion_matrix

In [None]:
df_clean = pd.read_csv('/content/modeling_data.csv')

In [None]:
#Declaring our X and Y variables
X = df_clean.clean_tweet.values
y = df_clean['class'].values

In [None]:
# Train Test Split 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=10)

In [None]:
# set a vocabulary size. This is the maximum number of words that can be used.
vocabulary_size = 10000

# create the tokenizer that comes with Keras.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(X_train)

# convert the texts to sequences.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_test)

In [None]:
max_words = 5000
max_len = 5000

X_train_seq_padded = pad_sequences(X_train_seq, maxlen=200)
X_val_seq_padded  = pad_sequences(X_val_seq, maxlen=200)

In [None]:
#creating an instamce ofthe model
mlp = MLPClassifier(hidden_layer_sizes = (13, 13,13), max_iter = 500)

# fitting the data
mlp.fit(X_train_seq_padded,y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(13, 13, 13), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [None]:
# Now that we have our model in place, let's do the prediction
from sklearn import metrics
pred = mlp.predict(X_val_seq_padded)

mlp_nn = metrics.accuracy_score(pred,y_test)*100

# Evaluating the performance of ur model

print('The accuracy of the model is ',metrics.accuracy_score(y_test, pred))

print (confusion_matrix(y_test,pred))

print('-----------------------------------------------')

print(classification_report(y_test,pred))

The accuracy of the model is  0.6428571428571429
[[ 1  4]
 [ 6 17]]
-----------------------------------------------
              precision    recall  f1-score   support

           0       0.14      0.20      0.17         5
           1       0.81      0.74      0.77        23

    accuracy                           0.64        28
   macro avg       0.48      0.47      0.47        28
weighted avg       0.69      0.64      0.66        28



In [None]:
# Extracting the weights and bias vectors

# Checking the number of weights 
len(mlp.coefs_) 

# Checking the number of biases 
len(mlp.intercepts_)

4

**Hyperparameter tune for the MLP**

In [None]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=100)

In [None]:
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu','sigmoid'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

In [None]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train_seq_padded, y_train)



GridSearchCV(cv=3, error_score=nan,
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_fun=15000,
                                     max_iter=100, momentum=0.9,
                                     n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_state...
                                     validation_fraction=0.1, verbose=False,
                                     warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'activation': ['tanh', 'relu', 'sigmoid'],
                         'alpha': [0.00

In [None]:
# Best paramete set
print('Best parameters found:\n', clf.best_params_)

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters found:
 {'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'solver': 'adam'}
0.837 (+/-0.041) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
0.773 (+/-0.054) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'adam'}
0.836 (+/-0.004) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}
0.727 (+/-0.045) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'adam'}
0.827 (+/-0.054) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
0.791 (+/-0.091) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant', 'solver': 'adam'

In [None]:
y_true, y_pred = y_test , clf.predict(X_val_seq_padded)

from sklearn.metrics import classification_report
print('Results on the test set:')
print(classification_report(y_true, y_pred))

Results on the test set:
              precision    recall  f1-score   support

           0       0.40      0.40      0.40         5
           1       0.87      0.87      0.87        23

    accuracy                           0.79        28
   macro avg       0.63      0.63      0.63        28
weighted avg       0.79      0.79      0.79        28



## Neural Network: ANN and RNN

**Splitting the Dataset into Train and Test**

In [None]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)

print('df_train shape: {}'.format(df_train.shape))
print('df_test shape: {}'.format(df_test.shape))

print('df_train: {:.2f}% positive reviews'.format(df_train['class'].mean()*100))
print('df_test: {:.2f}% positive reviews'.format(df_test['class'].mean()*100))

df_train shape: (110, 11)
df_test shape: (28, 11)
df_train: 83.64% positive reviews
df_test: 82.14% positive reviews


**Further Splitting the Train dataset into Train and Validation**

In [None]:
# Split the data into train and validation set.
df0_train, df0_val = train_test_split(df_train, test_size=0.2)

In [None]:
#set the target and features
X_train = df0_train['clean_tweet'].values
y_train = df0_train['class'].values

X_val = df0_val['clean_tweet'].values
y_val = df0_val['class'].values

**Preprocessing the Text: Tokenization and Conversion to Sequences**

In [None]:
# set a vocabulary size. This is the maximum number of words that can be used.
vocabulary_size = 10000

# create the tokenizer that comes with Keras.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(X_train)

# convert the texts to sequences.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

In [None]:
l = [len(i) for i in X_train_seq]
l = np.array(l)
print('minimum number of words: {}'.format(l.min()))
print('median number of words: {}'.format(np.median(l)))
print('average number of words: {}'.format(l.mean()))
print('maximum number of words: {}'.format(l.max()))

minimum number of words: 3
median number of words: 18.5
average number of words: 18.602272727272727
maximum number of words: 36


In [None]:
X_train_seq_padded = pad_sequences(X_train_seq, maxlen=200)
X_val_seq_padded  = pad_sequences(X_val_seq, maxlen=200)

**ANN**

In [None]:
#Intialize
wandb.init(project="Remote Learning",config=config)

In [None]:
import tensorflow as tf

#Create a tensorflow model
embedding_vector_length = 32
vocab_size = len(tokenizer.word_index) + 1
model = tf.keras.models.Sequential() 
model.add(tf.keras.layers.Embedding(vocab_size, embedding_vector_length, input_length=200) )
model.add(tf.keras.layers.SpatialDropout1D(0.25))
model.add(tf.keras.layers.LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(1, activation='sigmoid')) 
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])  
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 32)           25888     
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 200, 32)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 50)                16600     
_________________________________________________________________
dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 51        
Total params: 42,539
Trainable params: 42,539
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
#fit the model
history = model.fit(X_train_seq_padded,y_train,validation_split=0.2, epochs=5, batch_size=32,callbacks=[WandbCallback()])
wandb.finish()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


VBox(children=(Label(value=' 0.74MB of 0.74MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,4.0
loss,0.63525
accuracy,0.82857
val_loss,0.61311
val_accuracy,0.83333
_runtime,16.0
_timestamp,1629326994.0
_step,4.0
best_val_loss,0.61311
best_epoch,4.0


0,1
epoch,▁▃▅▆█
loss,█▇▅▃▁
accuracy,▁▇███
val_loss,█▇▆▄▁
val_accuracy,▁▁▁▁▁
_runtime,▁▁▃▆█
_timestamp,▁▁▃▆█
_step,▁▃▅▆█


### **RNN**

In [None]:
#Intialize
wandb.init(project="Remote Learning",config=config)

**LSTM**

In [None]:
max_words = 5000
max_len = 200

In [None]:
model=tf.keras.Sequential([
                           tf.keras.layers.Embedding(vocab_size,embedding_vector_length, input_length=max_len),
                           tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,dropout=0.5,return_sequences=True)),
                           tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
                           tf.keras.layers.Dense(24,activation='relu'),
                           tf.keras.layers.Dense(1,activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 32)           25888     
_________________________________________________________________
bidirectional (Bidirectional (None, 200, 128)          49664     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense_1 (Dense)              (None, 24)                1560      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 25        
Total params: 118,353
Trainable params: 118,353
Non-trainable params: 0
_________________________________________________________________


In [None]:
num_epochs=5
history=model.fit(X_train_seq_padded,y_train,epochs=num_epochs,
                  validation_data=(X_val_seq_padded,y_val),verbose=2,callbacks=[WandbCallback()])
wandb.finish()

Epoch 1/5
3/3 - 10s - loss: 0.6875 - accuracy: 0.6136 - val_loss: 0.6632 - val_accuracy: 0.8636
Epoch 2/5
3/3 - 1s - loss: 0.6545 - accuracy: 0.8295 - val_loss: 0.6085 - val_accuracy: 0.8636
Epoch 3/5
3/3 - 1s - loss: 0.5977 - accuracy: 0.8295 - val_loss: 0.5184 - val_accuracy: 0.8636
Epoch 4/5
3/3 - 1s - loss: 0.5086 - accuracy: 0.8295 - val_loss: 0.4025 - val_accuracy: 0.8636
Epoch 5/5
3/3 - 1s - loss: 0.4394 - accuracy: 0.8295 - val_loss: 0.4143 - val_accuracy: 0.8636


VBox(children=(Label(value=' 1.77MB of 1.77MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,4.0
loss,0.43938
accuracy,0.82955
val_loss,0.4143
val_accuracy,0.86364
_runtime,24.0
_timestamp,1629327024.0
_step,4.0
best_val_loss,0.40247
best_epoch,3.0


0,1
epoch,▁▃▅▆█
loss,█▇▅▃▁
accuracy,▁████
val_loss,█▇▄▁▁
val_accuracy,▁▁▁▁▁
_runtime,▁▂▅▇█
_timestamp,▁▂▅▇█
_step,▁▃▅▆█


In [None]:
#Intialize
wandb.init(project="Remote Learning",config=config)

In [None]:
from keras.models import Sequential
from keras import layers
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint

max_words = 5000
max_len = 200

X_train_seq_padded = pad_sequences(X_train_seq, maxlen=200)
X_val_seq_padded  = pad_sequences(X_val_seq, maxlen=200)

model1 = Sequential()
model1.add(layers.Embedding(max_words, 20)) #The embedding layer
model1.add(layers.LSTM(15,dropout=0.5)) #Our LSTM layer
model1.add(layers.Dense(1,activation='sigmoid'))
print(model1.summary())

model1.compile(optimizer='rmsprop',loss='binary_crossentropy', metrics=['accuracy'])


#model.fit(X_train, Y_train,validation_data = (X_test,y_test),epochs = 10, batch_size=32)

history = model1.fit(X_train_seq_padded, y_train, epochs=10,batch_size=32,
                     validation_data=(X_val_seq_padded, y_val),callbacks=[WandbCallback()])
wandb.finish()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 20)          100000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 15)                2160      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 16        
Total params: 102,176
Trainable params: 102,176
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


VBox(children=(Label(value=' 1.02MB of 1.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,9.0
loss,0.44363
accuracy,0.82955
val_loss,0.4126
val_accuracy,0.86364
_runtime,11.0
_timestamp,1629327042.0
_step,9.0
best_val_loss,0.4126
best_epoch,9.0


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,██▇▇▆▅▃▂▁▁
accuracy,▁█████████
val_loss,██▇▇▆▃▂▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
_runtime,▁▃▃▃▆▆▆███
_timestamp,▁▃▃▃▆▆▆███
_step,▁▂▃▃▄▅▆▆▇█


In [None]:
#Evaluating the model using the test data
# predict on the test dataset.

# transform text to sequences.
X_test_seq = tokenizer.texts_to_sequences(df_test['clean_tweet'].values)
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=200)
y_test = df_test['class'].values

y_test_pred = model1.predict(X_test_seq_padded)
y_test_pred = y_test_pred.reshape(y_test_pred.shape[0],)
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

auc_lstm = roc_auc_score(y_test, y_test_pred)
auc_lstm

0.7217391304347827

**Bidirectional layers**

In [None]:
#Intialize
wandb.init(project="Remote Learning",config=config)

In [None]:
#Intialize the model
model2 = Sequential()
model2.add(layers.Embedding(max_words, 40, input_length=max_len))
model2.add(layers.Bidirectional(layers.LSTM(20,dropout=0.6)))
model2.add(layers.Dense(1,activation='sigmoid'))
#Call comipiler ab=nd the checkpoints

model2.compile(optimizer='rmsprop',loss='binary_crossentropy', metrics=['accuracy'])

#fit the model

history = model2.fit(X_train_seq_padded, y_train, epochs=10,
                     validation_data=(X_val_seq_padded, y_val),callbacks=[WandbCallback()])
wandb.finish()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


VBox(children=(Label(value=' 1.88MB of 1.88MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,9.0
loss,0.3943
accuracy,0.82955
val_loss,0.3935
val_accuracy,0.86364
_runtime,17.0
_timestamp,1629327065.0
_step,9.0
best_val_loss,0.39308
best_epoch,7.0


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▇▅▃▃▂▂▂▁▁
accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,█▆▃▁▁▁▁▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
_runtime,▁▁▃▃▅▅▆▆██
_timestamp,▁▁▃▃▅▅▆▆██
_step,▁▂▃▃▄▅▆▆▇█


In [None]:
#Evaluating the model using the test data
# predict on the test dataset.

# transform text to sequences.
X_test_seq = tokenizer.texts_to_sequences(df_test['clean_tweet'].values)
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=200)
y_test = df_test['class'].values

y_test_pred = model2.predict(X_test_seq_padded)
y_test_pred = y_test_pred.reshape(y_test_pred.shape[0],)
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

auc_lstm = roc_auc_score(y_test, y_test_pred)
auc_lstm

0.7478260869565218

# Transformers

In [None]:
!pip install git+https://github.com/huggingface/transformers.git

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-nd_xoiqa
  Running command git clone -q https://github.com/huggingface/transformers.git /tmp/pip-req-build-nd_xoiqa
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone


In [None]:
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
import tensorflow as tf
import pandas as pd
import json
import gc

#for warning
import warnings
warnings.filterwarnings("ignore")

In [None]:
data_texts = df["clean_tweet"].to_list() # Features (not-tokenized yet)
data_labels = df["class"].to_list() # Lables

In [None]:
from sklearn.model_selection import train_test_split

# Split Train and Validation data
train_texts, val_texts, train_labels, val_labels = train_test_split(data_texts, data_labels, test_size=0.2, random_state=0)

# Keep some data for inference (testing)
train_texts, test_texts, train_labels, test_labels = train_test_split(train_texts, train_labels, test_size=0.01, random_state=0)

In [None]:
#Tokenizing
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

**Building the model and compiling it**

In [None]:
#Intialize
wandb.init(project="Remote Learning",config=config)

In [None]:
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'pre_classifier', 'dropout_20']
You should probably TRAIN this model on a down-stream task to be able to use i

In [None]:
model.fit(train_dataset.shuffle(1000).batch(16), epochs=3, batch_size=16,
          validation_data=val_dataset.shuffle(1000).batch(16),callbacks=[WandbCallback()])
wandb.finish()

Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Saving the model to HDF5 format requires the model to be a Functional model or a Sequential model. It does not work for subclassed models, because such models are defined via the body of a Python method, which isn't safely serializable. Consider saving to the Tensorflow SavedModel format (by setting save_format="tf") or using `save_weights`.


Epoch 2/3
Epoch 3/3


VBox(children=(Label(value=' 0.41MB of 0.41MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,2.0
loss,0.36964
accuracy,0.84259
val_loss,0.44086
val_accuracy,0.82143
_runtime,209.0
_timestamp,1629327306.0
_step,2.0
best_val_loss,0.44086
best_epoch,2.0


0,1
epoch,▁▅█
loss,█▃▁
accuracy,▁▁▁
val_loss,█▅▁
val_accuracy,▁▁▁
_runtime,▁▅█
_timestamp,▁▅█
_step,▁▅█


**Make Prediction**

In [None]:
save_directory = "/saved_models" # change this to your preferred location

model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('/saved_models/tokenizer_config.json',
 '/saved_models/special_tokens_map.json',
 '/saved_models/vocab.txt',
 '/saved_models/added_tokens.json')

In [None]:
loaded_tokenizer = DistilBertTokenizer.from_pretrained(save_directory)
loaded_model = TFDistilBertForSequenceClassification.from_pretrained(save_directory)

Some layers from the model checkpoint at /saved_models were not used when initializing TFDistilBertForSequenceClassification: ['dropout_20']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /saved_models and are newly initialized: ['dropout_40']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
test_text = test_texts[0]
test_text

'onlin learn amp onlin date feel like someth happen noth happen'

In [None]:
predict_input = loaded_tokenizer.encode(test_text,
                                 truncation=True,
                                 padding=True,
                                 return_tensors="tf")

output = loaded_model(predict_input)[0]

prediction_value = tf.argmax(output, axis=1).numpy()[0]
prediction_value

1