<a href="https://colab.research.google.com/github/idoFinder/NLP_colab/blob/master/CNN_For_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports



In [0]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import os
from google.colab import drive
from sklearn.model_selection import train_test_split

In [0]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

from tensorflow.keras import layers
import tensorflow_datasets as tfds

# Load Data

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
columns = ['idx','sentiment','id','date','query','user','text']
train_data = pd.read_csv('/content/drive/My Drive/Projects/COLAB/CNN_For_NLP/data/train_sub.csv',
                         header=None,
                         names = columns,
                         engine="python",
                         encoding="latin1")

# TODO: fix the CSV to match this format
train_data = train_data.drop(['idx'], axis=1).iloc[1:]

In [0]:
columns = ['sentiment','id','date','query','user','text']
test_data = pd.read_csv('/content/drive/My Drive/Projects/COLAB/CNN_For_NLP/data/test.csv',
                         header=None,
                         names = columns,
                         engine="python",
                         encoding="latin1")

# Cleaning & Preprocessing

In [0]:
def clean_tweets(tweet):
  tweet = BeautifulSoup(tweet,"lxml").get_text()
  tweet = re.sub(r"@[A-Za-z0-9]+", ' ',tweet)
  tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ',tweet)
  tweet = re.sub(r"[^a-zA-Z.!?]", ' ',tweet)
  tweet = re.sub(r" +", ' ',tweet)
  return tweet


In [0]:
train_data.drop(['id','date','query','user'], axis=1,inplace=True)
test_data.drop(['id','date','query','user'], axis=1,inplace=True)

clean_train_data = [clean_tweets(tweet) for tweet in  train_data.text]
clean_test_data = [clean_tweets(tweet) for tweet in  test_data.text]

# changing the sentiment 4 into 1
train_labels = train_data.sentiment.values
train_labels[train_labels == '1'] = 1
train_labels[train_labels == '0'] = 0

# Tokenizing the words into numbers vector
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    clean_train_data, target_vocab_size=2**16
)

# Converting the text input into numerical vector
data_inputs = [tokenizer.encode(sent) for sent in clean_train_data]

MAX_LEN = max(len(vec) for vec in data_inputs)

# Add padding to all inputs to have the same size
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,
                                                            value=0,
                                                            padding='post',
                                                            maxlen=MAX_LEN)

# split train and test
X_train, X_test, y_train, y_test = train_test_split(data_inputs, train_labels,
                                                    stratify=train_labels, 
                                                    test_size=0.01)

# Model

In [0]:
class DCNN(tf.keras.Model):
  def __init__(self,
               vocab_size,
               emb_dim=128,
               nb_filters=50,
               FFN_units=512,
               nb_classes=2,
               dropout_rate=0.1,
               training=False,
               name='DCNN'):
    
    super(DCNN,self).__init__(name=name)
    
    self.embedding = layers.Embedding(vocab_size, emb_dim)

    self.bigram = layers.Conv1D(filters=nb_filters,padding='valid',kernel_size=2,activation='relu')
    self.pool_1 = layers.GlobalMaxPooling1D()

    self.trigram = layers.Conv1D(filters=nb_filters,padding='valid',kernel_size=2,activation='relu')
    self.pool_2 = layers.GlobalMaxPooling1D()

    self.fourgram = layers.Conv1D(filters=nb_filters,padding='valid',kernel_size=2,activation='relu')
    self.pool_3 = layers.GlobalMaxPooling1D()

    self.dense_1 = layers.Dense(units=FFN_units, activation='relu')
    self.dropout = layers.Dropout(rate=dropout_rate)

    if nb_classes == 2:
      self.last_dense = layers.Dense(units=2, activation='sigmoid')
    else:
      self.last_dense = layers.Dense(units=nb_classes, activation='softmax')
    
    
  def call(self,inputs,training):
    x = self.embedding(inputs)
    x_1 = self.bigram(x)
    x_1 = self.pool_1(x_1)
    x_2 = self.trigram(x)
    x_2 = self.pool_2(x_2)
    x_3 = self.fourgram(x)
    x_3 = self.pool_3(x_3)

    # shape: (batch_size, 3*nb_filters)
    merged = tf.concat([x_1,x_2,x_3], axis=-1) 
    merged = self.dense_1(merged)
    merged = self.dropout(merged, training)
    output = self.last_dense(merged)

    return output





## Config

In [0]:
# parameters for model
VOCAB_SIZE = tokenizer.vocab_size
EMB_DIM=200
NB_FILTERS = 100
FFN_UNITS =256
NB_CLASSES = len(set(y_train))
DROUPOUT_RATE = 0.2
BATCH_SIZE =32
NB_EPOCHS = 5


## Training

In [0]:
Dcnn = DCNN(vocab_size = VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROUPOUT_RATE)


In [0]:
if NB_CLASSES == 2:
  Dcnn.compile(loss='binary_crossentropy',
               optimizer='adam',
               metrics=["accuracy"])
else:
    Dcnn.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=["sparse_categorical_accuracy"])

In [0]:
# save the trained model using checkpoints
checkpoint_path = "/content/drive/My Drive/Projects/COLAB/CNN_For_NLP/checkpoints"
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)
ckpt_manager = tf.train.CheckpointManager(ckpt,checkpoint_path,max_to_keep=1)
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print("latest checkpoint restored")

In [113]:
Dcnn.fit(X_train, y_train,batch_size=BATCH_SIZE,epochs=NB_EPOCHS)
ckpt_manager.save()

ValueError: ignored