<a href="https://colab.research.google.com/github/knozdogan/crud-project/blob/master/nlp_cnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Text Classification with CNN**
****
Classification of disaster tweets (real disaster or not) with convolutional neural networks.

In [0]:
# !pip install tensorflow-datasets
# !pip install tf-nightly

In [0]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

In [0]:
# Read the training and test CSV files into DataFrames (same order as the paths).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    )
)
# Show the text of the row labelled 57 as the cell output.
train_data.loc[57, "text"]

*Data Preprocessing*

In [0]:
# Replace every URL-like token (`scheme://` followed by non-whitespace) with the
# literal word "http".
# The pattern is a raw string: it contains regex escapes (\w, \/, \S) that are
# not valid Python string escapes and trigger an "invalid escape sequence"
# warning when written in a plain string literal.
URL_PATTERN = r"(\w+:\/\/\S+)"
train_data["text"] = train_data["text"].str.replace(URL_PATTERN, "http", regex=True)
test_data["text"] = test_data["text"].str.replace(URL_PATTERN, "http", regex=True)

# remove non-ascii characters
def remove_non_ascii(text):
    """Return `text` with every character whose code point is 128 or above removed."""
    ascii_only = filter(lambda ch: ord(ch) < 128, text)
    return ''.join(ascii_only)
 
def _normalize_text(frame):
    """Return the cleaned `text` column of `frame`.

    Steps, in order:
      1. drop non-ASCII characters from the text (via `remove_non_ascii`),
      2. append " in <location>" (a missing location becomes an empty string),
      3. lowercase,
      4. remove punctuation (anything that is neither a word character nor whitespace),
      5. remove underscores (they count as word characters, so step 4 keeps them).

    Parameters:
        frame: DataFrame with `text` and `location` columns.

    Returns:
        A Series holding the cleaned text; `frame` itself is not modified.
    """
    text = frame["text"].apply(remove_non_ascii)

    # combine two information
    # NOTE(review): rows without a location still get the " in " suffix, and the
    # location is appended after the non-ASCII filter, so it is not filtered.
    # Kept as in the original; confirm this is intended.
    text = text + " in " + frame["location"].replace(np.nan, '', regex=True)

    # lowercase
    text = text.str.lower()

    # remove punctuations
    # `regex=True` must be explicit: pandas >= 2.0 defaults to literal matching,
    # which would silently leave all punctuation in place. The raw string avoids
    # the invalid "\w" / "\s" string-escape warning.
    text = text.str.replace(r'[^\w\s]', '', regex=True)

    # remove underscores (plain literal replacement)
    text = text.str.replace('_', '', regex=False)
    return text


train_data["text"] = _normalize_text(train_data)
test_data["text"] = _normalize_text(test_data)

*Input Pipeline*

In [0]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [0]:
BATCH_SIZE = 64

# Split the frame into inputs (tweet text) and labels. `pop` also removes both
# columns from the DataFrame in place.
x_train = train_data.pop("text")
y_train = train_data.pop("target")
dataset_train = tf.data.Dataset.from_tensor_slices((x_train.values, y_train.values))

# The text tokenizer/encoder classes were moved from `tfds.features.text` to
# `tfds.deprecated.text` in newer TensorFlow Datasets releases; use whichever
# location this installation provides.
try:
    tfds_text = tfds.deprecated.text
except AttributeError:
    tfds_text = tfds.features.text

# Collect the set of all tokens that occur in the training texts.
tokenizer = tfds_text.Tokenizer()
vocabulary_set = set()
for text_tensor, _ in dataset_train:
    vocabulary_set.update(tokenizer.tokenize(text_tensor.numpy()))

# NOTE(review): this is the number of distinct tokens only. The encoder may
# reserve additional ids (e.g. padding / out-of-vocabulary), so compare with
# `encoder.vocab_size` before sizing an embedding layer.
vocab_size = len(vocabulary_set)
encoder = tfds_text.TokenTextEncoder(vocabulary_set)

def encode(text_tensor, label):
    """Encode one example's text with the module-level `encoder`.

    Parameters:
        text_tensor: eager tensor whose `.numpy()` value is passed to `encoder.encode`.
        label: passed through unchanged.

    Returns:
        A `(encoded_text, label)` tuple.
    """
    raw_text = text_tensor.numpy()
    return encoder.encode(raw_text), label

def encode_map_fn(text, label):
    """Wrap `encode` in `tf.py_function` so it can be used with `Dataset.map`.

    Returns:
        `(encoded_text, label)` tensors, both requested as `tf.int64`, with
        their static shapes set explicitly.
    """
    outputs = tf.py_function(encode, inp=[text, label], Tout=(tf.int64, tf.int64))
    token_ids, target = outputs

    # py_function does not set the shape of the tensors it returns, and
    # `tf.data.Datasets` work best when every component has one: declare the
    # encoded text as a 1-D tensor of unknown length and the label as a scalar.
    token_ids.set_shape([None])
    target.set_shape([])
    return token_ids, target

# Encode every (text, label) example, then group the examples into padded
# batches of BATCH_SIZE.
train_data = dataset_train.map(encode_map_fn).padded_batch(BATCH_SIZE)