all credits to https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17.   I made minor changes: data from kaggle

# The Data

In [1]:
# get data file
! pip install -q kaggle

from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json



Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 65 bytes


In [2]:
! pwd

/content


In [3]:
!kaggle datasets download cfpb/us-consumer-finance-complaints

us-consumer-finance-complaints.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
!unzip us-consumer-finance-complaints.zip
!ls

Archive:  us-consumer-finance-complaints.zip
replace consumer_complaints.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/content'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
df = pd.read_csv('/content/consumer_complaints.csv')
df.info()

In [None]:
df.head()

In [None]:
df['product'].value_counts()

In [None]:
#Plotly notebook mode with google colaboratory
def configure_plotly_browser_state():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''))

In [None]:
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
configure_plotly_browser_state()


df['product'].value_counts().sort_values(ascending = False).iplot(kind='bar', yTitle = "Number of complaints",
                                                                title = 'Number complaints in each product')

In [None]:
def print_plot(index):
    example = df[df.index == index][["consumer_complaint_narrative",'product']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Product:', example[1])

print_plot(0)

# Text Pre-processing

In [None]:
import re
from nltk.corpus import stopwords
import nltk

df = df.reset_index( drop = True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
       text as string
       
       return: modified initial string
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub(' ', text)
    text = text.replace("x", '')
    text = " ".join(word for word in text.split() if word not in STOPWORDS)
    return text
df['consumer_complaint_narrative'] =  df['consumer_complaint_narrative'].astype(str)
df['consumer_complaint_narrative'] = df['consumer_complaint_narrative'].apply(clean_text)
df['consumer_complaint_narrative'] = df['consumer_complaint_narrative'].str.replace('\d+', '')

# LSTM Modeling

In [None]:
# LSTM Modeling
from keras.preprocessing.text import Tokenizer

# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['consumer_complaint_narrative'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
X = tokenizer.texts_to_sequences(df['consumer_complaint_narrative'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

In [None]:
#Converting categorical labels to numbers.
Y = pd.get_dummies(df['product']).values
print('Shape of label tensor:', Y.shape)


In [None]:
#Train test split.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.utils import to_categorical
X_train = to_categorical(X_train, 11)
Y_train = to_categorical(Y_train, 11)
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(11, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())


In [24]:
epochs = 5
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, 
                    batch_size=batch_size,validation_split=0.1,
                    callbacks=[EarlyStopping(monitor='val_loss',
                                             patience=3, min_delta=0.0001)])

Epoch 1/5


ValueError: ignored