### 1. Import the libraries
As the first step, we need to import the required libraries.

In [3]:
import pandas as pd
import numpy as np

### 2. Load the dataset
Load the dataset.

In [4]:
# Read the CSV (path is relative to the notebook's working directory) and preview the first rows.
df = pd.read_csv('../data/text-classification.csv')
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [5]:
df.shape

(2225, 2)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2225 non-null   object
 1   text      2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


### 3. Exploratory Data Analysis

In [7]:
from collections import Counter

def countWord(list_of_words):
    """Count whitespace-separated tokens across an iterable of strings.

    Args:
        list_of_words: Iterable of strings (e.g. a pandas Series of texts).

    Returns:
        collections.Counter mapping each token to its number of occurrences.
    """
    return Counter(
        token
        for entry in list_of_words
        for token in entry.split()
    )

In [8]:
# countWord splits on whitespace; the category labels shown are single words,
# so the result is the number of rows per category.
countWord(df['category'])

Counter({'tech': 401,
         'business': 510,
         'sport': 511,
         'entertainment': 386,
         'politics': 417})

In [9]:
# Token frequencies over the text column; display the five most frequent tokens.
counter = countWord(df['text'])
counter.most_common(5)

[('the', 52567), ('to', 24955), ('of', 19947), ('and', 18561), ('a', 18251)]

### 4. Pre-processing the data
The actual data must meet certain conditions before being sent to the model. We will create a `pipeline`: a multi-level system where each level receives its data from the previous level and sends its results to the next level.

#### 4.1 Transforming the data
We transform the `textual categories` into `index values`.

In [10]:
def category_transforming(df):
    """Build label <-> index lookup tables for the ``category`` column.

    Args:
        df: DataFrame with a ``category`` column.

    Returns:
        Tuple ``(category_mapper, category_inv_mapper)``: the first maps each
        distinct category label (in ``np.unique`` order, i.e. sorted) to an
        integer index, the second maps the index back to the label.
    """
    labels = np.unique(df["category"])
    indices = list(range(df['category'].nunique()))

    forward = dict(zip(labels, indices))
    backward = dict(zip(indices, labels))
    return forward, backward

In [11]:
# Build the label -> index mapping and its inverse from the loaded DataFrame.
category_mapper, category_inv_mapper = category_transforming(df)

In [12]:
# Look up the integer index of every row's category and store it as a new column.
category_ind = [category_mapper[i] for i in df['category']]
df['category_ind'] = category_ind
df.head()

Unnamed: 0,category,text,category_ind
0,tech,tv future in the hands of viewers with home th...,4
1,business,worldcom boss left books alone former worldc...,0
2,sport,tigers wary of farrell gamble leicester say ...,3
3,sport,yeading face newcastle in fa cup premiership s...,3
4,entertainment,ocean s twelve raids box office ocean s twelve...,1


Alternatively, we can use `LabelEncoder` from `scikit-learn`:

In [13]:
from sklearn.preprocessing import LabelEncoder

def category_transforming(list_of_categories):
    """Encode category labels as integer indices with ``LabelEncoder``.

    Note: the encoder is fitted on the notebook-level ``df['category']``
    column, not on the argument, and is then applied to ``list_of_categories``.
    This definition replaces the earlier ``category_transforming(df)`` in the
    notebook namespace.

    Args:
        list_of_categories: Category labels to encode.

    Returns:
        The result of ``LabelEncoder.transform`` for ``list_of_categories``.
    """
    # fit() returns the fitted encoder, so the call can be chained.
    encoder = LabelEncoder().fit(df['category'])
    return encoder.transform(list_of_categories)

In [14]:
# Encode the category labels and (re)assign the `category_ind` column with the result.
category_ind = category_transforming(df['category'])
df['category_ind'] = category_ind
df.head()

Unnamed: 0,category,text,category_ind
0,tech,tv future in the hands of viewers with home th...,4
1,business,worldcom boss left books alone former worldc...,0
2,sport,tigers wary of farrell gamble leicester say ...,3
3,sport,yeading face newcastle in fa cup premiership s...,3
4,entertainment,ocean s twelve raids box office ocean s twelve...,1


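#### 4.2 Splitting the Data

The train/validation split is performed further below, after stop-word removal (see "Splitting the Dataset").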

#### 4.3  NLP Pipeline

In [15]:
import nltk

#### Step 1 - Remove URLs

In [16]:
import re

def remove_links(text):
    """Strip URLs from ``text``.

    Any run of non-whitespace characters starting with ``http://``,
    ``https://`` or ``www.`` is replaced by the empty string; surrounding
    whitespace is left untouched.

    Args:
        text: Input string.

    Returns:
        ``text`` with every matched URL removed.
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)

In [17]:
# Preview only: build a cleaned copy of the texts (df is not modified) and show the first one.
corpus = [remove_links(sentence) for sentence in df['text']]
corpus[0]

'tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essentially  the technology allows for much more personalised tv. they are also being built-in to high

In [18]:
# Apply the step for real: the `text` column is overwritten with the URL-free version.
df['text'] = df['text'].map(remove_links)
df['text'].head()

0    tv future in the hands of viewers with home th...
1    worldcom boss  left books alone  former worldc...
2    tigers wary of farrell  gamble  leicester say ...
3    yeading face newcastle in fa cup premiership s...
4    ocean s twelve raids box office ocean s twelve...
Name: text, dtype: object

#### Step 2 - Remove Punctuation

In [19]:
import string

def remove_punctuations(text):
    """Lower-case ``text`` and delete every ASCII punctuation character.

    The characters removed are exactly those in ``string.punctuation``;
    they are dropped, not replaced by a space.

    Args:
        text: Input string.

    Returns:
        The lower-cased string without punctuation.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    lowered = text.lower()
    return lowered.translate(deletion_table)

In [20]:
# Preview only: build a cleaned copy of the texts (df is not modified) and show the first one.
corpus = [remove_punctuations(sentence) for sentence in df['text']]
corpus[0]

'tv future in the hands of viewers with home theatre systems  plasma highdefinition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices  one of the most talkedabout technologies of ces has been digital and personal video recorders dvr and pvr these settop boxes  like the us s tivo and the uk s sky system  allow people to record  store  play  pause and forward wind tv programmes when they want  essentially  the technology allows for much more personalised tv they are also being builtin to highdefinition tv

In [21]:
# Apply the step for real: the `text` column is overwritten with the lower-cased, punctuation-free version.
df['text'] = df['text'].map(remove_punctuations)
df['text'].head()

0    tv future in the hands of viewers with home th...
1    worldcom boss  left books alone  former worldc...
2    tigers wary of farrell  gamble  leicester say ...
3    yeading face newcastle in fa cup premiership s...
4    ocean s twelve raids box office ocean s twelve...
Name: text, dtype: object

In [38]:
df.head()

Unnamed: 0,category,text,category_ind
0,tech,tv future hands viewers home theatre systems p...,4
1,business,worldcom boss left books alone former worldcom...,0
2,sport,tigers wary farrell gamble leicester say rushe...,3
3,sport,yeading face newcastle fa cup premiership side...,3
4,entertainment,ocean twelve raids box office ocean twelve cri...,1


#### Step 3 - Stop words

In [22]:
# nltk.download('stopwords')
from nltk.corpus import stopwords

def removing_stop_words(text):
    """Remove English stop words from ``text``.

    The text is split on whitespace, every token found in NLTK's English
    stop-word list is dropped, and the remaining tokens are re-joined with
    single spaces. The comparison is case-sensitive.

    Args:
        text: Input string.

    Returns:
        The filtered text as a single space-separated string.
    """
    # stopwords.words() returns a list; a set makes each membership test
    # constant-time instead of a linear scan per token.
    stop = set(stopwords.words("english"))
    filtered_words = [word for word in text.split() if word not in stop]

    return " ".join(filtered_words)

In [23]:
# Overwrite the `text` column with the stop-word-free version and preview it.
df['text'] = df['text'].map(removing_stop_words)
df['text'].head()

0    tv future hands viewers home theatre systems p...
1    worldcom boss left books alone former worldcom...
2    tigers wary farrell gamble leicester say rushe...
3    yeading face newcastle fa cup premiership side...
4    ocean twelve raids box office ocean twelve cri...
Name: text, dtype: object

#### Step 3 - Splitting the Dataset

In [None]:
# Number of rows in the training part: 80% of the dataset, truncated to an integer.
split_size = int(df.shape[0] * 0.8)

In [34]:
# Positional split without shuffling: the first `split_size` rows are used for
# training, the remaining rows for validation.
df_train = df[:split_size]
df_val = df[split_size:]

In [39]:
# Extract the texts and their integer labels as NumPy arrays for both splits.
train_text_to_array = df_train['text'].to_numpy()
val_text_to_array = df_val['text'].to_numpy()
train_category_ind_to_array = df_train['category_ind'].to_numpy()
val_category_ind_to_array = df_val['category_ind'].to_numpy()

In [40]:
train_text_to_array.shape, val_text_to_array.shape

((1780,), (445,))

#### Step 4 - Tokenization, Stemming, Lemmatization 

In [25]:
from keras.preprocessing.text import Tokenizer

In [26]:
# nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

def tokenize(sentence):
    """Split ``sentence`` into tokens using ``nltk.word_tokenize``.

    Args:
        sentence: Input string.

    Returns:
        The token list produced by ``nltk.word_tokenize``.
    """
    tokens = nltk.word_tokenize(sentence)
    return tokens

def stem(word):
    """Return the Porter stem of ``word``.

    A new ``PorterStemmer`` is created on every call.

    Args:
        word: A single token.

    Returns:
        The stemmed token.
    """
    return PorterStemmer().stem(word)

def lemmatize(word):
    """Return the WordNet lemma of ``word``.

    A new ``WordNetLemmatizer`` is created on every call and invoked with its
    default arguments.

    Args:
        word: A single token.

    Returns:
        The lemmatized token.
    """
    return WordNetLemmatizer().lemmatize(word)

In [27]:
# Tokenize the text of the row labelled 0 and display the tokens.
all_words = tokenize(df['text'][0])
all_words

['tv',
 'future',
 'hands',
 'viewers',
 'home',
 'theatre',
 'systems',
 'plasma',
 'highdefinition',
 'tvs',
 'digital',
 'video',
 'recorders',
 'moving',
 'living',
 'room',
 'way',
 'people',
 'watch',
 'tv',
 'radically',
 'different',
 'five',
 'years',
 'time',
 'according',
 'expert',
 'panel',
 'gathered',
 'annual',
 'consumer',
 'electronics',
 'show',
 'las',
 'vegas',
 'discuss',
 'new',
 'technologies',
 'impact',
 'one',
 'favourite',
 'pastimes',
 'us',
 'leading',
 'trend',
 'programmes',
 'content',
 'delivered',
 'viewers',
 'via',
 'home',
 'networks',
 'cable',
 'satellite',
 'telecoms',
 'companies',
 'broadband',
 'service',
 'providers',
 'front',
 'rooms',
 'portable',
 'devices',
 'one',
 'talkedabout',
 'technologies',
 'ces',
 'digital',
 'personal',
 'video',
 'recorders',
 'dvr',
 'pvr',
 'settop',
 'boxes',
 'like',
 'us',
 'tivo',
 'uk',
 'sky',
 'system',
 'allow',
 'people',
 'record',
 'store',
 'play',
 'pause',
 'forward',
 'wind',
 'tv',
 'program

In [28]:
# Three inflections of the same verb, used to compare stemming with lemmatization.
a = ["organize", "organizes", "organizing"]
stemmed_words = [stem(w) for w in a]

In [29]:
stemmed_words

['organ', 'organ', 'organ']

In [30]:
# Lemmatize the same sample words with the default lemmatizer settings.
lemmatized_word = [lemmatize(w) for w in a]

In [31]:
lemmatized_word

['organize', 'organizes', 'organizing']