### NLP 

Usando datos de:

https://www.kaggle.com/pradeeptrical/text-tweet-classification


In [1]:
import pandas as pd
import numpy as np
df = pd.read_excel('text_classification_dataset.xlsx', index_col=0)


First we have to clean the dataset and tokenize it

In [2]:
df.reset_index(inplace=True)

Workflow que neceesitamos para usar NN en NLP

In [3]:
df.head()

Unnamed: 0,text,type
0,@ACNI2012 @TheToka920 Never knew having 1 or 2...,sports
1,"MYCA Magical Moments:\n\nSeptember, 2011: Sham...",sports
2,The current state of last year's @BBL finalist...,sports
3,@HOLLYJISOO Why did you bring a cricket...,sports
4,Babar Azam only Pakistani included in the ICC ...,sports


## Simple approach using BOW

In [4]:
import nltk
import string
from nltk.tokenize import word_tokenize
nltk.download('stopwords')

#Default stopwords that we would like to delete from our statements
from nltk.corpus import stopwords 

[nltk_data] Downloading package stopwords to /Users/tomas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
sw = stopwords.words('english')

In [6]:
sw.append('')
sw.append('tweet')
sw.append('tweets')
len(sw)

182

In [7]:
import re

def cleantxt(text):
    text = re.sub(r'@[A-Za-z0-9]+','',text) #quitar menciones
    text = re.sub(r'#','',text) #quitar hashtags
    text = re.sub(r'RT','',text) #quitar RT
    text = re.sub(r'https?:\/\/\S+','',text) #quitar links
    
    #El resto es para quitar strings que no tienen sentido, las encuentras mirando detenidamente el dataset
    
    text = re.sub(r'\n\n','',text)
    text = re.sub(r'\n','',text)
    text = re.sub(r':','',text)
    text = re.sub(r'[0-9]','',text)
    text = re.sub(r'_[A-Za-z0-9]+','',text)
    text = re.sub(r'\n[A-Za-z0-9]+','',text)
    text = re.sub(r'\'[A-Za-z0-9]+','',text)
    text = re.sub(r"'","",text)
    text = re.sub(r"...$","",text)
    text = re.sub(r"..$","",text)
    text = re.sub(r"....$","",text)
    text = re.sub(r"...$","",text)
    text = re.sub(r"`","",text)
    text = re.sub(r"-","",text)
    text = re.sub(r"!","",text)
    
    return text

In [8]:
# Limpiamos el texto
df['text'] = df['text'].apply(cleantxt)
df

Unnamed: 0,text,type
0,Never knew having or followers had anythin...,sports
1,"MYCA Magical MomentsSeptember, Sham Chotoo of...",sports
2,The current state of last year finalists P...,sports
3,Why did you bring,sports
4,Babar Azam only Pakistani included in the ICC ...,sports
...,...,...
1157,The senior is one of the most decorated male t...,sports
1158,COULD be your year to get moving and change t...,sports
1159,thought you liked yellow on me but that OK....,sports
1160,❤️ Tennis greats played together to raise mo...,sports


In [9]:
new_text = []
for index in range(df.shape[0]):

    text = df.text[index].lower().split(' ')
    for word in text:
        if word not in sw:
            if len(word)>3 and len(word)<9:
                new_text.append(word)



In [10]:
len(new_text)

7479

In [11]:
final_words = set(new_text)
len(final_words)

3615

In [12]:
dict_words ={}
for word in final_words:
    dict_words[word] = []

In [13]:
dict_words

{'chesney': [],
 'pick': [],
 'less': [],
 'sen.': [],
 'elbaum': [],
 'stealing': [],
 'japan': [],
 'nearly': [],
 'simply': [],
 'parnas,': [],
 'reason.': [],
 'y’all': [],
 '....📸': [],
 'going.': [],
 'patsy': [],
 'gang..??': [],
 'abuse.': [],
 'national': [],
 'grounds,': [],
 'angel': [],
 'things': [],
 'said,': [],
 'appear': [],
 'vice': [],
 'phillip': [],
 'general': [],
 'study': [],
 'smile': [],
 '"every': [],
 'wickets': [],
 'medvedev': [],
 'ahead': [],
 'tells': [],
 'disabled': [],
 'sleeping': [],
 'cover,': [],
 'flight': [],
 'teeth': [],
 'streamin': [],
 'bowling': [],
 'kang': [],
 'politic': [],
 'band': [],
 'accident': [],
 'lede': [],
 'british': [],
 'flawed': [],
 'told': [],
 'lack,': [],
 'happens': [],
 'teach': [],
 'ugly': [],
 'frank': [],
 'stay': [],
 'britain': [],
 'ridge': [],
 'bailout': [],
 'again.': [],
 'dragged': [],
 'bully': [],
 'dembele,': [],
 'gene': [],
 'college': [],
 'sexism': [],
 'sticker': [],
 'change?': [],
 'goin': [],

In [14]:
for index in range(df.shape[0]):
    sentence = df.text[index].lower().split(' ')
    for word in final_words:
        if word in sentence:
            n_times = sentence.count(word)
            dict_words[word].append(n_times)
        else:
            dict_words[word].append(0)

In [15]:
df_words = pd.DataFrame(dict_words)
df_words.shape

(1162, 3615)

Máximo del Dataframe para normalizar

In [16]:
max_df = df_words.to_numpy().max()
df_words = df_words/max_df

Dataframe a numpy array

In [17]:
X = df_words.to_numpy(float)
X.shape

(1162, 3615)

In [18]:
dict_rep = {'sports':1,
           'medical':2,
           'entertainment':3,
           'politics':4}
y = df.type
y = y.replace(dict_rep)
y = np.array(y).astype(int)
y[0]

1

In [19]:
from sklearn.model_selection import train_test_split

trainingX,testX,trainingy,testy = train_test_split(X,y, stratify=y)

In [20]:
print(trainingX.shape,testX.shape,trainingy.shape, testy.shape)

(871, 3615) (291, 3615) (871,) (291,)


Red Neuronal

In [21]:
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)


model = tf.keras.models.Sequential([
                                    tf.keras.layers.Dense(64, activation=tf.nn.relu),
                                    tf.keras.layers.Dense(5, activation=tf.nn.softmax)])

model.compile(optimizer = 'adam',
              loss = 'sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(trainingX, trainingy, epochs=6, verbose=1)

model.evaluate(testX, testy)




2.5.0
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


[1.069620132446289, 0.6907216310501099]