In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical




In [2]:
# Hyper-parameters shared by the tokenizer, padding and model cells below.
vocab_size = 10_000  # passed as Tokenizer(num_words=...) and as the Embedding input dimension
max_length = 200     # fixed sequence length used by pad_sequences and the Embedding layer

In [3]:
# Load the news dataset from a CSV; the path is relative to the current working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably a saved
# index — consider index_col=0 if that column is not needed.
data=pd.read_csv("./bbc_news_mixed.csv")

In [4]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,text,label
0,Cairn shares slump on oil setback\n\nShares in...,business
1,Egypt to sell off state-owned bank\n\nThe Egyp...,business
2,Cairn shares up on new oil find\n\nShares in C...,business
3,Low-cost airlines hit Eurotunnel\n\nChannel Tu...,business
4,"Parmalat to return to stockmarket\n\nParmalat,...",business


In [5]:
# Count how many rows carry each label to check the class balance.
data['label'].value_counts()

label
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64

In [6]:
# Display the full text of the first row.
data['text'].iloc[0]



In [7]:
# Create the Keras tokenizer, capping the vocabulary via num_words=vocab_size.
token=Tokenizer(num_words=vocab_size)

In [8]:
# Build the tokenizer's word index from the text column.
# NOTE(review): this is fitted on the full dataset, before the train/test split
# performed further down, so held-out texts contribute to the vocabulary.
token.fit_on_texts(data['text'])

In [9]:
# Convert every text into a list of integer token ids using the fitted tokenizer.
seq=token.texts_to_sequences(data['text'])

In [10]:
# Bring every sequence to exactly max_length entries: shorter ones are padded
# at the end, longer ones are cut at the end ('post' for both).
pad_seq = pad_sequences(
    seq,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

In [11]:
# Text classifier: token embedding -> single LSTM layer -> 5-unit softmax output.
model = Sequential([
    Embedding(vocab_size, 100, input_length=max_length),  # 100-dimensional embedding per token id
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(5, activation='softmax'),  # one output unit per label
])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 100)          1000000   
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dense (Dense)               (None, 5)                 325       
                                                                 
Total params: 1042565 (3.98 MB)
Trainable params: 1042565 (3.98 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [22]:
# Label encoder used for the targets. The integer-coding LabelEncoder
# alternative is left disabled; OneHotEncoder is the one actually used.
# NOTE(review): with default arguments OneHotEncoder returns a sparse matrix.
# encoder=LabelEncoder()
encoder=OneHotEncoder()

In [24]:
# Fit the encoder on the label column and transform it in one step.
# Double brackets select a one-column DataFrame (2-D input) rather than a Series.
y=encoder.fit_transform(data[['label']])

In [None]:
# `y` is the OneHotEncoder output, so it is already one-hot encoded (and a
# scipy sparse matrix with the encoder's default settings). Running it through
# to_categorical would treat each 0/1 entry as a class id and add a spurious
# extra axis, so just densify it into the (n_samples, n_classes) array that the
# softmax head and categorical_crossentropy expect.
categorical_labels = np.asarray(
    y.toarray() if hasattr(y, "toarray") else y,  # tolerate an encoder configured for dense output
    dtype="float32",
)

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Cast the padded token-id matrix to float32 before it is split and fed to the model.
# NOTE(review): the values are integer token ids consumed by an Embedding layer —
# confirm this float cast is needed; it also rebinds pad_seq under the same name.
pad_seq=pad_seq.astype('float32')

In [28]:
# Hold out 20% of the samples for testing.
# random_state pins the shuffle so a fresh Restart & Run All reproduces the
# same partition; stratify keeps the label proportions equal in both parts
# (the classes are not perfectly balanced).
xtrain, xtest, ytrain, ytest = train_test_split(
    pad_seq,
    categorical_labels,
    train_size=0.8,
    random_state=42,
    stratify=categorical_labels,
)

In [29]:
# Configure training: Adam optimizer, categorical cross-entropy loss
# (targets are one-hot vectors), and accuracy as the reported metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [30]:
# Train for 10 epochs. The held-out split is passed as validation data so each
# epoch also reports loss/accuracy on samples the model was not trained on;
# without it the notebook never measures generalization at all.
model.fit(xtrain, ytrain, epochs=10, validation_data=(xtest, ytest))

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x25958f1aa50>

In [31]:
# Persist the trained model to 'bbcmodel.h5' in the working directory.
# NOTE(review): the .h5 extension selects the HDF5 format, presumably the source
# of the saving_api warning shown below — confirm before changing the file name.
model.save('bbcmodel.h5')

  saving_api.save_model(


In [32]:
import joblib

In [34]:
# Serialize the fitted label encoder and the fitted tokenizer to disk with joblib.
joblib.dump(encoder,'encoder.pkl')
joblib.dump(token,'tokenizer.pkl')

['tokenizer.pkl']

In [35]:
# Preprocessing settings that are written out next to the saved model.
config = {
    'vocab_size': vocab_size,
    'pad_length': max_length,
}

In [40]:
# Write the config dict's string representation to config.txt.
# The context manager closes (and therefore flushes) the file even if the write
# fails; the original left the handle open for the lifetime of the kernel.
with open('config.txt', 'w') as file:
    file.write(str(config))

40