In [1]:
import bz2
from tqdm import tqdm
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from joblib import Parallel, delayed
import joblib

### Extract bz2 files

In [2]:

def decompress_bz2(src_path, dst_path):
    """Stream the UTF-8 text stored in a bz2 archive into a plain text file.

    Args:
        src_path: path of the ``.bz2`` archive to read.
        dst_path: path of the text file to write; overwritten if it exists.

    The archive is copied line by line, so the whole file is never held in
    memory at once.
    """
    with bz2.open(src_path, 'rt', encoding='utf-8') as compressed_file, \
            open(dst_path, 'w', encoding='utf-8') as output_file:
        for line in compressed_file:
            output_file.write(line)


path_train = './data/train.ft.txt'
path_test = './data/test.ft.txt'

# Both splits are extracted the same way: '<path>.bz2' -> '<path>'.
for split_name, txt_path in (('Train', path_train), ('Test', path_test)):
    decompress_bz2(txt_path + '.bz2', txt_path)
    print(f'{split_name} data export successful.')

Train data export successful.
Test data export successful.


### Check txt files

In [3]:
# Number of leading lines of the extracted training file to preview
num_lines = 5  # Change this number as needed

# Read with the same UTF-8 encoding the file was written with; relying on the
# platform default encoding can fail or garble text on non-UTF-8 locales.
with open(path_train, 'r', encoding='utf-8') as file:
    for i, line in enumerate(file):
        if i >= num_lines:
            break
        # strip() drops surrounding whitespace, including the trailing newline
        print(line.strip())

__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
__label__2 The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.
__label__2 Amazing!: This soundtrack is m

In [4]:
LABEL_PREFIX = '__label__'


def parse_fasttext_line(line):
    """Split one ``__label__<c><text>`` line into ``(label, text)``.

    The label is the single character right after the prefix. The text is
    everything after that character, so it keeps the separating space and the
    trailing newline.

    Raises:
        IndexError: if ``line`` does not contain the label prefix.
    """
    # maxsplit=1: only the first marker delimits the label, so a review that
    # itself contains "__label__" is no longer cut short.
    payload = line.split(LABEL_PREFIX, 1)[1]
    return payload[0], payload[1:]


def load_fasttext_file(path):
    """Read a UTF-8 fastText-style file and return ``(texts, labels)`` lists."""
    with open(path, 'r', encoding='utf-8') as file:
        lines = file.readlines()  # a list, so tqdm can show a total
    texts, labels = [], []
    for line in tqdm(lines):
        label, text = parse_fasttext_line(line)  # split once per line
        texts.append(text)
        labels.append(label)
    return texts, labels


train, train_label = load_fasttext_file(path_train)
test, test_label = load_fasttext_file(path_test)

100%|██████████| 3600000/3600000 [00:04<00:00, 839233.65it/s]
100%|██████████| 400000/400000 [00:00<00:00, 843153.29it/s]


In [5]:
def clean_text(text):
    """Normalise a raw review string.

    Steps, in order:
      1. drop every character that is neither an ASCII letter nor whitespace
         (digits and punctuation are deleted, not replaced by a space);
      2. collapse each run of whitespace into a single space;
      3. lowercase the result.

    Leading and trailing whitespace is collapsed to one space, not stripped.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.lower()

In [6]:
# Sanity check: report the size of each text list and of its label list.
for caption, items in (
    ('Train Length', train),
    ('Train Label Length', train_label),
    ('Test Length', test),
    ('Test Label Length', test_label),
):
    print(caption, len(items))

Train Length 3600000
Train Label Length 3600000
Test Length 400000
Test Label Length 400000


In [7]:
# Display the first training example as a (label, raw text) pair.
train_label[0],train[0]

('2',
 ' Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\n')

In [8]:
# Display the first training example with clean_text applied to its text.
train_label[0],clean_text(train[0])

('2',
 ' stuning even for the nongamer this sound track was beautiful it paints the senery in your mind so well i would recomend it even to people who hate vid game music i have played the game chrono cross but out of all of the games i have ever played it has the best music it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras it would impress anyone who cares to listen ')

In [9]:
def _clean_corpus(texts):
    """Return ``texts`` as a pandas Series with ``clean_text`` applied to every item."""
    return pd.DataFrame(texts)[0].apply(clean_text)


# NOTE(review): this rebinds `train`/`test` from the raw texts to Series of
# cleaned text; later cells see the cleaned version only.
train, test = _clean_corpus(train), _clean_corpus(test)
train.head()

0     stuning even for the nongamer this sound trac...
1     the best soundtrack ever to anything im readi...
2     amazing this soundtrack is my favorite music ...
3     excellent soundtrack i truly like this soundt...
4     remember pull your jaw off the floor after he...
Name: 0, dtype: object

In [26]:
# English stopword set, kept at module level so remove_stopwords can reuse it.
# NOTE(review): this rebinds the name `stopwords`, which the import cell bound
# to the nltk.corpus.stopwords module.
stopwords = set(nltk.corpus.stopwords.words('english'))


def remove_stopwords(series):
    """Tokenize every text in ``series`` and drop the English stopwords.

    Args:
        series: pandas Series of strings.

    Returns:
        A Series of the same length in which each text is rebuilt from its
        remaining tokens joined by single spaces.
    """
    def _drop_stopwords(text):
        kept = [word for word in nltk.word_tokenize(text) if word.lower() not in stopwords]
        return ' '.join(kept)

    return series.apply(_drop_stopwords)

In [30]:
# tokenize and remove stopwords in parallel
def parallelize_tokenizer(df, func, n_cores=4):
    """Apply ``func`` to ``n_cores`` contiguous chunks of ``df`` in parallel.

    Args:
        df: pandas Series or DataFrame to process.
        func: callable that takes one chunk and returns a pandas object.
        n_cores: number of chunks and of joblib workers (default 4).

    Returns:
        The per-chunk results concatenated in the original row order.

    Raises:
        ValueError: if ``n_cores`` is smaller than 1.
    """
    if n_cores < 1:
        raise ValueError("number sections must be larger than 0.")

    # Slice positionally with iloc instead of np.array_split: on a pandas object
    # array_split goes through the deprecated pandas `swapaxes` (FutureWarning
    # on pandas >= 2.1). Chunk sizes match array_split: the first `extra`
    # chunks get one additional row.
    base, extra = divmod(len(df), n_cores)
    df_split = []
    start = 0
    for position in range(n_cores):
        stop = start + base + (1 if position < extra else 0)
        df_split.append(df.iloc[start:stop])
        start = stop

    pool = Parallel(n_jobs=n_cores)
    return pd.concat(pool(delayed(func)(chunk) for chunk in df_split))

# One chunk and one worker per CPU reported by joblib.
num_cores = joblib.cpu_count()
train_token, test_token = (
    parallelize_tokenizer(corpus, remove_stopwords, num_cores)
    for corpus in (train, test)
)


In [11]:
from transformers import BertTokenizer

# Load the tokenizer (vocabulary) of the pre-trained 'bert-base-uncased' model.
# NOTE(review): the name `tokenizer` is rebound to a different tokenizer object
# in the Modelling section, so the cells must be run in order.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and pad sequences
def tokenize_and_pad(text, max_len):
    """Encode ``text`` with the module-level ``tokenizer`` to a fixed length.

    Args:
        text: the string to encode.
        max_len: target length; longer inputs are truncated and shorter ones
            are padded up to it (special tokens are added).

    Returns:
        A ``(input_ids, attention_mask)`` tuple taken from the encoding, which
        is requested as PyTorch tensors (``return_tensors='pt'``).
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    encoding = tokenizer.encode_plus(text, **encode_options)
    return encoding["input_ids"], encoding["attention_mask"]

max_len = 512


def _encode_corpus(texts):
    """Encode every text in ``texts`` to ``max_len`` tokens via ``tokenize_and_pad``."""
    return [tokenize_and_pad(doc, max_len) for doc in texts]


# NOTE(review): every text is encoded one at a time and padded to 512 tokens;
# on the full corpus this is slow and memory-hungry, and neither result is used
# by any later visible cell.
train_tokenized = _encode_corpus(train)
test_tokenized = _encode_corpus(test)

In [None]:
# vectorize


In [13]:
def _labelled_frame(texts, labels):
    """Build a two-column frame: ``text`` and ``label``."""
    return pd.DataFrame({"text": texts, "label": labels})


df_train = _labelled_frame(train, train_label)
df_test = _labelled_frame(test, test_label)

In [15]:
# Persist both frames as parquet files under ./data.
for frame, parquet_path in (
    (df_train, './data/train.parquet'),
    (df_test, './data/test.parquet'),
):
    frame.to_parquet(parquet_path)

# Modelling

In [11]:
# Vocabulary cap for the tokenizer and fixed sequence length used for padding.
voc_size, max_length = 20000, 100

# NOTE(review): `Tokenizer` is not imported in any visible cell - presumably the
# Keras text Tokenizer; confirm and add the import to the import cell.
# This also rebinds `tokenizer`, previously the BERT tokenizer.
tokenizer = Tokenizer(num_words=voc_size)
tokenizer.fit_on_texts(train)
word_index = tokenizer.word_index

In [12]:
# Convert each text to a sequence of word indices, then pad to max_length.
# NOTE(review): `train`/`test` are rebound from text to padded arrays, so this
# cell cannot be re-run without first re-running the cleaning cell.
# `pad_sequences` is not imported in any visible cell - confirm the import.
train = pad_sequences(tokenizer.texts_to_sequences(train), maxlen=max_length)
test = pad_sequences(tokenizer.texts_to_sequences(test), maxlen=max_length)

In [13]:
# Binary targets: label '2' becomes 1, every other label becomes 0.
train_lab = np.array([int(tag == '2') for tag in train_label])
test_lab = np.array([int(tag == '2') for tag in test_label])

In [14]:
# Embedding -> LSTM (full sequence) -> spatial dropout -> LSTM -> sigmoid unit.
# NOTE(review): Sequential and the layer classes are not imported in any
# visible cell - presumably Keras; confirm and add the imports.
model = Sequential([
    Embedding(input_dim=voc_size, output_dim=64, input_length=max_length),
    LSTM(units=32, return_sequences=True),  # keeps one output per timestep for the next LSTM
    SpatialDropout1D(rate=0.2),
    LSTM(units=32),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 64)           1280000   
                                                                 
 lstm (LSTM)                 (None, 100, 32)           12416     
                                                                 
 spatial_dropout1d (Spatial  (None, 100, 32)           0         
 Dropout1D)                                                      
                                                                 
 lstm_1 (LSTM)               (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 1300769 (4.96 MB)
Trainable params: 1300769 (4.96 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

In [15]:
# Keep only the best model seen so far (save_best_only=True).
# The path was ".data/my_keras_model.h5": a typo that pointed at a hidden
# `.data` directory instead of the `./data` folder used by every other artifact.
# NOTE(review): `ModelCheckpoint` is not imported in any visible cell - confirm.
checkpoint_cb = ModelCheckpoint("./data/my_keras_model.h5", save_best_only=True)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# The last 10% of the training data is held out for validation.
history = model.fit(train, train_lab, epochs=2,
                    validation_split=.1,
                    callbacks=[checkpoint_cb])

Epoch 1/2
  5509/101250 [>.............................] - ETA: 1:13:18 - loss: 0.2745 - accuracy: 0.8868

KeyboardInterrupt: 

In [None]:
# Loss and accuracy on the held-out test split, in the order set by compile().
loss, accuracy = model.evaluate(test, test_lab)
for caption, value in (("Loss:", loss), ("Accuracy:", accuracy)):
    print(caption, value)

In [None]:
# Display the metrics recorded by model.fit as a table.
pd.DataFrame(history.history)

In [None]:
# Model outputs for the test split, thresholded at 0.5 into hard 0/1 predictions.
prediction = model.predict(test)
above_threshold = prediction >= .5
y_pred = np.where(above_threshold, 1, 0)

# Actual and predicted labels side by side; columns are assigned one at a time.
df = pd.DataFrame()
df['actual'] = test_lab
df['pred'] = y_pred

In [None]:
# Confusion matrix of actual (first argument) versus predicted labels.
# NOTE(review): `confusion_matrix` and `sns` are not imported in any visible
# cell - presumably scikit-learn and seaborn; confirm and add the imports.
CM = confusion_matrix(test_lab, y_pred)

# `center=True` was dropped: `center` expects the numeric value at which to
# centre the colormap, and True was treated as the number 1, which skews the
# colour scale for count data.
ax = sns.heatmap(CM, annot=True, fmt='g', cmap='Blues')
ax.set(xlabel='Predicted label', ylabel='Actual label', title='Confusion matrix')
CM

In [None]:
# Text report of per-class metrics for the test predictions.
# NOTE(review): `classification_report` is not imported in any visible cell -
# presumably scikit-learn; confirm and add the import.
ClassificationReport = classification_report(y_true=test_lab, y_pred=y_pred)
print('Classification Report is : ', ClassificationReport)