In [1]:
!curl --header "Host: storage.googleapis.com" --header "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.79 Safari/537.36" --header "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" --header "Accept-Language: en-GB,en-US;q=0.9,en;q=0.8" --header "Referer: https://www.kaggle.com/" "https://storage.googleapis.com/kaggle-data-sets/19426/25246/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20220802%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20220802T045418Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=799a354a79438738ee029cd982685432b434d098a304f558d0ffef95a8d446169d3adc6570276c350742527a34a46dc20fbbe3e788353115fec117c92887930632f214a0fd4210f59fd402a47f00b2e449d39b7114f62b7dfdf1e60b6dfc318623866c25b6a6a168b62de2770f031a238ab2c447a74d640470bf6f86b6962eaa8c1d8a09479c5730c7b83e0b359113954a30fcb36d433db6cbf3f9307f2642cd0343418c89c3fe2369860eb79d196f50afff0fed81328c7e1ab182248b9b6f48f362de497dedd2d15c4a6f1dab6d147a1bf207baba371bb4d41e12dc8de30989803fb2ba9524406772c3b1c4e8cf4335c7825ae448293e0093197dc90ea9e6df" -L -o "archive.zip"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1264M  100 1264M    0     0   180M      0  0:00:07  0:00:07 --:--:--  176M


In [2]:
!unzip archive.zip

Archive:  archive.zip
replace econbiz.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: econbiz.csv             
replace pubmed.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: pubmed.csv              y



In [3]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from keras.layers import Dense, Activation, Dropout, BatchNormalization,Embedding
from keras.models import Sequential
#from keras.optimizers import Adam
from keras.layers import Conv1D, GlobalMaxPooling1D
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
#Load econbiz dataset (use only a portion to see pre-processing part)
data = pd.read_csv("econbiz.csv")
data = data.iloc[:100000]
data = data[['id','title','labels']]

In [5]:
#Separate labels (currently separeted by tab space) and this is needed for the multilabelbinarizer
data['labels'] = data['labels'].str.split()

In [6]:
#Pre-process title data
def process(title):

    stop_words = set(stopwords.words('english')) #Stop words to remove
    new_df = re.sub('[^a-zA-Z]', ' ', title) #Removes numbers
    new_df = re.sub(r"\s+[a-zA-Z]\s+", ' ', new_df) # Removes single characters
    new_df = re.sub(r'\s+', ' ', new_df) # Removes multiple spaces
    tokens = word_tokenize(new_df) #Tokenize the title sample for checking stopwords
    new_df = [token for token in tokens if not token in stop_words]


    return new_df

In [7]:
#Preprocess title column
X = []
new_df = list(data['title'])

for idx,title in enumerate(new_df):
  X.append(process(title))

#Preprocess labels to be a binary representation
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(data.labels)
labels = multilabel_binarizer.classes_
y = multilabel_binarizer.transform(data.labels)

In [8]:
#Divide dataset into training and validation set (80/20)
#To this point, the title column still needs to be vectorize
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

In [9]:
#Initialize tokenizer from keras that will vectorize title values
tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(x_train)

In [10]:
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

vocabulary_size = len(tokenizer.word_index) + 1
x_train = pad_sequences(x_train, padding= 'post',maxlen = 51)
x_test = pad_sequences(x_test,padding='post', maxlen = 51)
print('Pad sequences (samples x time)')
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (80000, 51)
x_test shape: (20000, 51)


In [11]:
#Got the model structure and parameters from https://keras.io/examples/imdb_cnn/ which is an example of using CNN for text classification
# set parameters:
max_features = vocabulary_size
maxlen = 51
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 300
epochs = 5

print('Build model...')
model = Sequential()

model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.5))
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(y_train.shape[1]))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))




Build model...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f20fdf8d810>

In [None]:
probs = np.arange(0.05,1.0,0.05)
scores = []

for prob in probs:
  preds = model.predict(x_test)
  preds[preds>=prob] = 1
  preds[preds<prob] = 0
  scores.append(tuple((f1_score(y_test, preds, average="samples"),prob)))
  print((f1_score(y_test, preds, average="samples"),prob))


(0.11778730099416852, 0.05)
(0.1825840305749914, 0.1)
(0.2061008597658147, 0.15000000000000002)
(0.2021330298811208, 0.2)
(0.18104751101366814, 0.25)
(0.15295772772903948, 0.3)
(0.12615350527525448, 0.35000000000000003)
(0.1007298626210391, 0.4)
(0.07821868185735834, 0.45)
(0.057974582623259094, 0.5)
(0.041670245016421484, 0.55)
(0.029764802403478872, 0.6000000000000001)
(0.022339221872898343, 0.6500000000000001)
(0.01606924963924964, 0.7000000000000001)
(0.011276699134199134, 0.7500000000000001)
