In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Training split: the 'Title' column is the model input; every value of
# 'Class Index' is shifted down by one (presumably the CSV classes are
# 1-based -- confirm against the dataset) and kept as a plain list.
t = pd.read_csv('../input/agnews/train.csv')
train = t['Title']
label_train = (t['Class Index'] - 1).tolist()

# Test split: same columns and the same label shift, but the labels end up
# as a NumPy array rather than a list.
t = pd.read_csv('../input/agnews/test.csv')
test = t['Title']
label_test = np.array((t['Class Index'] - 1).tolist())

In [3]:
import nltk
from nltk.corpus import stopwords

def _tokenize_titles(titles, punct_chars, stop_words):
    """Tokenize every title and drop punctuation and stop-word tokens.

    Args:
        titles: iterable of strings, each passed to ``nltk.word_tokenize``.
        punct_chars: container of tokens to discard as punctuation.
        stop_words: container of tokens to discard as stop words.

    Returns:
        A list with one list of surviving tokens per title, in input order.
    """
    cleaned = []
    for title in titles:
        tokens = nltk.word_tokenize(title)
        cleaned.append(
            [w for w in tokens if w not in punct_chars and w not in stop_words]
        )
    return cleaned


# Raw string: the literal contains a backslash followed by a comma, which is
# an invalid escape sequence in a normal string and triggers a warning on
# recent Python versions. The resulting characters are unchanged.
punc = r'''()-[]{};:'"\,<>/@#$%^&*_.~”'''
# Membership is tested against a set of single characters, so only tokens that
# are exactly one of these characters are removed (testing against the string
# itself would be an accidental substring match).
punc_chars = frozenset(punc)
stop = set(stopwords.words('english'))
# NOTE(review): tokens are compared to the stop list case-sensitively; if the
# list is all lowercase (as NLTK's English list appears to be), capitalised
# forms such as "The" are kept -- confirm this is intended.

corpus_train = _tokenize_titles(train, punc_chars, stop)
corpus_test = _tokenize_titles(test, punc_chars, stop)

In [4]:
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec

def _embed_corpus(corpus, keyed_vectors, seq_len, dim=100):
    """Turn tokenized documents into one zero-padded embedding array.

    Args:
        corpus: sequence of token lists; none may be longer than ``seq_len``.
        keyed_vectors: object exposing ``key_to_index`` and ``[]`` lookup
            (here ``model.wv``).
        seq_len: number of token slots per document.
        dim: length of each embedding vector.

    Returns:
        ``np.ndarray`` of shape ``(len(corpus), seq_len, dim)``. Slots for
        out-of-vocabulary tokens and padding positions stay all-zero.
    """
    # Allocate the final array once instead of building a Python list of
    # per-document arrays and copying it with np.array afterwards.
    embedded = np.zeros((len(corpus), seq_len, dim))
    for row, tokens in enumerate(corpus):
        for col, token in enumerate(tokens):
            if token in keyed_vectors.key_to_index:
                embedded[row, col] = keyed_vectors[token]
    return embedded


model = Word2Vec.load('../input/word2vec-model/wiki-lemma-100D-phrase')

# Longest token list in each split; the larger one fixes the padded length.
train_max = max((len(doc) for doc in corpus_train), default=0)
test_max = max((len(doc) for doc in corpus_test), default=0)
max_len = max(train_max, test_max)

# Each document becomes a (max_len, 100) matrix of word vectors.
feature = _embed_corpus(corpus_train, model.wv, max_len)
t_feature = _embed_corpus(corpus_test, model.wv, max_len)
print(feature.shape)

In [5]:
# artificial neural networks
import torch
import torch.nn as nn

class ann_model(nn.Module):
    """1-D convolutional classifier over a sequence of word vectors.

    One branch per kernel size applies Conv1d -> BatchNorm1d -> ReLU ->
    MaxPool1d, where the pooling window spans the whole convolution output
    for inputs of length ``max_len``. The pooled branch outputs are
    concatenated and fed to a single linear layer.

    Args:
        seq_len: sequence length the pooling windows are sized for. When
            ``None`` (the default) the notebook-level variable ``max_len``
            is used, which is the original behaviour.
        w2v_dim: size of each input word vector.
        feature_num: number of output channels per convolution branch.
        label_num: number of output classes.
        kernel_size: iterable of convolution kernel widths.

    Raises:
        ValueError: if the sequence length is shorter than the widest kernel.
    """

    def __init__(self, seq_len=None, w2v_dim=100, feature_num=50, label_num=4,
                 kernel_size=(3, 4)):
        super().__init__()
        self.w2v_dim = w2v_dim
        self.feature_num = feature_num
        # `max_len` here is the module-level variable; it is only looked up
        # when the caller did not pass an explicit length.
        self.max_len = max_len if seq_len is None else seq_len
        self.label_num = label_num
        self.kernel_size = list(kernel_size)

        if self.max_len < max(self.kernel_size):
            raise ValueError(
                'sequence length {} is shorter than the widest kernel {}'.format(
                    self.max_len, max(self.kernel_size)))

        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(self.w2v_dim, self.feature_num, h),
                nn.BatchNorm1d(self.feature_num),
                nn.ReLU(),
                # Conv output has length max_len - h + 1; pooling over all of
                # it leaves one value per channel.
                nn.MaxPool1d(self.max_len - h + 1),
            )
            for h in self.kernel_size
        ])
        self.fc = nn.Linear(self.feature_num * len(self.kernel_size), self.label_num)

    def forward(self, x):
        """Map ``x`` of shape (batch, length, w2v_dim) to (batch, label_num) scores."""
        # Conv1d expects channels before length.
        x = x.permute(0, 2, 1)
        out = [conv(x) for conv in self.convs]
        out = torch.cat(out, dim=1)
        # Flatten per sample. Unlike view(-1, C), this cannot silently fold
        # extra time steps into the batch dimension when the input length does
        # not match max_len; the linear layer raises a shape error instead.
        out = out.flatten(start_dim=1)
        out = self.fc(out)

        return out

In [6]:
# preparation before training
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score

# Hold out 30% of the training features/labels for validation. The fixed
# random_state makes the split identical on every run, so results can be
# reproduced after restarting the kernel.
f_train, f_val, l_train, l_val = train_test_split(
    feature, label_train, test_size=0.3, random_state=42)

class dataset(Dataset):
    """Index-aligned pairing of a feature container with a label container."""

    def __init__(self, f, l):
        # Only references are stored; nothing is copied or converted.
        self.data, self.label = f, l

    def __len__(self):
        # The number of samples is taken from the feature container.
        return len(self.data)

    def __getitem__(self, index):
        # Return the (feature, label) pair found at the same position.
        sample = self.data[index]
        target = self.label[index]
        return sample, target

# Wrap the training split as tensors and serve it in shuffled mini-batches
# of 32 (feature, label) pairs.
ds_train = dataset(f=torch.from_numpy(f_train), l=torch.tensor(l_train))
dl_train = DataLoader(ds_train, batch_size=32, shuffle=True)

def get_accu(output, label):
    """Compare predicted classes with reference labels.

    Args:
        output: 2-D tensor of per-class scores; the predicted class of each
            row is the index of its largest value along dimension 1.
        label: reference labels; anything with a ``tolist()`` method.

    Returns:
        Tuple ``(accuracy, confusion_matrix)`` as computed by scikit-learn.
    """
    _, predicted = torch.max(output, 1)
    y_true = label.tolist()
    y_pred = predicted.tolist()
    return accuracy_score(y_true, y_pred), confusion_matrix(y_true, y_pred)

In [7]:
# train
from torch.optim import Adam
from torch.optim import SGD
from torch.autograd import Variable
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Choose the device once; fall back to the CPU when CUDA is unavailable.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The model runs in double precision because every forward pass in this
# notebook feeds it `.double()` inputs. It is moved to its final device
# BEFORE the optimizer is created, as the torch.optim documentation advises.
m = ann_model().double().to(device)
loss_func = nn.CrossEntropyLoss().to(device)

optimizer = Adam(m.parameters())
# optimizer = SGD(m.parameters(), lr=0.001, momentum=0.9)
    
# train the model
def train(epoch):
    """Train the global model `m` on `dl_train` for `epoch` passes.

    Uses the module-level `optimizer`, `loss_func` and `get_accu`. After every
    second epoch it prints the mean batch loss and mean batch accuracy of that
    epoch.

    NOTE(review): defining this function rebinds the module-level name
    `train`, which an earlier cell uses for the title column; re-running that
    earlier tokenisation cell afterwards would iterate over this function.

    Args:
        epoch: number of full passes over the training loader.
    """
    # Send each batch to wherever the model's parameters currently live.
    device = next(m.parameters()).device
    # Make sure BatchNorm is in training mode even if the model was switched
    # to evaluation mode earlier.
    m.train()

    for e in range(epoch):
        loss_sum = 0.0
        accu_sum = 0.0
        batches = 0

        for data, label in dl_train:
            data = data.to(device)
            label = label.to(device)

            optimizer.zero_grad()
            output = m(data.double())  # output size: batch size * label num
            loss = loss_func(output, label.long())
            # calculate gradient
            loss.backward()
            # update parameters
            optimizer.step()

            accu, _ = get_accu(output, label)
            accu_sum += accu
            # .item() stores a plain float; adding the tensor itself would
            # carry autograd history along with the running total.
            loss_sum += loss.item()
            batches += 1

        # Report against the requested epoch count, and skip the report when
        # the loader produced no batches (nothing to average).
        if (e + 1) % 2 == 0 and batches:
            print('Training epoch: {}/{},  loss: {:.4f}, accu: {:.4f}'.format(e + 1, epoch, loss_sum / batches, accu_sum / batches))
    
# Start training: 20 passes over the training loader.
train(epoch=20)

In [8]:
import matplotlib.pyplot as plt

# validate
val_f = torch.from_numpy(f_val)
val_l = torch.tensor(list(l_val))

if torch.cuda.is_available():
    val_f = val_f.cuda()
    val_l = val_l.cuda()

# Evaluation mode: the BatchNorm layers use their running statistics instead
# of the statistics of the validation batch, and those running statistics are
# not updated by validation data.
m.eval()
# No gradients are needed here; skipping the autograd graph keeps the memory
# of this single full-set forward pass down.
with torch.no_grad():
    output = m(val_f.double())
    loss = loss_func(output, val_l.long())
val_accu, mat = get_accu(output, val_l)
val_loss = loss.item()

print('Validation: loss: {:.4f}, accu: {:.4f}'.format(val_loss, val_accu))

# Draw the validation confusion matrix as a shaded grid and write each
# count into its cell.
plt.figure(figsize=(6, 6))
plt.matshow(mat, fignum=1, cmap=plt.cm.Blues, alpha=0.7)
for row_idx, row in enumerate(mat):
    for col_idx, count in enumerate(row):
        plt.text(x=col_idx, y=row_idx, s=count, va='center', ha='center')

# Tick labels for the four classes on both axes.
target_name = ['1', '2', '3', '4']
tick_positions = np.arange(4)
plt.xticks(tick_positions, target_name)
plt.yticks(tick_positions, target_name)

plt.title('Validation')
plt.xlabel('predicted value')
plt.ylabel('true value')
plt.show()

In [9]:
# test
test_f = torch.from_numpy(t_feature)

if torch.cuda.is_available():
    m = m.cuda()
    test_f = test_f.cuda()

# Evaluation mode so the BatchNorm layers use running statistics rather than
# the statistics of the test set, and no_grad so no autograd graph is built
# for this full-set forward pass.
m.eval()
with torch.no_grad():
    output = m(test_f.double())
test_accu, mat = get_accu(output, label_test)

print('Test: accu: {:.4f}'.format(test_accu))

# Draw the test confusion matrix as a shaded grid and write each count
# into its cell.
plt.figure(figsize=(6, 6))
plt.matshow(mat, fignum=1, cmap=plt.cm.Blues, alpha=0.7)
for row_idx, row in enumerate(mat):
    for col_idx, count in enumerate(row):
        plt.text(x=col_idx, y=row_idx, s=count, va='center', ha='center')

# Tick labels for the four classes on both axes.
target_name = ['1', '2', '3', '4']
tick_positions = np.arange(4)
plt.xticks(tick_positions, target_name)
plt.yticks(tick_positions, target_name)

plt.title('Test')
plt.xlabel('predicted value')
plt.ylabel('true value')
plt.show()