In [None]:
!pip install --upgrade transformers
!pip install simpletransformers

Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.20.1
    Uninstalling transformers-4.20.1:
      Successfully uninstalled transformers-4.20.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 2.10.1 requires transformers<4.21,>=4.1, but you have transformers 4.25.1 which is incompatible.[0m[31m
[0mSuccessfully installed transformers-4.25.1
[0mCollecting simpletransformers
  Downloading simpletransformers-0.63.9-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.5/250.5 kB[0m [31m912.5 kB/s[0m eta [36m0:00:00[0m

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from scipy.special import softmax

import os
# Print the full path of every file shipped with the competition data.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/science-topic-classification/SampleSubmission.csv
/kaggle/input/science-topic-classification/train.csv
/kaggle/input/science-topic-classification/test.csv


## Submission File Check

In [None]:
# Load the sample submission; this frame is reused further down as the template for
# every output file (its 'label' column is overwritten before each to_csv call).
sub = pd.read_csv('/kaggle/input/science-topic-classification/SampleSubmission.csv')
sub.head()

Unnamed: 0,ID,label
0,1,-1
1,2,-1
2,3,-1
3,4,-1
4,5,-1


## Data Check

In [None]:
# Load the labelled training articles.
train = pd.read_csv('/kaggle/input/science-topic-classification/train.csv')
# Join title and abstract with an explicit space so the last word of the title can never
# fuse with the first word of the abstract (do not rely on whitespace already in the CSV).
train['articles'] = train['TITLE'] + ' ' + train['ABSTRACT']
# Keep only the model inputs; take an explicit copy because the frame is modified
# in place later by the preprocessing step.
train = train[['articles', 'label']].copy()
train.head()

Unnamed: 0,articles,label
0,Detecting the impact of public transit on the ...,0
1,Is Proxima Centauri b habitable? -- A study of...,1
2,Verifying Security Protocols using Dynamic Str...,0
3,Scenic: Language-Based Scene Generation Synth...,0
4,Near-Optimal Discrete Optimization for Experim...,0


In [None]:
# Load the unlabelled test articles.
test = pd.read_csv('/kaggle/input/science-topic-classification/test.csv')
# Join title and abstract with an explicit space so the last word of the title can never
# fuse with the first word of the abstract (do not rely on whitespace already in the CSV).
test['articles'] = test['TITLE'] + ' ' + test['ABSTRACT']
# Keep only the text and add a constant placeholder label; assign() builds a new frame
# instead of setting a column on a freshly sliced one.
test = test[['articles']].assign(label=0)
test.head()

Unnamed: 0,articles,label
0,An analytic resolution to the competition betw...,0
1,Attention-based Natural Language Person Retrie...,0
2,Asymptotics of multivariate contingency tables...,0
3,Discriminant of the ordinary transversal singu...,0
4,Pharmacokinetics Simulations for Studying Corr...,0


## Preprocessing

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
def preprocessor(df: pd.DataFrame, stop_words=None) -> pd.DataFrame:
    """Normalise the ``articles`` text column of ``df`` in place and return ``df``.

    For every article: ``?`` and digits are deleted, the punctuation characters
    ``/ ( ) { } [ ] | @ , ; . : -`` become spaces, the text is lowercased, split
    on whitespace, stripped of stopwords and re-joined with single spaces.
    Missing articles become empty strings instead of raising.

    Args:
        df: Frame with an ``articles`` column of strings. The column is overwritten.
        stop_words: Optional iterable of lowercase words to drop. When omitted,
            NLTK's English stopword list is used.

    Returns:
        The same ``df`` object, with ``articles`` cleaned.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once, as a set: fetching the list for every single word
    # rebuilt it each time and then scanned it linearly.
    stop_words = frozenset(stop_words)

    def _clean(text: str) -> str:
        text = text.replace('?', '')
        text = re.sub(r'[0-9]', '', text)
        text = re.sub(r'[/(){}\[\]\|@,;.:-]', ' ', text)
        # split() breaks on any run of whitespace, so newlines, tabs and repeated
        # spaces are normalised here without separate replace passes.
        return ' '.join(word for word in text.lower().split() if word not in stop_words)

    # fillna keeps a missing title/abstract from crashing the regex calls.
    df['articles'] = df['articles'].fillna('').map(_clean)

    return df

In [None]:
# Run both splits through the same cleaning pipeline (train first, then test).
train, test = preprocessor(train), preprocessor(test)

In [None]:
# Spot-check the cleaned text: lowercased, with digits, punctuation and stopwords removed.
train.head()

Unnamed: 0,articles,label
0,detecting impact public transit transmission e...,0
1,proxima centauri b habitable study atmospheric...,1
2,verifying security protocols using dynamic str...,0
3,scenic language based scene generation synthet...,0
4,near optimal discrete optimization experimenta...,0


## RoBERTa

In [None]:
from simpletransformers.classification import ClassificationModel

In [None]:
# Fine-tuning configuration handed to the RoBERTa ClassificationModel.
args = {'train_batch_size': 32,
        'reprocess_input_data': True,
        # Allow the training cell to be re-run when the output directory already holds
        # files from an earlier run (the BERT configuration sets this as well).
        'overwrite_output_dir': True,
        'num_train_epochs': 3,
        'fp16': False,
        'do_lower_case': False,
        'max_seq_length': 256,
        'regression': False,
        'learning_rate': 5e-5,
        'weight_decay': 0.0,
        # Fixed seed so repeated runs are comparable.
        'manual_seed': 42,
        'save_eval_checkpoints': False,
        'save_model_every_epoch': False}

In [None]:
# RoBERTa-base with a 4-class classification head, run on the GPU.
model = ClassificationModel(
    'roberta',
    'roberta-base',
    num_labels=4,
    args=args,
    use_cuda=True,
)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Fine-tune on the preprocessed training frame ('articles' text, 'label' target).
# The returned tuple is shown as the cell output — presumably (global step, mean
# training loss); confirm against the simpletransformers documentation.
model.train_model(train)

  0%|          | 0/15472 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/484 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/484 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/484 [00:00<?, ?it/s]

(1452, 0.46390723254911337)

In [None]:
# Score the test articles. test['label'] is a constant 0 placeholder, so the metrics in
# `res_r` and the `wrong_pred_r` list carry no meaning; only the raw per-class outputs
# in `output_r` are used afterwards.
# NOTE(review): model.predict(...) may be the more direct API for unlabelled data — confirm.
res_r, output_r, wrong_pred_r = model.eval_model(test)

  0%|          | 0/4844 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/606 [00:00<?, ?it/s]

In [None]:
# Normalise the raw RoBERTa outputs along axis 1; the result is reused in the ensemble step.
soft_r = softmax(output_r, axis=1)

In [None]:
# Predicted class = index of the largest raw output along the last axis (softmax is
# monotonic, so the argmax of the raw outputs and of `soft_r` is the same label).
pred_r = np.argmax(output_r, axis=-1)
pred_r

array([1, 0, 2, ..., 1, 1, 1])

In [None]:
# Write the RoBERTa-only predictions into the submission template and save them.
sub['label'] = pred_r
sub.to_csv('res_roberta.csv', index=False)

## BERT

In [None]:
# Fine-tuning configuration handed to the BERT ClassificationModel.
args = {'train_batch_size': 32,
        'reprocess_input_data': True,
        'overwrite_output_dir': True,
        'num_train_epochs': 2,
        'fp16': False,
        # This configuration is used with the uncased checkpoint 'bert-base-uncased',
        # so the tokenizer must apply the uncased normalisation as well.
        'do_lower_case': True,
        'max_seq_length': 256,
        'regression': False,
        'learning_rate': 5e-5,
        'weight_decay': 0.0,
        # Fixed seed so repeated runs are comparable.
        'manual_seed': 42,
        'save_eval_checkpoints': False,
        'save_model_every_epoch': False}

In [None]:
# BERT-base (uncased) with a 4-class classification head, run on the GPU.
# Rebinds `model`, replacing the RoBERTa model created earlier.
model = ClassificationModel(
    'bert',
    'bert-base-uncased',
    num_labels=4,
    args=args,
    use_cuda=True,
)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Fine-tune BERT on the same preprocessed training frame used for RoBERTa.
# The returned tuple is shown as the cell output — presumably (global step, mean
# training loss); confirm against the simpletransformers documentation.
model.train_model(train)

  0%|          | 0/15472 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/484 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/484 [00:00<?, ?it/s]

(968, 0.4755143125287511)

In [None]:
# Score the test articles with BERT. test['label'] is a constant 0 placeholder, so the
# metrics in `res_b` and the `wrong_pred_b` list carry no meaning; only the raw
# per-class outputs in `output_b` are used afterwards.
# NOTE(review): model.predict(...) may be the more direct API for unlabelled data — confirm.
res_b, output_b, wrong_pred_b = model.eval_model(test)

  0%|          | 0/4844 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/606 [00:00<?, ?it/s]

In [None]:
# Normalise the raw BERT outputs along axis 1; the result is reused in the ensemble step.
soft_b = softmax(output_b, axis=1)

In [None]:
# Predicted class = index of the largest raw BERT output along the last axis.
pred_b = np.argmax(output_b, axis=-1)
pred_b

array([1, 0, 2, ..., 1, 1, 1])

In [None]:
# Overwrite the template's labels with the BERT-only predictions and save them.
sub['label'] = pred_b
sub.to_csv('res_bert.csv', index=False)

## Ensemble

In [None]:
# Soft-voting ensemble: add the two models' softmax outputs with equal weight and take
# the index of the largest summed value along the last axis.
pred_e = np.argmax(soft_r + soft_b, axis=-1)
pred_e

array([1, 0, 2, ..., 1, 1, 1])

In [None]:
# Overwrite the template's labels with the ensemble predictions and save them.
sub['label'] = pred_e
sub.to_csv('res_ensemble.csv', index=False)