# Changelog

### Version 4

* Use `xlnet-base-cased`
* Change batch_size 320 to 128
* Change epoch 5 to 2
* Change LR 5e-5 to 3e-4

### Version 3

* Change batch_size 128 to 320
* Change epoch 3 to 5
* Change LR 3e-5 to 5e-5

### Version 2

* Use `distilroberta-base`
* Tidy code cell position
* Change batch_size & maxlen parameter

### Version 1

* Initial code

In [1]:
!pip install ktrain

Collecting ktrain
  Downloading ktrain-0.19.1.tar.gz (25.2 MB)
[K     |████████████████████████████████| 25.2 MB 4.9 MB/s 
[?25hCollecting tensorflow==2.1.0
  Downloading tensorflow-2.1.0-cp37-cp37m-manylinux2010_x86_64.whl (421.8 MB)
[K     |████████████████████████████████| 421.8 MB 21 kB/s 
Collecting scikit-learn==0.21.3
  Downloading scikit_learn-0.21.3-cp37-cp37m-manylinux1_x86_64.whl (6.7 MB)
[K     |████████████████████████████████| 6.7 MB 32.2 MB/s 
Collecting keras_bert>=0.81.0
  Downloading keras-bert-0.86.0.tar.gz (26 kB)
Collecting langdetect
  Downloading langdetect-1.0.8.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 46.8 MB/s 
Collecting cchardet==2.1.5
  Downloading cchardet-2.1.5-cp37-cp37m-manylinux1_x86_64.whl (241 kB)
[K     |████████████████████████████████| 241 kB 47.7 MB/s 
Collecting seqeval
  Downloading seqeval-0.0.12.tar.gz (21 kB)
Collecting syntok
  Downloading syntok-1.3.1.tar.gz (23 kB)
Collecting whoosh
  Do

In [2]:
import os
import random
import gc

import numpy as np
import pandas as pd
import ktrain



In [3]:
!pip freeze > requirements.txt

In [4]:
print('Numpy version:', np.__version__)
print('Pandas version:', pd.__version__)
print('ktrain version:', ktrain.__version__)

Numpy version: 1.18.5
Pandas version: 1.0.3
ktrain version: 0.19.1


In [5]:
SEED = 42

os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)
np.random.seed(SEED)

## Check system specifiction 

In [6]:
!lscpu

Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
CPU(s):              2
On-line CPU(s) list: 0,1
Thread(s) per core:  2
Core(s) per socket:  1
Socket(s):           1
NUMA node(s):        1
Vendor ID:           GenuineIntel
CPU family:          6
Model:               85
Model name:          Intel(R) Xeon(R) CPU @ 2.00GHz
Stepping:            3
CPU MHz:             2000.142
BogoMIPS:            4000.28
Hypervisor vendor:   KVM
Virtualization type: full
L1d cache:           32K
L1i cache:           32K
L2 cache:            1024K
L3 cache:            39424K
NUMA node0 CPU(s):   0,1
Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3

In [7]:
!free -m

              total        used        free      shared  buff/cache   available
Mem:          16045        1129        4663           0       10252       14639
Swap:             0           0           0


In [8]:
!nvidia-smi

Sat Aug  1 05:52:23 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

# Dataset

In [9]:
X_train = pd.read_parquet('/kaggle/input/shopee-review-cleaned/X_train.parquet', engine='pyarrow')
X_train = X_train['X']

X_test = pd.read_parquet('/kaggle/input/shopee-review-cleaned/X_test.parquet', engine='pyarrow')
X_test = X_test['X']

y_train = pd.read_parquet('/kaggle/input/shopee-review-cleaned/y_train.parquet', engine='pyarrow')
y_train = y_train['y']

# Preprocess dataset

In [10]:
t = ktrain.text.Transformer('distilroberta-base', maxlen=65, classes=[str(r) for r in range(1, 6)])



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=480.0, style=ProgressStyle(description_…




In [11]:
X_train = X_train.apply(lambda t: ' '.join(t))
X_test = X_test.apply(lambda t: ' '.join(t))

y_train = y_train.apply(lambda r: str(r))

# to fix this issue https://github.com/huggingface/transformers/issues/3809
X_train = X_train.replace({'': '.'})

In [12]:
train = t.preprocess_train(X_train.to_list(), y_train.to_list())

preprocessing train...
language: en
train sequence lengths:
	mean : 18
	95percentile : 43
	99percentile : 61




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




Is Multi-Label? False


In [13]:
gc.collect()

32

# Train

In [14]:
model = t.get_classifier()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=487203636.0, style=ProgressStyle(descri…




In [15]:
model.summary()

Model: "tf_roberta_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  82118400  
_________________________________________________________________
classifier (TFRobertaClassif multiple                  594437    
Total params: 82,712,837
Trainable params: 82,712,837
Non-trainable params: 0
_________________________________________________________________


In [16]:
learner = ktrain.get_learner(model, train_data=train, batch_size=320)

In [17]:
# Google recommender LR : 2e-5 to 5e-5
learner.fit_onecycle(3e-4, 5)



begin training using onecycle policy with max lr of 0.0003...
Train for 5155 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f8ebe3ff490>

In [18]:
gc.collect()

631

# Test

In [19]:
predictor = ktrain.get_predictor(learner.model, preproc=t)

In [20]:
y_test_pred = predictor.predict(X_test.to_list())
y_test_pred = [np.int32(y) for y in y_test_pred]

In [21]:
df_submission = pd.concat([pd.Series(list(range(1,60428)), name='review_id', dtype=np.int32), pd.Series(y_test_pred, name='rating')], axis=1)
df_submission.to_csv('submission_preprocess_text.csv', index=False)

df_submission

Unnamed: 0,review_id,rating
0,1,3
1,2,3
2,3,5
3,4,5
4,5,5
...,...,...
60422,60423,4
60423,60424,3
60424,60425,4
60425,60426,5


In [22]:
df_test = pd.read_csv('/kaggle/input/student-shopee-code-league-sentiment-analysis/test.csv')
y_test_pred2 = predictor.predict(df_test['review'].to_list())

In [23]:
df_submission2 = pd.concat([pd.Series(list(range(1,60428)), name='review_id', dtype=np.int32), pd.Series(y_test_pred2, name='rating')], axis=1)
df_submission2.to_csv('submission_raw_text.csv', index=False)

df_submission2

Unnamed: 0,review_id,rating
0,1,4
1,2,3
2,3,5
3,4,5
4,5,5
...,...,...
60422,60423,4
60423,60424,3
60424,60425,4
60425,60426,5


In [24]:
y_test_pred3 = predictor.predict(X_test.to_list(), return_proba=True)
for i in range(len(y_test_pred3)):
    y_test_pred3[i, 0] = y_test_pred3[i, 0] * 1.05
    y_test_pred3[i, 3] = y_test_pred3[i, 4] * 1.3
    y_test_pred3[i, 4] = y_test_pred3[i, 3] * 1.3
y_test_pred3 = np.argmax(y_test_pred3, axis=1)
for i in range(len(y_test_pred3)):
    y_test_pred3[i] = y_test_pred3[i] + 1
y_test_pred3 = [np.int32(y) for y in y_test_pred]

In [25]:
df_submission = pd.concat([pd.Series(list(range(1,60428)), name='review_id', dtype=np.int32), pd.Series(y_test_pred3, name='rating')], axis=1)
df_submission.to_csv('submission_preprocess_text_mod_proba.csv', index=False)

df_submission

Unnamed: 0,review_id,rating
0,1,3
1,2,3
2,3,5
3,4,5
4,5,5
...,...,...
60422,60423,4
60423,60424,3
60424,60425,4
60425,60426,5
