#### I really  like potential of modern NLP tools and frameworks like Flair, Texthero, Distillbert
... with couple of lines you could do quite a lot with quite impresive results

In [2]:
import pandas as pd
import numpy as np

from flair.data import Sentence, Dictionary
from flair.datasets import ClassificationCorpus
from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings, FlairEmbeddings, TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from torch.optim.adam import Adam

from pathlib import Path
from texthero import preprocessing

### Loading data

In [3]:
df= pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
submission = pd.read_csv("sample_submission.csv")
print('There are {} rows and {} columns in train'.format(df.shape[0],df.shape[1]))
print('There are {} rows and {} columns in train'.format(test.shape[0],test.shape[1]))

There are 7613 rows and 5 columns in train
There are 3263 rows and 4 columns in train



### Preprocessing column 'text' with predefined tools from TextHero package 
#### ( steps are selfexplanatory)

In [4]:
custom_pipeline = [preprocessing.fillna,
                   preprocessing.remove_stopwords,
                   preprocessing.lowercase,
                   preprocessing.remove_whitespace,
                   preprocessing.remove_digits,
                   preprocessing.remove_html_tags,
                   preprocessing.remove_punctuation,
                   
                   ] 


In [5]:
df['text_cleaner'] = preprocessing.clean(df["text"], custom_pipeline)
test['text_cleaner'] = preprocessing.clean(test["text"], custom_pipeline)

In [6]:
df.sample(3)

Unnamed: 0,id,keyword,location,text,target,text_cleaner
6929,9938,trouble,on twitter,why is it trouble@niallhariss / @simply_vain l...,0,trouble niallhariss simply vain live http c...
4540,6454,injured,Ikorodu,Photos: 17 people killed and over 25 injured i...,1,photos people killed injured deadly saudi...
7205,10321,weapon,statesboro/vidalia,@ThatRussianMan you're too busy finishing thos...,0,thatrussianman busy finishing weapon designs


In [7]:
test.sample(3)

Unnamed: 0,id,keyword,location,text,text_cleaner
1016,3343,demolished,"Bolton, England",@OpTic_DKarma dude they demolished you!,optic dkarma dude demolished
1777,6008,hazardous,,#diablo #dsp Olap #world pres: http://t.co/LFE...,diablo dsp olap world pres http co lfetnrx...
2533,8460,screamed,"Holland,MI",I just screamed when I saw Jordan come out the...,i screamed i saw jordan come door behind donni...


### Preparing input for Distilbert

In [13]:
!head tweet_train/tweet_text/dev.txt

__label__0	trauma team needs come american e shop 
__label__1	 ptsd chat yes  i feel root shame   found rubble trauma   ptsdchat
__label__1	hiroshima  they told paint story  eighty nine year old man recalls terror trauma   http co spe7u8t40k
__label__1	photo  lavenderpoetrycafe  the forgotten history sexual trauma hysteria affliction seen primarily  http co u2es0uk1u3
__label__1	trauma injuries involving kids sport usually cycling related   cbc ca http co 0dqjeretxu
__label__1	butt trauma extraordinaire
__label__0	author interview michele rosenthal author your life after trauma 
__label__0	a1  i started writing i   talk trauma therapy way i could communicate  gravitychat
__label__0	 ashghebranious civil rights continued 60s  and trans generational trauma  anything listen americans 
__label__0	 thetimepast  saalon i childhood trauma resolved   actual trauma  fricken babies 


 #### This is the format of file which Flair classifier understand


In [9]:
!mkdir -p tweet_train/tweet_text 

In [10]:
# creating three files aout of training data
def save_like_fasttext(text_feat, target_feat, dir_path):

    df['label'] = '__label__' + df['target'].astype(str)

    df[ ['label', text_feat] ].iloc[0:int(len(df)*0.8)].to_csv(dir_path + '/train.txt', sep='\t', index=False, header=False)
    df[ ['label', text_feat] ].iloc[int(len(df)*0.8):int(len(df)*0.9)].to_csv(dir_path + '/test.txt', sep='\t', index=False, header=False)
    df[ ['label', text_feat] ].iloc[int(len(df)*0.9):].to_csv(dir_path + '/dev.txt', sep='\t', index=False, header=False);
    
    
    
save_like_fasttext("text_cleaner", "target", "tweet_train/tweet_text") 

In [None]:
data_folder = Path('./tweet_train/tweet_text').resolve()

#creating corpus  for Classifier
corpus = ClassificationCorpus(
    data_folder,
    test_file='test.txt',
    dev_file='dev.txt',
    train_file='train.txt')


# print(corpus.obtain_statistics())

## DistillBert in action

In [28]:
label_dict = corpus.make_label_dictionary()
document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=True)
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)
trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

2020-11-09 17:58:58,694 Computing label dictionary. Progress:


100%|██████████| 6851/6851 [00:02<00:00, 2656.42it/s]

2020-11-09 17:59:01,439 [b'1', b'0']





In [None]:
trainer.train('models/tweets',
              learning_rate=3e-5, # use very small learning rate
              mini_batch_size=16,
              mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine
              max_epochs=5, # terminate after 5 epochs
              )

In [None]:
# what should be done is to predict values for the test part based on the trained model best-model.pt

#### This semi automatic approach it gives  similar score on kaggle ie __0.817__ on Leaderboard.
#### Of course it is just a beginning as no info at all was used from other columns and other actions to improve the score.