In [None]:
import pandas as pd
# Train and dev files are tab-separated; column names are supplied explicitly
# (tweet text, hope label, and a third column named 'un').
train = pd.read_csv(
    '../input/hope-english/english_hope_train.csv',
    sep='\t',
    names=['tweet', 'hope', 'un'],
)
val = pd.read_csv(
    '../input/hope-english/english_hope_dev.csv',
    sep='\t',
    names=['tweet', 'hope', 'un'],
)
# The test file is semicolon-separated and read with its own header row;
# later cells access its 'text' and 'label' columns.
test = pd.read_csv(
    '../input/hope-english/english_hope_test.csv',
    sep=';',
)

In [None]:
# Shell escape: run `nvidia-smi` to show which GPU(s), if any, this runtime can see.
!nvidia-smi

In [None]:
# Drop every row labelled 'not-English' from all three splits.
# .copy() makes each filtered frame independent of the frame it was sliced from,
# so adding columns to it later does not raise SettingWithCopyWarning.
train=train[train['hope']!='not-English'].copy()
val=val[val['hope']!='not-English'].copy()
test=test[test['label']!='not-English'].copy()

In [None]:
from collections import Counter

# Split the training tweets by class label.
hope_df=train[train['hope']=='Hope_speech']
non_hope_df=train[train['hope']=='Non_hope_speech']

# Frequency of every whitespace-separated token within each class.
# Built directly from the token lists instead of calling Series.apply for its
# side effect, which also printed a Series of None as the cell output.
count_hope=Counter(
  token for tokens in hope_df['tweet'].str.split() for token in tokens
)
count_nonhope=Counter(
  token for tokens in non_hope_df['tweet'].str.split() for token in tokens
)

In [None]:
# Tokens that occur more than 50 times in BOTH classes, in count_hope iteration order.
overlap=[
  key for key in count_hope
  if count_hope[key]>50 and key in count_nonhope and count_nonhope[key]>50
]
len(overlap)

In [None]:
def remove_overlap(text, words=None):
  """Remove overlap-vocabulary tokens from ``text``.

  ``text`` is split on whitespace; every token found in the vocabulary is
  dropped and the remaining tokens are joined with single spaces, so runs of
  whitespace in the input are collapsed. Matching is exact and case-sensitive.

  Parameters
  ----------
  text : str
      The text to filter.
  words : iterable of str, optional
      Tokens to remove. When omitted, the notebook-level ``overlap``
      collection is used, which is the original behaviour.

  Returns
  -------
  str
      The filtered text; an empty string if no token survives.
  """
  # Build a set once per call so each membership test is a hash lookup
  # rather than a linear scan of the vocabulary list for every token.
  blocked=frozenset(overlap if words is None else words)
  return " ".join(token for token in text.split() if token not in blocked)

In [None]:
# Add a 'modified' column to every split: the text with overlap tokens removed.
# The function is passed directly; wrapping it in a lambda adds nothing.
train['modified']=train['tweet'].apply(remove_overlap)
val['modified']=val['tweet'].apply(remove_overlap)
test['modified']=test['text'].apply(remove_overlap)


In [None]:
# Label-conditional variant of the training split: Hope_speech rows get the
# overlap tokens removed, Non_hope_speech rows keep the raw tweet as 'modified'.
# NOTE(review): the preprocessing of a row is chosen from its gold label.
# .copy() gives each subset its own data, so adding a column does not raise
# SettingWithCopyWarning.
df=train[train['hope']=='Hope_speech'].copy()
df['modified']=df['tweet'].apply(remove_overlap)
df1=train[train['hope']=='Non_hope_speech'].copy()
df1['modified']=df1['tweet']
# DataFrame.append was removed in pandas 2.0; pd.concat yields the same row
# order (non-hope first, then hope) and keeps the original index labels.
new_train=pd.concat([df1,df])

In [None]:
# Label-conditional variant of the dev split: Hope_speech rows get the overlap
# tokens removed, Non_hope_speech rows keep the raw tweet as 'modified'.
# NOTE(review): the preprocessing of a row is chosen from its gold label.
# .copy() gives each subset its own data, so adding a column does not raise
# SettingWithCopyWarning.
df=val[val['hope']=='Hope_speech'].copy()
df['modified']=df['tweet'].apply(remove_overlap)
df1=val[val['hope']=='Non_hope_speech'].copy()
df1['modified']=df1['tweet']
# DataFrame.append was removed in pandas 2.0; pd.concat yields the same row
# order (non-hope first, then hope) and keeps the original index labels.
new_val=pd.concat([df1,df])

In [None]:
# Label-conditional variant of the test split: Hope_speech rows get the overlap
# tokens removed, Non_hope_speech rows keep the raw text as 'modified'.
# NOTE(review): on the test split this means the true label decides how each
# input is preprocessed before prediction - confirm this is intended, since
# labels would not be available at inference time.
# .copy() gives each subset its own data, so adding a column does not raise
# SettingWithCopyWarning.
df=test[test['label']=='Hope_speech'].copy()
df['modified']=df['text'].apply(remove_overlap)
df1=test[test['label']=='Non_hope_speech'].copy()
df1['modified']=df1['text']
# DataFrame.append was removed in pandas 2.0; pd.concat yields the same row
# order (non-hope first, then hope) and keeps the original index labels.
new_test=pd.concat([df1,df])

In [None]:
# Install simpletransformers from the `add_losses` branch of the Zhylkaaa fork on GitHub.
# NOTE(review): presumably this branch is what provides the `loss_type` / `loss_args`
# options passed to ClassificationArgs later in the notebook - confirm. The install
# tracks the branch head (no commit pin), so a re-run may pick up different code.
!python3 -m pip install -q git+https://github.com/Zhylkaaa/simpletransformers.git@add_losses
# NOTE(review): tensorboardX is installed unpinned; presumably a logging dependency of
# simpletransformers - confirm.
!pip install -q tensorboardX
# These imports only resolve after the installs above have completed.
from simpletransformers.config.model_args import ClassificationArgs
from simpletransformers.classification import ClassificationModel

In [None]:
from sklearn.preprocessing import LabelEncoder
import torch

# Model inputs for the variant where overlap tokens were removed from every row.
# NOTE: the next cell rebuilds train_df / valid_df / test_df from the
# label-conditional frames, so on a top-to-bottom run those definitions win.

# Fit the label -> integer mapping once, on the training labels, and reuse it
# for every split. Re-fitting per split only gives matching codes when each
# split happens to contain exactly the same label set; transform() instead
# raises ValueError on a label that was not seen in training.
encoder=LabelEncoder()
encoder.fit(train['hope'])

train_df=pd.DataFrame({'text':train['modified']})
valid_df=pd.DataFrame({'text':val['modified']})
test_df=pd.DataFrame({'text':test['modified']})

train_df['labels']=encoder.transform(train['hope'])
valid_df['labels']=encoder.transform(val['hope'])
test_df['labels']=encoder.transform(test['label'])

In [None]:
from sklearn.preprocessing import LabelEncoder
import torch

# Model inputs for the label-conditional variant (new_train / new_val / new_test).

# Fit the label -> integer mapping once, on the training labels, and reuse it
# for every split. Re-fitting per split only gives matching codes when each
# split happens to contain exactly the same label set; transform() instead
# raises ValueError on a label that was not seen in training.
encoder=LabelEncoder()
encoder.fit(new_train['hope'])

train_df=pd.DataFrame({'text':new_train['modified']})
valid_df=pd.DataFrame({'text':new_val['modified']})
test_df=pd.DataFrame({'text':new_test['modified']})

train_df['labels']=encoder.transform(new_train['hope'])
valid_df['labels']=encoder.transform(new_val['hope'])
test_df['labels']=encoder.transform(new_test['label'])

In [None]:
# Keep the unprocessed text next to the model input in both evaluation frames
# (assignment aligns on the index); both end up in a column named 'tweet'.
valid_df['tweet'], test_df['tweet'] = new_val['tweet'], new_test['text']

In [None]:
# Training configuration for the focal-loss run.
model_args = ClassificationArgs(
    # Reproducibility and output handling
    manual_seed=42,
    overwrite_output_dir=True,
    save_model_every_epoch=False,
    # Input handling
    do_lower_case=True,
    max_seq_length=160,
    # Optimisation
    learning_rate=3e-05,
    num_train_epochs=5,
    eval_batch_size=64,
    # Early stopping
    # NOTE(review): early stopping is switched on while evaluate_during_training
    # is False - confirm the library still evaluates during training in that
    # case, otherwise the patience settings may have no effect.
    use_early_stopping=True,
    early_stopping_patience=2,
    early_stopping_consider_epochs=True,
    evaluate_during_training=False,
    # Loss selection; the original author lists 'dice' and 'tversky' as alternatives.
    loss_type='focal',
    loss_args=dict(
        alpha=None,
        gamma=2,
        reduction='mean',
        ignore_index=-100,
    ),
)

# Multilingual uncased BERT classifier; runs on the GPU when one is available
# and has one output per class known to the label encoder.
model = ClassificationModel(
    'bert',
    'bert-base-multilingual-uncased',
    num_labels=len(encoder.classes_),
    use_cuda=torch.cuda.is_available(),
    args=model_args,
)

In [None]:
# Fine-tune on the training frame; the per-call args set the training batch size to 32.
model.train_model(
    train_df,
    eval_df=valid_df,
    args={'train_batch_size': 32},
)

In [None]:
# Predict on the dev and test texts. `raw_outputs` is reassigned by the second
# call, so afterwards it holds the test-split outputs only.
val_predictions, raw_outputs = model.predict(valid_df['text'].to_list())
test_predictions, raw_outputs = model.predict(test_df['text'].to_list())

In [None]:
from sklearn.metrics import confusion_matrix, classification_report,f1_score

# Per-class precision / recall / F1 on the dev split, reported to 4 decimals
# and labelled with the encoder's class names.
print(
    'class_report\n',
    classification_report(
        valid_df['labels'],
        val_predictions,
        digits=4,
        target_names=list(encoder.classes_),
    ),
)

In [None]:
# Per-class precision / recall / F1 on the test split, reported to 4 decimals
# and labelled with the encoder's class names.
print(
    'class_report\n',
    classification_report(
        test_df['labels'],
        test_predictions,
        digits=4,
        target_names=list(encoder.classes_),
    ),
)

In [None]:
# Store the predicted class ids next to the gold labels in each evaluation frame.
valid_df['overlap_pred_fl'], test_df['overlap_pred_fl'] = val_predictions, test_predictions

In [None]:
# Map the integer codes back to the original label strings, for both the gold
# labels and the predictions, in the dev frame first and then the test frame.
for _frame in (valid_df, test_df):
  for _column in ('labels', 'overlap_pred_fl'):
    _frame[_column]=encoder.inverse_transform(_frame[_column])

In [None]:
# Write both result frames (index included) to CSV files in the working directory.
valid_df.to_csv(path_or_buf='valid_results.csv')
test_df.to_csv(path_or_buf='test_results.csv')