# Analyze Improvement

In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import TensorDataset, DataLoader

# Seed handed to train_test_split (random_state) inside load_data.
RANDOM_SEED = 1

def set_random_seed_data(seed):
    """Override the module-level ``RANDOM_SEED``.

    Args:
        seed: New value to store in ``RANDOM_SEED``.
    """
    # Without ``global`` the assignment below would only bind a function-local
    # name and the module-level seed would silently stay unchanged.
    global RANDOM_SEED
    RANDOM_SEED = seed

def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_nonaplhanumeric(text):
    """Replace each run of characters outside ``0-9``, ``a-z``, ``A-Z`` with one space."""
    non_alnum_run = re.compile('[^0-9a-zA-Z]+')
    return non_alnum_run.sub(' ', text)

def remove_unnecessary_char(text):
    """Strip newlines, URL-like tokens and redundant spaces from ``text``.

    Args:
        text: Input string.

    Returns:
        ``text`` with every newline and every URL-like token replaced by a
        space, runs of two or more spaces collapsed to one, and leading and
        trailing whitespace removed.
    """
    # Regex patterns are raw strings so that ``\.`` and ``\s`` reach the regex
    # engine verbatim instead of relying on Python's deprecated handling of
    # invalid escape sequences in ordinary string literals.
    text = re.sub('\n', ' ', text)  # Remove every newline
    # A URL-like token starts with "www." or an http/https scheme and extends
    # up to the next whitespace character.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', ' ', text)
    text = re.sub(r'  +', ' ', text)  # Collapse runs of spaces
    return text.strip()

def preprocess_text(text):
    """Run ``text`` through the cleaning steps defined above, in order.

    Steps: lower-casing, replacing non-alphanumeric runs with a space, then
    removing newlines/URLs/extra spaces and trimming.
    """
    # NOTE(review): remove_nonaplhanumeric runs before remove_unnecessary_char,
    # so newlines and URL punctuation have already become spaces by the time
    # the URL pattern is applied - confirm this order is intended.
    pipeline = (lowercase, remove_nonaplhanumeric, remove_unnecessary_char)
    for step in pipeline:
        text = step(text)
    return text

def load_data(PATH):
    """Load the tweet CSV and return the held-out test split.

    A binary ``label`` column is derived: 1 when ``HS`` or ``Abusive`` equals 1,
    otherwise 0. The rows are then split 90/10, stratified on that label,
    with the module-level ``RANDOM_SEED`` as ``random_state``.

    Args:
        PATH: Location of a CSV file with ``Tweet``, ``HS`` and ``Abusive``
            columns.

    Returns:
        pandas.DataFrame with columns ``text`` and ``label`` containing the
        10% test portion of the split.
    """
    data = pd.read_csv(PATH)
    flagged = (data['HS'] == 1) | (data['Abusive'] == 1)
    # Vectorized bool -> int conversion (same values as a per-row int()).
    data['label'] = flagged.astype('int64')
    data = data[['Tweet', 'label']].rename(columns={'Tweet': 'text'})

    # Only the test portion is returned, so the training portion is discarded
    # rather than being materialized into an unused DataFrame.
    _, X_test, _, y_test = train_test_split(data.text.values,
                                            data.label.values,
                                            test_size=0.1,
                                            random_state=RANDOM_SEED,
                                            stratify=data.label.values)

    return pd.DataFrame({'text': X_test,
                         'label': y_test})

In [3]:
TOXIC_TWEETS_CSV = '../../../../../data/toxic/preprocessed_indonesian_toxic_tweet.csv'

# load_data hands back only the held-out test split.
test = load_data(TOXIC_TWEETS_CSV)
test.head()

Unnamed: 0,text,label
0,gubernur daerah khusus ibukota salat jumat ber...,0
1,hindu tidak mengenal hari persidangan hindu p...,0
2,jancuk jancuk,1
3,bukti on bukti jangan congor doang,1
4,19 akhirnya adalah khilafah yang ditempuh deng...,1


# Improvement

In [4]:
# Results of the reference run; scores >= 0.5 are mapped to label 1, others to 0.
data = pd.read_csv('result_toxic_toxic_XLM_R_A_11852_0.5_full.csv')
# Vectorized threshold instead of a per-row Python lambda.
data['y_pred_int'] = (data['y_pred'] >= 0.5).astype('int64')
data.head()

Unnamed: 0,y_pred,y_true,y_pred_int
0,0.039386,0,0
1,0.032706,0,0
2,0.905435,1,1
3,0.980905,1,1
4,0.126742,1,0


In [6]:
# Results of the improved run; scores >= 0.5 are mapped to label 1, others to 0.
data_improve = pd.read_csv('result_toxic_toxic_XLM_R_C_11852_3_full.csv')
# Vectorized threshold instead of a per-row Python lambda.
data_improve['y_pred_int'] = (data_improve['y_pred'] >= 0.5).astype('int64')
data_improve.head()

Unnamed: 0,y_pred,y_true,y_pred_int
0,0.004836,0,0
1,0.002019,0,0
2,0.999285,1,1
3,0.987206,1,1
4,0.188467,1,0


In [7]:
# Attach the tweet text and the improved run's outputs to the reference results.
# Every column is assigned positionally (.values): a Series assignment aligns on
# index and would silently produce NaN if data_improve's index differed, whereas
# a positional assignment raises on a length mismatch.
# NOTE(review): assumes row i of test, data and data_improve is the same tweet
# - TODO confirm against how the result CSVs were written.
data['text'] = test['text'].values
data['y_pred_improve'] = data_improve['y_pred'].values
data['y_pred_improve_int'] = data_improve['y_pred_int'].values
data.head()

Unnamed: 0,y_pred,y_true,y_pred_int,text,y_pred_improve,y_pred_improve_int
0,0.039386,0,0,gubernur daerah khusus ibukota salat jumat ber...,0.004836,0
1,0.032706,0,0,hindu tidak mengenal hari persidangan hindu p...,0.002019,0
2,0.905435,1,1,jancuk jancuk,0.999285,1
3,0.980905,1,1,bukti on bukti jangan congor doang,0.987206,1
4,0.126742,1,0,19 akhirnya adalah khilafah yang ditempuh deng...,0.188467,0


In [18]:
# Rows the reference run got wrong and the improved run gets right.
reference_wrong = data['y_pred_int'] != data['y_true']
improved_right = data['y_pred_improve_int'] == data['y_true']
fixed = data[reference_wrong & improved_right]

# Tally the fixed rows per class. The counts cover both classes; only the
# listing below is restricted to y_true == 1. (Filtering on y_true == 1 before
# counting would make the negative tally always 0.)
count_pos = int((fixed['y_true'] == 1).sum())
count_neg = len(fixed) - count_pos

# List the positive-class examples that were fixed.
for row in fixed[fixed['y_true'] == 1].itertuples(index=False):
    print("\ny_pred: {}".format(row.y_pred))
    print("y_true: {}".format(row.y_true))
    print("y_pred_improve: {}".format(row.y_pred_improve))
    print("text: {}".format(row.text))


y_pred: 0.41355938
y_true: 1
y_pred_improve: 0.9799235000000001
text:  pasti ada yang bacot begini meminta ganti mantel lah mantel kan banyak tinggal beli dan masalahnya mantel itu mantel yang jumlah xe2 x80 xa6 

y_pred: 0.074108094
y_true: 1
y_pred_improve: 0.8050988
text: suasana stq tingkat kabupaten tidak ada ntt cukup dalam masjid tanpa mimbar tanpa dana swasembada dari umat islam untuk bawa nama pemerintah daerah dalam musabaqah tilawatil quran ke tingkat propinsi ntt menyedihkan 2019 ganti presiden

y_pred: 0.42839316
y_true: 1
y_pred_improve: 0.6419626
text: selama proses sidang ahok mewaspadai ulama ulama kubu ahok yakni ulama ulama munafik yang sesat dan menyesatkan 

y_pred: 0.25542468
y_true: 1
y_pred_improve: 0.5927736
text: apalagi yang paling bermasalah dengan partai komunis indonesia itu ya angkatan darat cukup dengan menggerakkan massa anti jokowi dengan isu isu anti agama komunis

y_pred: 0.3151176
y_true: 1
y_pred_improve: 0.88784593
text: sompret sekali lagi enak 

In [14]:
# Fixed rows per class: positives first, then negatives.
print(count_pos, count_neg)

46 36


In [15]:
# Total number of fixed rows across both classes.
count_pos + count_neg

82