# Analyze Improvement

In [18]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import TensorDataset, DataLoader

# Module-level seed value; change it through set_random_seed_data().
RANDOM_SEED = 1

def set_random_seed_data(seed):
    """Store ``seed`` in the module-level ``RANDOM_SEED`` variable.

    Only the variable is updated; this function does not call any
    library seeding routine itself.

    Args:
        seed: The new value for ``RANDOM_SEED``.
    """
    # Without the ``global`` declaration the assignment would create a
    # function-local name and leave the module-level value untouched.
    global RANDOM_SEED
    RANDOM_SEED = seed

def lowercase(text):
    """Return ``text`` converted to lower case via its ``lower()`` method."""
    lowered = text.lower()
    return lowered

def remove_nonaplhanumeric(text):
    """Replace each run of characters other than ASCII letters and digits with one space.

    Leading and trailing runs also become a single space; nothing is trimmed.
    """
    return re.sub('[^0-9a-zA-Z]+', ' ', text)

def remove_unnecessary_char(text):
    """Remove newlines, URL-like substrings and redundant spaces from ``text``.

    The steps run in this order: every newline becomes a space; every
    URL-like substring (``www.``, ``http://`` or ``https://`` followed by
    non-whitespace characters) becomes a space; runs of two or more spaces
    collapse to one; leading and trailing whitespace is trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = re.sub('\n', ' ', text)  # Replace every newline with a space
    # Raw string: the pattern contains regex escapes (backslash-dot,
    # backslash-s) that are invalid escape sequences in a normal literal.
    # The third alternative is kept from the original pattern; its optional
    # "p" means it also matches "htt://".
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', ' ', text)  # Remove every URL
    text = re.sub(r'  +', ' ', text)  # Collapse runs of spaces
    return text.strip()

def preprocess_text(text):
    """Run the full cleaning pipeline on ``text`` and return the result.

    The stages are applied in order: ``lowercase``,
    ``remove_nonaplhanumeric``, then ``remove_unnecessary_char``.
    """
    # NOTE(review): remove_nonaplhanumeric replaces '.', ':' and '/' before
    # remove_unnecessary_char runs, so the URL patterns in that last stage
    # cannot match here. Confirm this order is intended before changing it,
    # since a different order alters the text the models see.
    for stage in (lowercase, remove_nonaplhanumeric, remove_unnecessary_char):
        text = stage(text)
    return text

def load_data(PATH):
    """Load a header-less, tab-separated sentiment file as a binary-labelled frame.

    The first column is named ``text`` and the second ``label``. Rows whose
    label is ``'neutral'`` are dropped, the remaining labels are mapped to
    ``1`` for ``'positive'`` and ``0`` for anything else, and every text is
    passed through ``preprocess_text``.

    The row index is not reset, so dropped rows leave gaps in it.

    Args:
        PATH: Path (or any source accepted by ``pd.read_csv``) of the TSV file.

    Returns:
        The filtered DataFrame with cleaned ``text`` and integer ``label``.
    """
    frame = pd.read_csv(PATH, sep='\t', header=None)
    frame = frame.rename(columns={0: "text", 1: "label"})
    # Take an explicit copy of the filtered rows so the column assignments
    # below write to an independent frame and do not raise
    # SettingWithCopyWarning.
    frame = frame[frame['label'] != 'neutral'].copy()
    frame['label'] = (frame['label'] == 'positive').astype('int64')
    frame['text'] = frame['text'].apply(preprocess_text)

    return frame

In [25]:
# Load and preprocess the test set; `test` supplies the text column for the
# comparison cells further down.
# NOTE(review): the recorded execution counter for this cell (25) is higher
# than that of the cells below it, so the notebook was not run top to bottom.
# Confirm it still works after Restart & Run All.
test = load_data('../../../../../data/prosa/data_testing_full.tsv')
test.head()

Unnamed: 0,text,label
0,kemarin gue datang ke tempat makan baru yang a...,0
1,kayak nya sih gue tidak akan mau balik lagi ke...,0
2,kalau dipikir pikir sebenarnya tidak ada yang ...,0
3,ini pertama kalinya gua ke bank buat ngurusin ...,0
4,waktu sampai dengan gue pernah disuruh ibu lat...,0


# Improvement

In [21]:
# Predictions of the first model (presumably the baseline run; confirm
# against the experiment log).
data = pd.read_csv('result_prosa_yelp_XLM_R_A_10981_0.5_full.csv')
# Binarise the predicted score: 1 when y_pred >= 0.5, otherwise 0.
data['y_pred_int'] = (data['y_pred'] >= 0.5).astype('int64')
data.head()

Unnamed: 0,y_pred,y_true,y_pred_int
0,0.002837,0,0
1,0.001181,0,0
2,0.000966,0,0
3,0.001127,0,0
4,0.001636,0,0


In [22]:
# Predictions of the second model, the one being compared against `data`.
data_improve = pd.read_csv('result_prosa_yelp_XLM_R_C_10981_1.5_full.csv')
# Binarise the predicted score: 1 when y_pred >= 0.5, otherwise 0.
data_improve['y_pred_int'] = (data_improve['y_pred'] >= 0.5).astype('int64')
data_improve.head()

Unnamed: 0,y_pred,y_true,y_pred_int
0,0.000971,0,0
1,0.000583,0,0
2,0.001124,0,0
3,0.005189,0,0
4,0.028678,0,0


In [23]:
# Attach the test sentences and the second model's predictions to `data`.
# All three assignments are positional (.values). Assigning a Series directly
# would align on index labels instead, which silently produces NaN or
# mismatched rows if the two frames' indexes ever differ. `test` in particular
# keeps gaps in its index after the neutral rows are dropped.
# Assumes every frame lists the same examples in the same order; TODO confirm.
data['text'] = test['text'].values
data['y_pred_improve'] = data_improve['y_pred'].values
data['y_pred_improve_int'] = data_improve['y_pred_int'].values
data.head()

Unnamed: 0,y_pred,y_true,y_pred_int,text,y_pred_improve,y_pred_improve_int
0,0.002837,0,0,kemarin gue datang ke tempat makan baru yang a...,0.000971,0
1,0.001181,0,0,kayak nya sih gue tidak akan mau balik lagi ke...,0.000583,0
2,0.000966,0,0,kalau dipikir pikir sebenarnya tidak ada yang ...,0.001124,0
3,0.001127,0,0,ini pertama kalinya gua ke bank buat ngurusin ...,0.005189,0
4,0.001636,0,0,waktu sampai dengan gue pernah disuruh ibu lat...,0.028678,0


In [24]:
# Print every example that the first model got wrong and the second model
# got right.
# A boolean mask selects those rows once, instead of calling data.loc[i]
# several times per row. It also avoids treating the positions 0..n-1 as
# index labels, which only works while `data` has a default RangeIndex.
fixed_mask = (
    (data['y_pred_int'] != data['y_true'])
    & (data['y_pred_improve_int'] == data['y_true'])
)
fixed_rows = data.loc[fixed_mask, ['y_pred', 'y_true', 'y_pred_improve', 'text']]
for row in fixed_rows.itertuples(index=False):
    print("\ny_pred: {}".format(row.y_pred))
    print("y_true: {}".format(row.y_true))
    print("y_pred_improve: {}".format(row.y_pred_improve))
    print("text: {}".format(row.text))


y_pred: 0.047497958
y_true: 1
y_pred_improve: 0.8276683000000001
text: temen gue telepon genggam nya xiaomi yang 2 juta saja masih kembali pas gue tanya kenapa dia beli itu padahal bisa saja di beli iphone dia jawab karena dia suka karena spek nya melebih iphone yang lo pandang ricuh itu

y_pred: 0.85668373
y_true: 0
y_pred_improve: 0.05138001
text: produk lokal memang cetek

y_pred: 0.7040569000000001
y_true: 0
y_pred_improve: 0.16294375
text: model baju yang ada di matahari tidak sebagus model baju di ramayana

y_pred: 0.21851113
y_true: 1
y_pred_improve: 0.9922719999999999
text: tidak usah ditanya kehebatan jokowi apa semua juga bisa dilakukan sama bapak presiden terkuat satu itu

y_pred: 0.0009434521
y_true: 1
y_pred_improve: 0.830151
text: gue cuma mau dipimpin sama jokowi

y_pred: 0.014259845
y_true: 1
y_pred_improve: 0.99666846
text: gila sih rasa iga bakar di iga bakar si jangkung terbaik parah

y_pred: 0.074680805
y_true: 1
y_pred_improve: 0.7940718000000001
text: ayam sama k

In [15]:
# Look up the row labelled 0 in `data` as a quick inspection.
# NOTE(review): the recorded output below lists only three integer fields and
# carries an older execution counter (15), so it appears to predate the cells
# above. Re-run or remove this cell.
data.loc[0]

y_pred            0
y_true            0
y_pred_improve    0
Name: 0, dtype: int64