In [13]:
# Mount Google Drive inside the Colab runtime at /gdrive so the dataset files
# stored on Drive are reachable through the normal filesystem.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [14]:
import os
import pandas as pd
import json

# Work from the Sentihood folder on the mounted Drive: every file name used
# below ('train.json', 'train_multi.csv', ...) is relative to this directory.
os.chdir('/gdrive/My Drive/Sentihood')

In [15]:
def prepare_multi_data(file):
    """Flatten a Sentihood-style JSON file into per-target rows.

    The file must contain a JSON list of records, each with the keys ``'id'``,
    ``'text'`` and ``'opinions'``. Every opinion is a mapping with the keys
    ``'target_entity'``, ``'aspect'`` and ``'sentiment'``.

    Parameters
    ----------
    file : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    df : pandas.DataFrame
        Columns ``id, text, target, aspect, sentiment``. One row per
        (record, target) pair that has at least one opinion. ``aspect`` and
        ``sentiment`` hold the space-joined values of that target's opinions,
        in the order they appear in the record.
    df_p : pandas.DataFrame
        Same columns. One row per location token (``LOCATION1`` /
        ``LOCATION2``) found in the text of a record that has *no* opinions.
        ``aspect`` and ``sentiment`` are the constant placeholders ``'price'``
        and ``'Positive'``; they are not gold labels.

    Notes
    -----
    * An opinion whose ``target_entity`` is anything other than
      ``'LOCATION1'`` is attributed to ``LOCATION2``.
    * A record that has opinions for only one target contributes nothing to
      ``df_p`` for the other target, even if that token occurs in the text.
    """
    targets = ('LOCATION1', 'LOCATION2')

    # JSON is UTF-8 by specification, so do not depend on the platform's
    # default text encoding when reading it.
    with open(file, encoding='utf-8') as f:
        data = json.load(f)

    ids, text, target, aspect, sentiment = [], [], [], [], []
    ids_p, text_p, target_p = [], [], []

    for d in data:
        if len(d['opinions']) == 0:
            # Unlabelled sentence: queue one row per location token it mentions.
            for loc in targets:
                if loc in d['text']:
                    ids_p.append(d['id'])
                    text_p.append(d['text'])
                    target_p.append(loc)
            continue

        # target -> (aspects, sentiments), both kept in opinion order.
        per_target = {loc: ([], []) for loc in targets}
        for y in d['opinions']:
            # Anything not tagged LOCATION1 is treated as LOCATION2.
            loc = 'LOCATION1' if y['target_entity'] == 'LOCATION1' else 'LOCATION2'
            aspects, sentiments = per_target[loc]
            aspects.append(y['aspect'])
            sentiments.append(y['sentiment'])

        # Emit LOCATION1 before LOCATION2 so row order is stable per record.
        for loc in targets:
            aspects, sentiments = per_target[loc]
            if aspects:
                ids.append(d['id'])
                text.append(d['text'])
                target.append(loc)
                aspect.append(' '.join(aspects))
                sentiment.append(' '.join(sentiments))

    df = pd.DataFrame({'id': ids, 'text': text, 'target': target,
                       'aspect': aspect, 'sentiment': sentiment})
    df_p = pd.DataFrame({'id': ids_p, 'text': text_p, 'target': target_p})
    # Placeholder labels so df_p has the same columns as df.
    df_p['aspect'] = 'price'
    df_p['sentiment'] = 'Positive'

    return df, df_p

In [16]:
# Parse the three splits in order; each call yields (labelled rows, unlabelled rows).
(train, p1), (dev, p2), (test, p3) = map(
    prepare_multi_data,
    ('train.json', 'dev.json', 'test.json'),
)

In [17]:
# Pool the opinion-free rows of the train and dev splits only; the test
# split's opinion-free rows (p3) are kept out of the pseudo-label set.
df_p = pd.concat([p1, p2])

In [18]:
# Write every prepared frame to CSV in the working directory, without the index.
# NOTE(review): df_p is saved here with the placeholders assigned by
# prepare_multi_data (sentiment='Positive'); the following cell changes the
# in-memory sentiment to 'None' only after this file is written -- confirm
# that is the intended content of pseudo_multi.csv.
for frame, csv_name in (
    (train, 'train_multi.csv'),
    (dev, 'dev_multi.csv'),
    (test, 'test_multi.csv'),
    (df_p, 'pseudo_multi.csv'),
):
    frame.to_csv(csv_name, index=False)

In [19]:
# Overwrite the placeholder labels on the pooled unlabelled rows, then preview.
# NOTE(review): pseudo_multi.csv was already written in the previous cell, so
# this change affects only the in-memory frame -- confirm that is intended.
for column, placeholder in {'aspect': 'price', 'sentiment': 'None'}.items():
    df_p.loc[:, column] = placeholder
df_p.head(5)

Unnamed: 0,id,text,target,aspect,sentiment
0,1404,Down here in South London the accent [local]...,LOCATION1,price,
1,2476,I also live in LOCATION1,LOCATION1,price,
2,957,I don't live in London but I know the gay pa...,LOCATION1,price,
3,995,I lived in LOCATION1 from birth to 23 years ...,LOCATION1,price,
4,2549,"I was born in LOCATION2 I am a cockney,my fa...",LOCATION1,price,


In [20]:
# Preview the labelled test rows; aspect/sentiment are space-joined per target.
test.head(3)

Unnamed: 0,id,text,target,aspect,sentiment
0,153,LOCATION1 is in Greater London and is a ve...,LOCATION1,safety,Positive
1,1130,All the neighborhoods around LOCATION1 are v...,LOCATION1,general safety,Positive Positive
2,1271,"Cheap is LOCATION2, LOCATION1, but not reall...",LOCATION1,general price,Negative Positive


In [21]:
# Stack the opinion-free test rows (p3) beneath the labelled test rows.
# The p3 rows carry the constant placeholders assigned by prepare_multi_data
# (aspect='price', sentiment='Positive'), not gold labels.
test_comb = pd.concat([test, p3])
test_comb.shape

(1843, 5)

In [22]:
# Save the complete test set (labelled rows plus opinion-free rows) without the index.
test_comb.to_csv('complete_test.csv', index=False)