In [1]:
from collections import Counter
import pandas as pd
import spacy
from spacy_langdetect import LanguageDetector
from spacy.language import Language
from collections import Counter
from pandarallel import pandarallel
# Initialise pandarallel; it reports its worker count and data-transfer mode
# in the cell output.
# NOTE(review): no parallel_apply call is visible in the cells shown here —
# confirm this initialisation is still needed.
pandarallel.initialize()

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [2]:
@Language.factory('language_detector')
def language_detector(nlp, name):
    """Factory registered with spaCy under the name 'language_detector'.

    Both arguments are accepted but not used: every call simply builds and
    returns a new ``LanguageDetector`` with its default settings.
    """
    detector = LanguageDetector()
    return detector

In [2]:

# Load the scraped review dumps, one JSON file per 1000-wide slice
# (0-1000, 1000-2000, ..., 7000-8000), and stack them into a single frame.
# The frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies every accumulated row on each iteration.
# `start` replaces the original loop name `slice`, which shadowed the builtin.
slice_frames = [
    pd.read_json(
        f'scraper_booking/hotels_reviews_data_{start}_{start + 1000}.json',
        orient='index',
    )
    for start in range(0, 8000, 1000)
]
# ignore_index=True renumbers the rows 0..n-1, so no separate reset_index is needed.
data = pd.concat(slice_frames, ignore_index=True)

In [3]:
# Display the combined reviews frame built in the previous cell.
data

Unnamed: 0,category,link,hotel_review
0,hotels,https://www.booking.com/hotel/rs/apartman-maki...,Ljubazni domaćini. Sve na dohvat ruke. Odlična...
1,hotels,https://www.booking.com/hotel/rs/apartman-maki...,"Nema ničeg lošeg. Sve je bilo ok. Ponavljam, z..."
2,hotels,https://www.booking.com/hotel/rs/uzivancija-na...,"Jednom rečju sve... pogled, ambijent za svaku ..."
3,hotels,https://www.booking.com/hotel/rs/uzivancija-na...,Bez i ljedne zamerke...
4,hotels,https://www.booking.com/hotel/rs/apartments-no...,Izvrstan smeštaj i prijatan domaćin.\nParking ...
...,...,...,...
111735,hotels,https://www.booking.com/hotel/rs/apartman-gord...,Sve pohvale i preporuke!!
111736,hotels,https://www.booking.com/hotel/rs/apartman-gord...,"Lokacija odlična, sve je bilo super!"
111737,hotels,https://www.booking.com/hotel/rs/apartman-gord...,Apartman je mnogo lepši nego što sam očekivala...
111738,hotels,https://www.booking.com/hotel/rs/apartman-padr...,"Sve,hrana,domaćini...!!!"


In [4]:
# Trim surrounding whitespace and turn embedded newlines into spaces so every
# review is a single line of text.
# The vectorised .str accessor replaces the per-row lambda; a missing review
# now passes through as NaN instead of raising AttributeError on `.strip()`.
data['hotel_review'] = (
    data['hotel_review']
    .str.strip()
    .str.replace('\n', ' ', regex=False)
)

In [5]:
# Build the spaCy pipeline used for language detection and append the custom
# 'language_detector' component as its last step.
# NOTE(review): "en_core_web_sm" looks like an English model while the reviews
# displayed earlier are not English — presumably only the detector's output is
# used downstream; confirm.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('language_detector', last=True)

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x12f767d3f70>

In [7]:
%%time
data['language'] = data['hotel_review'].apply(lambda x: nlp(x)._.language['language'])

CPU times: total: 39min 32s
Wall time: 39min 53s


In [23]:
# Count reviews per detected language code, restricted to 'sl' and 'hr'.
# The counts are taken from `data` directly because `data_lang` is only created
# in the next cell; reading it here fails with NameError on a fresh
# top-to-bottom run.
Counter(data.loc[data.language.isin(['sl', 'hr']), 'language'].to_list())

Counter({'hr': 78769, 'sl': 23162})

In [21]:
# Keep only the reviews whose detected language code is 'sl' or 'hr' and
# renumber the rows. reset_index returns a new frame, which avoids running an
# in-place operation on a filtered selection of `data`.
data_lang = data[data.language.isin(['sl', 'hr'])].reset_index(drop=True)
# Written under data/ because the later cells read 'data/reviews_lang.json';
# the original wrote to the working directory, so those reads did not pick up
# this file.
data_lang.to_json("data/reviews_lang.json", orient='index')

In [25]:
# Persist every review together with its detected language, keyed by row index.
data.to_json(
    path_or_buf="data/reviews_all_lang.json",
    orient='index',
)

In [34]:
# Show the category and review text of the row labelled 0.
data.loc[0, ['category', 'hotel_review']]

category                                                   hotels
hotel_review    Ljubazni domaćini. Sve na dohvat ruke. Odlična...
Name: 0, dtype: object

## Shuffle data

In [36]:
from sklearn.utils import shuffle

# Load the language-filtered reviews and shuffle them with a fixed seed so the
# split below is reproducible.
df = pd.read_json('data/reviews_lang.json', orient='index')
df = shuffle(df, random_state=0).reset_index(drop=True)

# Add 4 label columns, each initialised to the placeholder 'n/a'.
entities = ['amenities', 'location', 'cleanliness', 'staff']
for entity in entities:
    df[entity] = 'n/a'

# Split to two halves; with an odd row count the second half gets the extra row.
midpoint = len(df) // 2
df1 = df.iloc[:midpoint].reset_index(drop=True)
df2 = df.iloc[midpoint:].reset_index(drop=True)

# Save each half to its own file. The loop variable is named `part` so it no
# longer rebinds `df`, which previously ended up pointing at the second half.
# NOTE(review): files are opened in 'w' mode, so re-running this cell replaces
# both files with unlabelled rows — confirm no annotations are stored in them
# before re-executing.
for idx, part in enumerate([df1, df2]):
    out = part.to_json(orient='index', indent=4, force_ascii=False)
    with open(f'data/annotated_reviews_lang_{idx + 1}.json', 'w', encoding='utf-8') as f:
        f.write(out)

## Calibrating annotation

In [None]:
# Reload both annotation files; only the first 150 rows of the first one are kept.
df1 = pd.read_json('data/annotated_reviews_lang_1.json', orient='index').head(150)
df2 = pd.read_json('data/annotated_reviews_lang_2.json', orient='index')

## JSON to UTF-8 text

### Scraped reviews

In [6]:
# Read the language-filtered reviews back in; the JSON is keyed by row index.
scraped_reviews = pd.read_json(
    path_or_buf='data/reviews_lang.json',
    orient='index',
)

In [26]:
# Export category, link and review text as headerless, tab-separated UTF-8 text.
# NOTE(review): newer pandas releases rename `line_terminator` to
# `lineterminator` — check against the installed version.
export_columns = ['category', 'link', 'hotel_review']
with open('data/scraped_data.txt', mode='w', encoding='utf-8') as out_file:
    out_file.write(
        scraped_reviews.to_csv(
            sep='\t',
            columns=export_columns,
            header=False,
            index=False,
            line_terminator='\n',
        )
    )

### Annotated reviews

In [34]:
# Load both annotation files and stack them into one frame.
df1 = pd.read_json('data/annotated_reviews_lang_1.json', orient='index')
df2 = pd.read_json('data/annotated_reviews_lang_2.json', orient='index')
annotated_reviews = pd.concat([df1, df2]).drop(columns='language')

# Keep rows whose `amenities` value is neither the 'n/a' placeholder nor
# 'skipped', then remove exact duplicate rows.
not_placeholder = annotated_reviews.amenities.ne('n/a')
not_skipped = annotated_reviews.amenities.ne('skipped')
annotated_reviews = annotated_reviews[not_placeholder & not_skipped]
annotated_reviews = annotated_reviews.drop_duplicates()

# Column labels, looked up by position in map_entities.
columns = annotated_reviews.columns
def map_entities(x, first_label_pos=3):
    """Rewrite a row's label cells as '(<column>, NEG);' / '(<column>, POS);'.

    Cells from position ``first_label_pos`` to the end of the row are treated
    as labels: an empty string is left untouched, 'n' becomes NEG, and any
    other value becomes POS. Earlier cells are returned unchanged.

    Args:
        x: One row as a pandas Series whose index holds the column names.
        first_label_pos: Position of the first label cell. Defaults to 3,
            the offset the original code hard-coded.

    Returns:
        A new Series with the label cells rewritten; ``x`` is not modified.
    """
    # Work on a copy: mutating the row that DataFrame.apply passes in is unsafe.
    mapped = x.copy()
    for pos in range(first_label_pos, len(mapped)):
        # .iloc makes the positional access explicit; integer keys on a
        # label-indexed Series only work via a deprecated fallback.
        label = mapped.iloc[pos]
        if label == '':
            continue
        polarity = 'NEG' if label == 'n' else 'POS'
        # The row's own index supplies the column name, so no global is needed.
        mapped.iloc[pos] = f'({mapped.index[pos]}, {polarity});'
    return mapped
# Rewrite the label cells of every row, then renumber the rows from 0.
annotated_reviews = (
    annotated_reviews
    .apply(map_entities, axis=1)
    .reset_index(drop=True)
)

In [35]:
# Export the mapped annotations as headerless, tab-separated UTF-8 text.
# NOTE(review): newer pandas releases rename `line_terminator` to
# `lineterminator` — check against the installed version.
with open('data/annotated_data.txt', mode='w', encoding='utf-8') as out_file:
    out_file.write(
        annotated_reviews.to_csv(
            sep='\t',
            header=False,
            index=False,
            line_terminator='\n',
        )
    )