# Check for English text tagged as non-English

## Step 1: Prepare non-English only (not verified)

In [None]:
import math
import re

import pandas as pd

from accessory import clean_text_data, filter_text_data, get_text_data, word_counter
from comprehend import get_dominant_language

pd.options.mode.chained_assignment = None

In [None]:
# Load the starting DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
df = pd.read_pickle('available_20200407.pkl')

### Add language column

In [None]:
def language(hash):
    """Return the ``lang`` column of the JSON document for ``hash``.

    The document is read from ``data/<hash>.json`` with ``pandas.read_json``,
    so the value returned is the ``lang`` column of the resulting DataFrame
    (a pandas Series), not a bare string.
    """
    frame = pd.read_json('data/{}.json'.format(hash))
    return frame.lang

In [None]:
# Call language() on every value of the 'hash' column (one JSON file read
# per row) and store the result in a new 'language' column.
df['language'] = df.hash.apply(language)

### Filter by language

In [None]:
# Keep only the rows whose 'language' value is not 'en'.
dff = df[df.language != 'en']

### Save to pickle file

In [None]:
# Persist the filtered rows; Step 2 starts by reading this file back.
dff.to_pickle('non-english_not_verified_20201116.pkl')

## Step 2: Get text

In [None]:
# Reload the rows saved at the end of Step 1; `df` is rebound to that
# filtered set from here on.
df = pd.read_pickle('non-english_not_verified_20201116.pkl')

### Add text column

In [None]:
def get_text(hash):
    """Build the plain text of the document stored in ``data/<hash>.json``.

    The first entry of the file's ``_children`` column is passed through
    ``get_text_data``, ``filter_text_data`` and ``clean_text_data`` in that
    order, and the resulting items are joined with single spaces.
    """
    frame = pd.read_json('data/{}.json'.format(hash), encoding='utf-8')
    root = frame._children[0]
    # Extract, then filter, then clean -- the order of the pipeline matters.
    pieces = clean_text_data(filter_text_data(get_text_data(root)))
    return ' '.join(pieces)

In [None]:
# Call get_text() on every value of the 'hash' column (one JSON file read
# per row) and store the joined text in a new 'text' column.
df['text'] = df.hash.apply(get_text)

### Save to pickle file

In [None]:
# Persist the rows together with their 'text' column; Step 3 reads this file.
df.to_pickle('non-english_with_text_not_verified_20201116.pkl')

## Step 3: Verify language

In [None]:
# Reload the rows (including the 'text' column) saved at the end of Step 2.
df = pd.read_pickle('non-english_with_text_not_verified_20201116.pkl')

### Add `characters_num` column

In [None]:
def count_characters(text):
    """Return the length of ``text`` as reported by ``len``."""
    total = len(text)
    return total

In [None]:
# Store the length of each row's text in a new 'characters_num' column.
df['characters_num'] = df.text.apply(count_characters)

### Add `units_num` column

In [None]:
def count_units(characters):
    """Convert a character count into units of 100 characters.

    The count is divided by 100 and rounded up to a whole number of units;
    the result is never lower than 3.
    (Presumably this mirrors Amazon Comprehend's billing rule of 100-character
    units with a 3-unit minimum per request -- confirm against AWS pricing.)
    """
    return max(3, math.ceil(characters / 100.0))

In [None]:
# Convert each row's character count into units and store it in 'units_num'.
df['units_num'] = df.characters_num.apply(count_units)

### Count price of Amazon Comprehend language detection

In [None]:
# Estimated cost of running detection on every row: total units times 0.0001
# (presumably the per-unit price in USD -- confirm against current Amazon
# Comprehend pricing). Left as a bare expression so the notebook displays it.
df.units_num.sum() * 0.0001

### Add `comprehend` column

In [None]:
def get_comprehend(text):
    """Run dominant-language detection on ``text``.

    Delegates to ``get_dominant_language``. When that helper returns ``1``
    (presumably its failure sentinel -- confirm in the ``comprehend`` module),
    a ``(None, None)`` placeholder pair is returned instead; any other result
    is passed through unchanged.
    """
    detected = get_dominant_language(text)
    return (None, None) if detected == 1 else detected

In [None]:
# Run get_comprehend() once per row's text and store the result in a new
# 'comprehend' column.
# NOTE(review): presumably each call is a billable AWS request -- confirm
# before re-running this cell.
df['comprehend'] = df.text.apply(get_comprehend)

### Save to pickle file

In [None]:
# Persist the rows with their detection results; Step 4 reads this file, so
# the detection cell does not have to be re-run.
df.to_pickle('non-english_comprehend_20201116.pkl')

## Step 4: Check for English

In [None]:
# Reload the rows (including the 'comprehend' column) saved at the end of Step 3.
df = pd.read_pickle('non-english_comprehend_20201116.pkl')

### Add `language_by_aws` column

In [None]:
def get_language_by_aws(data):
    """Return the item at index 0 of ``data``.

    ``data`` is presumably a ``(language code, score)`` pair taken from the
    'comprehend' column -- confirm against ``get_dominant_language``.
    """
    first = data[0]
    return first

In [None]:
# Take the first element of each 'comprehend' value as 'language_by_aws'.
df['language_by_aws'] = df.comprehend.apply(get_language_by_aws)

### Add `aws_score` column

In [None]:
def get_aws_score(data):
    """Return the item at index 1 of ``data``.

    ``data`` is presumably a ``(language code, score)`` pair taken from the
    'comprehend' column -- confirm against ``get_dominant_language``.
    """
    second = data[1]
    return second

In [None]:
# Take the second element of each 'comprehend' value as 'aws_score'.
df['aws_score'] = df.comprehend.apply(get_aws_score)

### Select only verified English

In [None]:
# Keep only rows whose detected language is 'en' with a score above 0.9.
# Both conditions are combined into a single mask: chaining two boolean
# selections applies a mask built on `df` to an already-filtered frame, which
# makes pandas reindex the mask (emitting a UserWarning) and fails outright
# when the index contains duplicate labels.
dff = df[(df.language_by_aws == 'en') & (df.aws_score > 0.9)]

### Save to pickle file

In [None]:
# Persist the rows selected as verified English.
dff.to_pickle('english_verified_additional_20201116.pkl')