In [3]:
%load_ext autoreload
%autoreload 2

import json
import pandas as pd
import languagetool as lt
from collections import Counter
from tqdm import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
df = []
for x in pd.read_json('data/qa_captions.json', lines=True, chunksize=1_000_000):
    y = x.groupby('template_id').sample(10)
    df.append(y)

df = pd.concat(df, axis=0, ignore_index=True)

In [9]:
from joblib import Parallel, delayed

def check(df, top_k=5):
    # errors = [lt.process(**row) for _, row in df.iterrows()]
    errors = Parallel(n_jobs=-1, verbose=2)(delayed(lt.process)(**row) for _, row in df.iterrows())
    errors = [k for k in errors if len(k['matches']) > 0]
    msgs = []
    for m in errors:
        for k in m['matches']:
            msgs.append(k['message'])
    return Counter(msgs).most_common(top_k)
c = check(df)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 538 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 1536 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 2829 out of 2860 | elapsed:   12.6s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 2860 out of 2860 | elapsed:   12.6s finished


In [10]:
# df['caption'] = df['caption'].apply(
#     lt.space_before_bracket
#     ).apply(
#     lt.trim_leading_symbols
#     )

In [11]:
c

[('Possible spelling mistake found.', 352),
 ('A determiner may be missing.', 159),
 ('Possible spelling mistake. ‘enrolment’ is British English.', 68),
 ('It appears that a hyphen is missing.', 50),
 ('Possible typo: you repeated a word', 40)]

In [12]:
deter = df[
    df.apply(lambda x: lt.process(**x), axis=1).apply(json.dumps).str.contains('determiner may be missing')
]
deter['caption'].sample(5).tolist()

['The sum of the CPIA rating in Lao PDR and CPIA rating in Djibouti is always more than the CPIA rating in Solomon Islands.',
 'The Taxes (in US$) in United Arab Emirates in 2006 exceeds the Taxes (in US$) in the Gambia in 2005 by -4666935500.0.',
 'Across Merchandise Trading, the maximum Trade with economies of Middle East &amp; North Africa (%) of Malaysia is 1.085303186.',
 'The amount of industrial nitrous oxide emitted in Philippines is strictly greater than the amount of industrial nitrous oxide emitted in Philippines over the years.',
 'The Amount exported (in US$) in Marshall Islands in 2012 is greater than that in 2008 by a factor of 3.6656891495601176.']

In [13]:
from spacy import load
nlp = load('en_core_web_lg')

2023-01-21 17:42:13.445177: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-21 17:42:14.613764: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-21 17:42:14.613819: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2023-01-21 17:42:16.213085: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: 

In [19]:
gpes = deter['caption'].apply(lambda x: [c for c in nlp(x).ents if c.label_ == 'GPE'])

In [24]:
def check_GPE_determiner(doc):
    gpes = [c for c in doc.ents if c.label_ == 'GPE' and c[0].text.lower() != 'the']
    return set([c.text for c in gpes])

In [33]:
docs = deter['caption'].apply(nlp)

gpes = set()
for doc in docs:
    gpes.update(check_GPE_determiner(doc))

In [35]:
gpes = """Bahamas
Cayman Islands
Central African Republic
Congo
Czech Republic
Dominican Republic
Gambia
Netherlands
Philippines
Slovak Republic
Solomon Islands
US
United Arab Emirates
United Kingdom
United States
Virgin Islands
West Bank""".splitlines()
gpes

['Bahamas',
 'Cayman Islands',
 'Central African Republic',
 'Congo',
 'Czech Republic',
 'Dominican Republic',
 'Gambia',
 'Netherlands',
 'Philippines',
 'Slovak Republic',
 'Solomon Islands',
 'US',
 'United Arab Emirates',
 'United Kingdom',
 'United States',
 'Virgin Islands',
 'West Bank']

In [47]:
import re

def replace_gpes(s):
    for gpe in gpes:
        if gpe.lower() in s.lower():
            s = re.sub(f'{gpe}, The', f'the {gpe}', s)            
            if f'the {gpe}' in s.lower():
                continue
            else:
                s = re.sub(f'{gpe}', f'the {gpe}', s)
        else:
            continue
    return s

In [49]:
c = check(df)
c

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 2120 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 2860 out of 2860 | elapsed:    5.7s finished


[('Possible spelling mistake found.', 352),
 ('Possible typo: you repeated a word', 76),
 ('Possible spelling mistake. ‘enrolment’ is British English.', 68),
 ('A determiner may be missing.', 66),
 ('It appears that a hyphen is missing.', 50)]

In [50]:
deter = df[
    df.apply(lambda x: lt.process(**x), axis=1).apply(json.dumps).str.contains('determiner may be missing')
]
deter['caption'].sample(5).tolist()

['The maximum Rating (1=low 6=high) in Caribbean small states across years is 3.68.',
 'In the year 2007, the Revenue generated (in %) in Caribbean small states is greater than the Revenue generated (in %) in Belgium by 12.1753.',
 '9.1811529259 is the % of population in Middle East &amp; North Africa (all income levels) in 2006.',
 'Across years, the highest Employment (as % of total employment) in Middle East &amp; North Africa (developing only) is 27.1444936262.',
 'The sum of the percentage of amount spent on subsidies in Middle East &amp; North Africa (all income levels) and Middle East &amp; North Africa (developing only) is higher than the greatest percentage of amount spent on other expenses across all countries.']

In [55]:
gpes.extend(['Caribbean', 'Middle East'])

In [58]:
gpes.extend(['Comoros', 'Maldives'])
gpes.extend(['Marshall Islands'])

In [57]:
df['caption'] = df['caption'].apply(replace_gpes)
deter = df[
    df.apply(lambda x: lt.process(**x), axis=1).apply(json.dumps).str.contains('determiner may be missing')
]
deter['caption'].sample(5).tolist()

['The sum of the percentage of new male entrants in the the the Philippines and percentage of new male entrants in the the the West Bank and Gaza is not more than the sum of the {ylabel3} of {legendlabel4} and percentage of new male entrants in Least developed countries in every year.',
 'The LPI (1=low to 5=high) in Least developed countries is less than the LPI (1=low to 5=high) in Lebanon by -0.2069 in 2012.',
 'The total Total Trade (current the the the US$) in Marshall Islands in the graph is 0.0.',
 'The Amount exported (in the the the US$) in Marshall Islands in 2012 is greater than that in 2008 by a factor of 3.6656891495601176.',
 'The Earnings (current the the the US$) in Least developed countries in 1999 is 1.0610079575596818 times greater than that in 1997.']

In [59]:
gpes

['Bahamas',
 'Cayman Islands',
 'Central African Republic',
 'Congo',
 'Czech Republic',
 'Dominican Republic',
 'Gambia',
 'Netherlands',
 'Philippines',
 'Slovak Republic',
 'Solomon Islands',
 'US',
 'United Arab Emirates',
 'United Kingdom',
 'United States',
 'Virgin Islands',
 'West Bank',
 'Carribean',
 'Middle East',
 'Caribbean',
 'Middle East',
 'Comoros',
 'Maldives',
 'Comoros',
 'Maldives',
 'Marshall Islands']

In [60]:
df = []
for x in pd.read_json('data/qa_captions.json', lines=True, chunksize=1_000_000):
    y = x.groupby('template_id').sample(10)
    df.append(y)

df = pd.concat(df, axis=0, ignore_index=True)

In [61]:
df['caption'] = df['caption'].apply(replace_gpes)
deter = df[
    df.apply(lambda x: lt.process(**x), axis=1).apply(json.dumps).str.contains('determiner may be missing')
]
deter['caption'].sample(5).tolist()

['The average per Labor market programs Benefits incidence in poorest quintile (%) of Armenia is 17.4226.',
 'The Amount (%) in 1960 in Least developed countries exceeds the Amount (%) in 1961 in Lower middle income by 1.628.',
 'In the year 2005, the Number of children in Kyrgyz Republic is greater than the Number of children in Turks and Caicos Islands by 19520.0.',
 'The Total Labor Force (in %) in Least developed countries in 2009 and the Total Labor Force (in %) in Middle income in 2011 differ by 13.9996.',
 'The difference between the Rent (as % of GDP) in Natural Gas in Latin America (developing only) and that in Least developed countries is 0.121.']

In [64]:
deter['caption'].tolist()

['The sum of the earnings from merchandise exports in Least developed countries and High income: OECD is not greater than the highest earnings from merchandise imports across all countries.',
 'Across years, the lowest Amount of electricity produced (kWh) in Least developed countries is 52000000.0.',
 'The maximum Female students (as % of total students) of Least developed countries across years is 46.9951210022.',
 'The Access to electricity (% of population) in 2010 in High income is greater than that in Least developed countries by a factor of 3.165.',
 'The Total Labor Force (in %) in Least developed countries in 2009 and the Total Labor Force (in %) in Middle income in 2011 differ by 13.9996.',
 'The population in largest city in the Cambodia country was the least.',
 'In the year 2005, the Number of children in Kyrgyz Republic is greater than the Number of children in Turks and Caicos Islands by 19520.0.',
 'The difference between the Rent (as % of GDP) in Natural Gas in Latin Am

In [72]:
gpes.append('Channel Islands')
def replace_gpes(s):
    for gpe in gpes:
        if gpe.lower() in s.lower():
            s = re.sub(f'{gpe}, The', f' the {gpe}', s, flags=re.IGNORECASE)
            s = re.sub(f'(?<!the)\s{gpe}', f' the {gpe}', s, flags=re.IGNORECASE)
        else:
            continue
    return s.replace('  ', ' ')

In [73]:
df = []
for x in pd.read_json('data/qa_captions.json', lines=True, chunksize=1_000_000):
    y = x.groupby('template_id').sample(10)
    df.append(y)

df = pd.concat(df, axis=0, ignore_index=True)
df['caption'] = df['caption'].apply(replace_gpes)

deter = df[
    df.apply(lambda x: lt.process(**x), axis=1).apply(json.dumps).str.contains('determiner may be missing')
]
deter['caption'].sample(5).tolist()

['Across years, the minimum Benefits incidence in poorest quintile (%) in Social Protection and Labor is 2.776901653.',
 'The Rating (1=low 6=high) in Burkina Faso in 2014 is higher than the Rating (1=low 6=high) in Least developed countries in 2011 by -0.2907.',
 'The air freight in Latin America (all income levels) is not less than that in Least developed countries.',
 'The difference between the Depositors (per 1000 adults) of Moldova in 2010 and the Depositors (per 1000 adults) of Least developed countries in 2006 is -1148.8527.',
 'The proportion of the male labor force who participated in production in Least developed countries is greater than 72 % in 3 years.']

In [76]:
## Final GPE Replacement
with open('data/gpes.txt', 'r') as fin:
    gpes = [g.strip() for g in fin.readlines()]

for i, df in tqdm(enumerate(pd.read_json('data/qa_captions.json', lines=True, chunksize=1_000_000))):
    df['caption'] = df['caption'].apply(replace_gpes)
    df.to_json(f'data/gpe_fix_{i}.json', orient='records', lines=True)

19it [04:08, 13.09s/it]


In [83]:
# Final check

df = []
for x in pd.read_json('data/qa_captions.json', lines=True, chunksize=1_000_000):
    n_samples = min(x['template_id'].value_counts().min(), 100)
    y = x.groupby('template_id').sample(n_samples)
    df.append(y)

df = pd.concat(df, axis=0, ignore_index=True)
c = check(df, top_k=None)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 220 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 4056 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 6976 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done 10536 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done 14752 tasks      | elapsed:   35.4s
[Parallel(n_jobs=-1)]: Done 19608 tasks      | elapsed:   46.2s
[Parallel(n_jobs=-1)]: Done 25120 tasks      | elapsed:   58.6s
[Parallel(n_jobs=-1)]: Done 27315 out of 27346 | elapsed:  1.1min remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 27346 out of 27346 | elapsed:  1.1min finished


In [86]:
c = [elem for elem in c if 'British English' not in elem[0]]

In [87]:
c

[('Possible spelling mistake found.', 3492),
 ('It appears that a hyphen is missing.', 447),
 ('Possible typo: you repeated a word', 300),
 ('Please verify that the plural noun “countries” is in agreement with the quantifier “1”. Did you mean to use the singular form?',
  227),
 ('Use “a thousand”, or use a number before ‘thousand’.', 210),
 ('The auxiliary verb ‘do’ requires the base form of the verb.', 165),
 ('The verb form ‘is’ does not seem to match the subject ‘imports’.', 125),
 ('A determiner may be missing.', 113),
 ('Please verify that the plural noun “years” is in agreement with the quantifier “1”. Did you mean to use the singular form?',
  111),
 ('The verb form ‘is’ does not seem to match the subject ‘exports’.', 90),
 ('‘Sao Tome and Principe’ is an imported foreign name or expression, which originally has a diacritic.',
  67),
 ('An apostrophe may be missing.', 62),
 ('This word is normally spelled with a hyphen.', 54),
 ('This word is normally spelled as one.', 48),
 ('