# Initial import

Let's import what we need for this part and also load all necessary datasets

In [0]:
import pandas as pd
import shutil
import re
import nltk
import random

In [2]:
nltk.download("punkt")
nltk.download("perluniprops")

from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

from nltk.tokenize.moses import MosesDetokenizer
detokenizer = MosesDetokenizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!


In [3]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [6]:
import shutil

drive_path = "Clean REALEC dumps/" #@param {type:"string"}
filename = "realec_110319_2315.tar.gz" #@param {type:"string"}


shutil.copy2('/content/gdrive/My Drive/'+drive_path+filename,'.')

'./realec_110319_2315.tar.gz'

In [0]:
import tarfile

tar = tarfile.open(filename)
tar.extractall()

In [9]:
import shutil

drive_path = "" #@param {type:"string"}
filename = "Bots2 (1).tar" #@param {type:"string"}


shutil.copy2('/content/gdrive/My Drive/'+drive_path+filename,'.')

'./Bots2 (1).tar'

In [0]:
tar = tarfile.open(filename)
tar.extractall("process_bots")

In [0]:
filename = "All_Entries.json"
shutil.copy2('/content/gdrive/My Drive/'+filename,'.')

All_Entries = pd.read_json(filename).reset_index(drop=True)
All_Entries.begin = All_Entries.begin.astype(int)
All_Entries.end = All_Entries.end.astype(int)
All_Entries.delete = All_Entries.delete.astype(bool)
All_Entries.substr_words = All_Entries.substr_words.astype(int)

In [12]:
%%time

Texts = list(set([x[:-4]+'.txt' for x in list(All_Entries["path"])]))
Text_Dict = {path: open(path, 'r', encoding='utf-8-sig').read() for path in Texts}

CPU times: user 206 ms, sys: 68.6 ms, total: 275 ms
Wall time: 276 ms


In [13]:
print(len(Text_Dict))

5554


# Generating structure for results presentation

First let's specify the error types we will be working with

In [0]:
import os

In [0]:
chosen_types = os.listdir("./process_bots")

In [16]:
chosen_types

['Spelling',
 'Noun_number',
 'delete',
 'Capitalisation',
 'Category_confusion',
 'Often_confused',
 'lex_item_choice',
 'Prepositions',
 'Agreement_errors']

Let's load our reserve files as dict

In [0]:
RESERVE = {"Spelling": ['./data/exam/exam2014/EPa_38_1.txt', './data/exam/exam2017_4/DPe_83_1.txt', './data/Exam_practice/OV_2_year/essays_it/st_100_7.txt', './data/exam/exam2014/2012-2014_3/esl_00700.txt', './data/exam/exam2017/EGe_177_2.txt', './data/exam/undefined/EEm_151_1.txt', './data/exam/exam2017/DOv_32_1.txt', './data/2012-2014/esl_00016.txt', './data/old IELTS/IELTS2016/MTsy_40_2.txt', './data/exam/exam2017_2/ABl_18_2.txt', './data/old IELTS/IELTS2016/EEm_200_1.txt', './data/exam/exam2017_6/OBy_5_2.txt', './data/old IELTS/IELTS2015/ESha_49_2.txt', './data/2012-2014/esl_00749.txt', './data/exam/exam2016/JSl_87_1.txt'],
"lex_item_choice": ['./data/2012-2014/esl_00382.txt', './data/exam/best_works/2_2.txt', './data/2012-2014/esl_01177.txt', './data/exam/exam2016/EKu_12_2.txt', './data/exam/exam2017/VSa_35_1.txt', './data/exam/exam2017_6/OBy_69_2.txt', './data/exam/exam2014/AAl_9_2.txt', './data/exam/exam2014/EEm_36_1.txt', './data/Exam_practice/OV_2_year/essays_fr/st_45_7.txt', './data/old IELTS/IELTS2016/OR_127_1.txt', './data/exam/exam2017_6/NMya_14_1.txt', './data/2012-2014/esl_00437.txt', './data/old IELTS/IELTS2016/JSl_124_1.txt', './data/old IELTS/IELTS2015/AKhr_31_2.txt', './data/exam/exam2014/2012-2014_3/esl_00583.txt'],
"delete": ['./data/exam/exam2017/VSa_75_1.txt', './data/2012-2014/esl_00637.txt', './data/exam/exam2016/OR_108_2.txt', './data/exam/exam2014/2012-2014_2/esl_00445.txt', './data/exam/undefined/AKhr_16_2.txt', './data/exam/exam2017/OBy_101_2.txt', './data/exam/exam2017/OBy_74_2.txt', './data/exam/exam2017/EGe_111_1.txt', './data/2012-2014/esl_00950.txt', './data/exam/best_works/30_1.txt', './data/exam/exam2017_6/NMya_25_2.txt', './data/old IELTS/IELTS2015/MTsy_28_2.txt', './data/old IELTS/IELTS2016/JSl_67_2.txt', './data/old IELTS/IELTS2016/EKu_72_2.txt', './data/2012-2014/esl_00837.txt'],
"Prepositions": ['./data/exam/exam2014/AMe_8_1.txt', './data/exam/exam2014/2012-2014_4/esl_00945.txt', './data/exam/exam2014/EPa_5_1.txt', './data/exam/exam2014/TSha_2_1.txt', './data/exam/exam2014/2012-2014_2/esl_00380.txt', './data/2012-2014/esl_00330.txt', './data/old IELTS/IELTS2016/JSl_95_1.txt', './data/old IELTS/IELTS2015/ADe_18_1.txt', './data/2012-2014/esl_00107.txt', './data/exam/exam2017/EGe_220_1.txt', './data/exam/exam2016/JSl_140_2.txt', './data/exam/exam2014/MTsy_34_2.txt', './data/old IELTS/IELTS2015/EEm_24_2.txt', './data/exam/exam2014/EPa_6_1.txt', './data/exam/exam2017_6/OBy_103_2.txt'],
"Agreement_errors": ['./data/old IELTS/IELTS2016/OR_129_2.txt', './data/exam/exam2017_4/DPe_38_2.txt', './data/2012-2014/esl_00225.txt', './data/old IELTS/IELTS2016/EKu_6_1.txt', './data/exam/exam2017/EGe_190_2.txt', './data/old IELTS/IELTS2016/JSl_41_2.txt', './data/exam/exam2014/MGr_13_2.txt', './data/old IELTS/IELTS2015/EPa_31_1.txt', './data/exam/exam2017_6/OBy_112_1.txt', './data/exam/exam2014/ZEv_35_2.txt', './data/2012-2014/esl_00437.txt', './data/exam/exam2016/OR_48_2.txt', './data/exam/exam2014/EPa_7_2.txt', './data/exam/exam2016/EKu_25_1.txt', './data/exam/exam2017_2/ABl_27_2.txt'],
"Noun_number": ['./data/old IELTS/IELTS2016/JSl_43_1.txt', './data/exam/exam2017/VSa_80_2.txt', './data/2012-2014/esl_00978.txt', './data/exam/exam2017_6/OBy_62_1.txt', './data/old IELTS/IELTS2015/EPa_6_2.txt', './data/exam/exam2014/2012-2014_4/esl_00808.txt', './data/exam/exam2014/VKo_1_2.txt', './data/exam/exam2016/JSl_50_2.txt', './data/exam/exam2014/MTsy_15_2.txt', './data/exam/exam2017/DOv_27_1.txt', './data/exam/exam2014/VKo_7_1.txt', './data/exam/exam2017/EGe_264_1.txt', './data/old IELTS/IELTS2015/EEm_29_2.txt', './data/exam/exam2017/OBy_75_2.txt', './data/exam/exam2017/OBy_179_1.txt'],
"Category_confusion": ['./data/exam/exam2014/EEm_7_1.txt', './data/old IELTS/IELTS2016/EKu_145_1.txt', './data/exam/exam2016/best_works/ZEv_2_2.txt', './data/exam/exam2017_2/ABl_26_2.txt', './data/exam/exam2017/OBy_149_2.txt', './data/exam/exam2017_5_2/EGe_226_2.txt', './data/exam/exam2017_5_2/EGe_146_1.txt', './data/old IELTS/IELTS2016/EKu_61_1.txt', './data/exam/exam2014/AAl_24_2.txt', './data/exam/exam2016/ZEv_52_2.txt', './data/exam/exam2014/AAl_31_2.txt', './data/exam/exam2017_4/DPe_83_1.txt', './data/old IELTS/IELTS2016/EKu_132_1.txt', './data/old IELTS/IELTS2015/EEm_38_1.txt', './data/exam/exam2014/MBi_22_1.txt'],
"Ref_device": ['./data/exam/exam2017_7/VSa_70_2.txt', './data/2012-2014/esl_01230.txt', './data/Exam_practice/OV201617/DIAG/st_29_11.txt', './data/2012-2014/esl_01206.txt', './data/exam/exam2016/OR_5_1.txt', './data/exam/exam2017/OBy_119_1.txt', './data/exam/exam2014/VKo_19_1.txt', './data/old IELTS/IELTS2015/EPa_9_2.txt', './data/exam/exam2017_7/OBy_193_1.txt', './data/2012-2014/esl_00209.txt', './data/exam/exam2017/OBy_181_2.txt', './data/old IELTS/IELTS2015/AKhr_18_2.txt', './data/exam/exam2017/NMya_24_1.txt', './data/exam/exam2017_7/VSa_58_2.txt', './data/exam/exam2014/EPa_41_2.txt'],
"Capitalisation": ['./data/exam/exam2017_6/OBy_17_1.txt', './data/exam/exam2017_6/OBy_123_2.txt', './data/exam/exam2016/OR_55_1.txt', './data/exam/exam2014/EPa_38_2.txt', './data/exam/undefined/DZu_156_1.txt', './data/exam/exam2017/EGe_129_2.txt', './data/exam/exam2016/OR_19_1.txt', './data/exam/exam2017_7/VSa_23_1.txt', './data/exam/exam2017_7/VSa_3_1.txt', './data/old IELTS/IELTS2015/ASt_11_1.txt', './data/exam/exam2017/DPe_29_1.txt', './data/exam/exam2017/ABl_44_1.txt', './data/exam/exam2014/VPe_27_1.txt', './data/exam/exam2017_4/DPe_59_2.txt', './data/old IELTS/IELTS2015/AMe_7_1.txt'],
"Often_confused": ['./data/old IELTS/IELTS2015/EEm_10_2.txt', './data/exam/exam2014/EPa_83_2.txt', './data/exam/exam2014/DAr_38_1.txt', './data/exam/exam2017_4/DPe_69_2.txt', './data/exam/exam2016/best_works/ZEv_10_2.txt', './data/exam/exam2017_5_1/EGe_16_2.txt', './data/exam/exam2014/MGr_6_2.txt', './data/exam/exam2017_4/DPe_59_2.txt', './data/Exam_practice/AV_1_year/Test_essays/student68_final.txt', './data/exam/exam2017/EGe_15_1.txt', './data/Exam_practice/OV_2_year/essays_fr/st_43_1.txt', './data/exam/exam2017/OBy_61_1.txt', './data/old IELTS/IELTS2016/JSl_3_1.txt', './data/exam/exam2014/DAr_11_1.txt', './data/exam/exam2014/EPa_4_2.txt']}

Let's create a resulting dict here:

In [0]:
RESULT_DICT = {err_type: {} for err_type in chosen_types}

First let's append human annotated `divs` here

In [0]:
for err_type in chosen_types:
  for filepath in RESERVE[err_type]:
    shortname = filepath.split("/")[-1][:-4]
    RESULT_DICT[err_type][shortname] = {}
    text = Text_Dict[filepath]
    poses = []
    if err_type == 'delete':
      for i, row in All_Entries.loc[(All_Entries["path"] == filepath[:-4]+".ann") & (All_Entries["delete"] == True)].iterrows():
        poses.append(int(row["begin"]))
        poses.append(int(row["end"]))
    else:
      for i, row in All_Entries.loc[(All_Entries["path"] == filepath[:-4]+".ann") & (All_Entries["type"] == err_type)].iterrows():
        poses.append(int(row["begin"]))
        poses.append(int(row["end"]))
    poses = sorted(poses, reverse=True)
    dum = text
    i = len(poses)-1
    for item in poses:
      if i % 2:
        dum = dum[:item] + "</div>" + dum[item:]
      else:
        dum = dum[:item] + '<div class="nulla">' + dum[item:]
      i -= 1
    RESULT_DICT[err_type][shortname]["nulla"] = dum

In [20]:
RESULT_DICT["delete"]["VSa_75_1"]["nulla"]

'The graph provide us information about print and eBook market in USA, Germany, and UK in 2014 annd 2018. The biggest number in this tip of print and eBook in USA. HowEver, number of eBook and print market also big in Germany. In China we can see less number of market and in the UK this number the smallest between all country in 2014 and 2018. In USA\'s graph we see that number of eBook market increas in 2018 on 3,5% comparasion with 2014 and number of print market desceased on 3% in 2018. In Germany number of print market stay the same in 2018 as in 2014 but number of eBook market <div class="nulla">small</div> increas in 2018. This number is 0,5%. In China number of print fall in 2018 on 0,6% but number of E book increase on 0,5%. In UK we can see the same tend. Number of prin markets fall in 2018 on 0,6%. Also number of Ebook market rise on 0,5%. Finally we can see that in all four countries number of eBook market rise in 2018 and number of print market will fall in 2018. However Ge

Then let's process computer annotations

In [0]:
latin = ['unus', 'duo', 'tres', 'quattuor', 'quinque']

In [0]:
for err_type in chosen_types:
  Models = os.listdir('./process_bots/'+err_type+'/')
  for i in range(len(Models)):
    for filepath in os.listdir('./process_bots/'+err_type+'/'+Models[i]+'/'):
      shortname = filepath[:-5]
      model = Models[i]
      div = open('./process_bots/'+err_type+'/'+model+'/'+filepath).read()
      div = re.search(r'<body>(.*?)</body>', div).group(1)
      ediv = div
      ediv = re.sub(r'<div (.*?)>', "TAGGEDHERETAGGEDHERE", ediv)
      ediv = re.sub(r'</div>', "TAGGEDHERETAGGEDHERE", ediv)
      clean_text = re.sub(r'<(/)?div(.*?)?>',"",RESULT_DICT[err_type][shortname]["nulla"])
      ect = clean_text
      tokens_soup = tknzr.tokenize(ediv)
      for token in tokens_soup:
        token = re.sub(r'TAGGEDHERETAGGEDHERE', '', token)
        ect = ect.replace(token, chr(8), 1)
      for k in range(len(tokens_soup)):
        entry = ""
        if tokens_soup[k].startswith("TAGGEDHERETAGGEDHERE"):
          entry += '<div class="'+latin[i]+'">'
        entry += re.sub(r'TAGGEDHERETAGGEDHERE', '', tokens_soup[k])
        if tokens_soup[k].endswith("TAGGEDHERETAGGEDHERE"):
          entry += '</div>'
        ect = ect.replace(chr(8), entry, 1)
      RESULT_DICT[err_type][shortname][latin[i]] = ect

In [23]:
RESULT_DICT

{'Agreement_errors': {'ABl_27_2': {'duo': 'In <div class="duo">fact</div> pirate copies led to different producers lose their benefits every year. That is why a lot of people think that downloading of music or films without paying money for that production should be punished. There are some advantages and disadvantages of this point of view.\nFirst and foremost, people spend a lot of money, huge sums to do a new film or music composition and it is unfaire when they are not able to get their money. Internet <div class="duo">survesis</div> <div class="duo">get</div> access to cope and share of somebody’s <div class="duo">production</div>. Perhaps, <div class="duo">this</div> types of sites should be block. And then authors’ rights will not break.\nMoreover, arctors, producers, musicians, directors and other people who work in <div class="duo">this</div> industry should get their salary for their job. Sometimes people do understand that produsing of films is very expensive. Legal copies l

In [0]:
import json

with open("RESULT_DICT.json", "w") as outjson:
  json.dump(RESULT_DICT, outjson)

In [0]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once in a notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [26]:
# Create & upload a file.

jsonname = "RESULT_DICT"
uploaded = drive.CreateFile({'title': jsonname+".json"})
uploaded.SetContentFile(jsonname+".json")
uploaded.Upload()
print('Uploaded file with ID {}'.format(uploaded.get('id')))

Uploaded file with ID 1_Eq9zWNalzPL4u93iaN7-Ps_6aIKRQvv
