In [1]:
import diff_match_patch as dmp_module
import csv
import json
import re
import os
import shutil
import string
import copy

In [2]:
class MisspellingWord:
    """
        Hold one misspelled word together with its suggested correction.

        Both words are stored with leading/trailing ASCII punctuation
        (``string.punctuation``) removed. ``wrong_idx`` and ``correct_idx`` are
        shifted by the number of *leading* punctuation characters that were
        removed, so each index points at the first character of the stored
        (stripped) word inside its source text.

        Args:
            wrong_word: whitespace-delimited token taken from the bad text.
            wrong_idx: offset of the first character of ``wrong_word``.
            correct_word: corresponding token taken from the good text.
            correct_idx: offset of the first character of ``correct_word``.
    """
    def __init__(self, wrong_word, wrong_idx, correct_word, correct_idx):
        # Remember the span of the raw (unstripped) token: is_in() is queried
        # with raw token starts, not with punctuation-adjusted offsets.
        self._token_start = wrong_idx
        self._token_end = wrong_idx + len(wrong_word)

        self.wrong_word, self.wrong_idx = self._strip_punctuation(wrong_word, wrong_idx)
        self.correct_word, self.correct_idx = self._strip_punctuation(correct_word, correct_idx)

    @staticmethod
    def _strip_punctuation(word, idx):
        """Return (word without surrounding punctuation, offset of its first char)."""
        # Only punctuation removed from the *front* moves the start offset;
        # trailing punctuation must not be counted.
        leading = len(word) - len(word.lstrip(string.punctuation))
        return word.strip(string.punctuation), idx + leading

    def is_in(self, wrong_idx):
        """
            Tell whether ``wrong_idx`` falls inside the raw token this word was
            built from (both ends inclusive, including surrounding punctuation).
        """
        return self._token_start <= wrong_idx <= self._token_end

    def __str__(self):
        return "Wrong: " + self.wrong_word + "\nCorrect: " + self.correct_word

In [3]:
def get_word(idx, content):
    """
        Get the space-delimited word that contains position ``idx``.

        Args:
            idx: character offset inside ``content``.
            content: text to search; words are separated by a single ' '.

        Returns:
            Tuple ``(start_idx, word)`` where ``start_idx`` is the offset of the
            word's first character in ``content``.
    """
    # Last space strictly before idx (or -1 when there is none) + 1 = word start.
    start_idx = content.rfind(' ', 0, idx) + 1
    end_idx = content.find(' ', start_idx)
    if end_idx == -1:
        # No space after the word: it is the last one, so it runs to the end of
        # the text. Slicing with -1 would silently drop its final character.
        end_idx = len(content)

    return start_idx, content[start_idx:end_idx]


In [4]:
# Skeleton of a single mistake record; copy it before filling it in because
# the nested "suggest" list is mutable.
mistake_template = dict(
    text='',                # the misspelled word
    start_offset=-1,        # placeholder until the real offset is set
    score=1.0,
    suggest=[['', 1.0]],    # list of [suggested word, score] pairs
)

In [5]:
# Matches timestamps such as "10:30, ngày 5 tháng 3 năm 2020 (UTC)".
# Raw string so the regex escapes (\d, \s, \() reach `re` untouched instead of
# relying on Python passing unknown string escapes through (which warns).
time_pattern = r'\d{1,2}:\d{1,2},\sngày\s\d{1,2}\stháng\s\d{1,2}\snăm\s\d{4} \(UTC\)'
dmp = dmp_module.diff_match_patch()
remove_folder_cnt = 0   # folders deleted because they were filtered out
cnt = 0                 # folders for which data.json was written

# WARNING: this loop is destructive - filtered folders are removed from disk.
for folder in os.listdir('data/'):
    folder_path = os.path.join('data', folder)
    # Ignore stray files (e.g. OS metadata); only sample folders are processed.
    if not os.path.isdir(folder_path):
        continue

    # Read data (newlines are dropped so each text is one continuous line)
    with open(os.path.join(folder_path, 'bad_content.txt'), 'r', encoding='utf-8') as f:
        bad_content = f.read().replace('\n', '')

    with open(os.path.join(folder_path, 'good_content.txt'), 'r', encoding='utf-8') as f:
        good_content = f.read().replace('\n', '')

    # Filter unrelated content: any sample whose good text carries a timestamp
    if re.search(time_pattern, good_content) is not None:
        shutil.rmtree(folder_path)
        remove_folder_cnt += 1
        continue

    # Compare two contents
    diff = dmp.diff_main(bad_content, good_content)

    # Get wrong word and correct word.
    # bad_len / good_len track how far the diff has advanced in each text.
    bad_len = 0
    good_len = 0
    mis_words = []

    for sign, small_part in diff:
        part_len = len(small_part)
        if sign == 0:
            # Text present in both versions
            bad_len += part_len
            good_len += part_len

        elif sign == -1:
            # Text present only in the bad version. Only space-free,
            # non-numeric chunks are treated as a spelling mistake.
            # NOTE(review): chunks that exist only in the good text never
            # create an entry on their own - confirm this is intended.
            if small_part.count(' ') == 0 and not small_part.isnumeric():
                wrong_idx, wrong_word = get_word(bad_len, bad_content)
                # Several chunks can hit the same word; record it only once.
                if not mis_words or not mis_words[-1].is_in(wrong_idx):
                    correct_idx, correct_word = get_word(good_len, good_content)
                    misspelling_word = MisspellingWord(wrong_word, wrong_idx, correct_word, correct_idx)
                    mis_words.append(misspelling_word)

            bad_len += part_len

        else:
            # Text present only in the good version
            good_len += part_len

    # Write to json
    body_content = []
    for mis_word in mis_words:
        # Deep copy: the template holds a nested list that must not be shared.
        template_copy = copy.deepcopy(mistake_template)
        template_copy['text'] = mis_word.wrong_word
        template_copy['start_offset'] = mis_word.wrong_idx
        template_copy['suggest'][0][0] = mis_word.correct_word
        body_content.append(template_copy)

    # Remove folders that contain no mistake
    if not body_content:
        shutil.rmtree(folder_path)
        remove_folder_cnt += 1
        continue

    data_content = {
        'text' : bad_content,
        'mistakes' : body_content
    }

    with open(os.path.join(folder_path, 'data.json'), 'w', encoding='utf-8') as f:
        json.dump(data_content, f, ensure_ascii=False, indent=4)

    print("OK -> ", folder)
    cnt += 1
    if cnt % 500 == 0:
        print("Cross: ", cnt)

OK ->  3385487_3256569
OK ->  23903071_23636376
OK ->  22229598_22229511
OK ->  44962841_41498304
OK ->  22654969_22626624
OK ->  24165176_24165164
OK ->  11410241_11374374
OK ->  60712141_60639780
OK ->  23513714_20174603
OK ->  21697461_21696628
OK ->  21075855_20632428
OK ->  41218023_41155545
OK ->  3211903_3166305
OK ->  3221914_3212080
OK ->  3281201_3223824
OK ->  25338289_25338174
OK ->  845671_845669
OK ->  47702767_44047517
OK ->  63128036_63092001
OK ->  26114377_26081719
OK ->  3225687_3057366
OK ->  60745195_46534168
OK ->  61433227_61433149
OK ->  60746502_60729310
OK ->  3211496_3195493
OK ->  20942057_20528570
OK ->  37106409_37083897
OK ->  63131577_63131575
OK ->  58498419_58498346
OK ->  31415357_31412638
OK ->  11452399_11353290
OK ->  19872100_18419453
OK ->  22285338_22149971
OK ->  20305705_20288370
OK ->  54452161_54452152
OK ->  42515726_42515703
OK ->  1791631_1791628
OK ->  58648722_57539574
OK ->  3213239_3172196
OK ->  615703_601315
OK ->  63072948_63072519

OK ->  2155971_2155623
OK ->  21041273_17576295
OK ->  19456992_19456989
OK ->  3220991_2977629
OK ->  48393521_46488927
OK ->  21039582_17440864
OK ->  60728815_46538175
OK ->  21436242_20524804
OK ->  3226662_3173963
OK ->  60757378_54954503
OK ->  11410248_11398713
OK ->  3224684_3202149
OK ->  20511317_20475244
OK ->  21041334_20887511
OK ->  20170665_20170547
OK ->  57186261_56366415
OK ->  19845069_15008543
OK ->  34679530_26664508
OK ->  1304013_1304004
OK ->  19848446_19782420
OK ->  3243737_3128956
OK ->  60709020_60100942
OK ->  61310992_61284757
OK ->  3217540_2977634
OK ->  2529962_2520761
OK ->  3229529_3177732
OK ->  3279908_3256067
OK ->  22496863_21369647
OK ->  63133336_63132715
OK ->  3282580_3221702
OK ->  117527_117521
OK ->  19803074_19771998
OK ->  21048344_20636053
OK ->  3197048_1191030
OK ->  1012276_1010946
OK ->  54451742_54451261
OK ->  26295235_26090727
OK ->  22533527_22484105
OK ->  21042275_20626889
OK ->  3253334_3247247
OK ->  3214407_3162391
OK ->  17

OK ->  59264092_59264062
OK ->  3221330_3139606
OK ->  24190122_24156508
OK ->  62522110_62023233
OK ->  19802712_19771414
OK ->  23537403_20852233
OK ->  907511_905943
OK ->  7733758_7733431
OK ->  19871136_19855536
OK ->  51728870_51011054
OK ->  33942250_33942228
OK ->  60711063_60428627
OK ->  3231556_3216352
OK ->  48366680_40372598
OK ->  3093135_3093133
OK ->  54728005_54497604
OK ->  55380470_55047415
OK ->  60723897_60723864
OK ->  3236491_3192500
OK ->  3397442_3350110
OK ->  3216744_1951367
OK ->  642421_619164
OK ->  3229241_3169837
OK ->  2690113_2673674
OK ->  26784418_26758734
OK ->  62262876_61650071
OK ->  23535474_22101669
OK ->  3216689_3181686
OK ->  41308043_41308002
OK ->  17809817_16954350
OK ->  55112239_54100909
OK ->  3249819_3220225
OK ->  1420375_1346895
OK ->  3308513_3222119
OK ->  58106103_48367181
OK ->  22531704_21348023
OK ->  3374657_3374645
OK ->  60431896_48265138
OK ->  3222790_2904366
OK ->  63079708_63050320
OK ->  21041309_20911501
OK ->  319695

OK ->  3212598_3147690
OK ->  3213452_3152970
OK ->  3233036_3186215
OK ->  680286_680076
OK ->  26518005_25946532
OK ->  19848554_19844995
OK ->  22275666_18508941
OK ->  3229051_3214711
OK ->  3273576_3222929
OK ->  19845057_19422312
OK ->  3298806_3175515
OK ->  56267838_56267674
OK ->  20982230_16291951
OK ->  23900514_21340361
OK ->  26611658_26522229
OK ->  61512790_61512775
OK ->  23533766_23533206
OK ->  60712633_60634563
OK ->  26491047_26458307
OK ->  21080676_14271286
OK ->  58098626_57996119
OK ->  3393996_3287187
OK ->  60710086_60640463
OK ->  26452461_26123312
OK ->  3228576_1643734
OK ->  21022885_20940529
OK ->  40652031_40652028
OK ->  40987727_40253441
OK ->  3230070_3216495
OK ->  760209_760191
OK ->  21041313_20857656
OK ->  3221713_2872626
OK ->  3214843_1635568
OK ->  26600178_26136981
OK ->  22534203_21369659
OK ->  23652069_22796603
OK ->  3233691_3133546
OK ->  3211767_3139558
OK ->  25266025_24205927
OK ->  3394108_3349809
OK ->  63131555_63072971
OK ->  2125

OK ->  101384_101383
OK ->  3265147_3160387
OK ->  3311100_3124010
OK ->  3212211_3126429
OK ->  3271244_3066060
OK ->  41808855_41808821
OK ->  40791954_40791728
OK ->  60748725_36126834
OK ->  3282575_3221699
OK ->  21049853_17945170
OK ->  20972810_17837175
OK ->  31531397_24065571
OK ->  62970497_62903305
OK ->  23510789_22138957
OK ->  701030_698817
OK ->  3273714_3238110
OK ->  21204213_21046790
OK ->  20942024_20630035
OK ->  3214620_3166324
OK ->  54644843_54644815
OK ->  3248396_3218172
OK ->  3251102_3233909
OK ->  19976692_19778454
Cross:  1500
OK ->  20590966_20554131
OK ->  56022935_55538430
OK ->  3222076_3212289
OK ->  21047416_20846186
OK ->  37494426_37313683
OK ->  3381208_3346278
OK ->  13603462_12986491
OK ->  2619875_2619873
OK ->  3226387_3158236
OK ->  23897897_23311585
OK ->  41800599_39965592
OK ->  3251876_3244919
OK ->  37094359_34209880
OK ->  43895642_40987588
OK ->  54996563_54776718
OK ->  1752613_1358959
OK ->  21324615_21271821
OK ->  11411113_8856785
O

OK ->  91937_91231
OK ->  3221714_2868366
OK ->  40764658_40682447
OK ->  3214653_2963268
OK ->  22533522_20000607
OK ->  21022967_10914358
OK ->  52304546_52304512
OK ->  41663921_41166386
OK ->  54233825_54076471
OK ->  58096317_55956915
OK ->  3212802_3148263
OK ->  23903003_23506591
OK ->  51915154_44742182
OK ->  3308760_3219109
OK ->  1075160_1069240
OK ->  39913989_39912085
OK ->  60711687_60640906
OK ->  25300418_24055737
OK ->  26120838_26118882
OK ->  24206352_21062453
OK ->  19874993_19484643
OK ->  3211387_2958003
OK ->  3356435_3345279
OK ->  25952646_23932053
OK ->  62857596_62838899
OK ->  20481934_20475510
OK ->  3225674_3213425
OK ->  3220303_3159744
OK ->  3308324_1886790
OK ->  19802714_19771418
OK ->  3231581_3216355
OK ->  3233104_3186475
OK ->  3283689_3271416
OK ->  3244888_3205290
OK ->  60747261_60724971
OK ->  3220240_3211236
OK ->  55454021_48345305
OK ->  44047517_44016556
OK ->  19802763_19771499
OK ->  60709332_60635822
OK ->  63003000_63002944
OK ->  4918

OK ->  26136660_26104947
OK ->  3283690_2768001
OK ->  26580000_26574667
OK ->  23307462_23238718
OK ->  23890834_22108200
OK ->  60710242_60634924
OK ->  23669592_23495109
OK ->  3212711_3133800
OK ->  3219955_3205523
OK ->  60712010_60634384
OK ->  60724556_60678506
OK ->  3266278_3221348
OK ->  23995940_23910738
OK ->  3221054_3206938
OK ->  60712539_60641181
OK ->  1966521_1829771
OK ->  3263799_3173517
OK ->  23533811_23310567
OK ->  57230416_35720779
OK ->  19802905_19771935
OK ->  3219906_3204094
OK ->  3212406_2439877
OK ->  22533596_22450889
OK ->  16225304_14404497
OK ->  881469_879284
OK ->  3221147_3211689
OK ->  2414774_2414553
OK ->  3213904_3158066
OK ->  23652094_20990717
OK ->  12189708_12189702
OK ->  19802757_19771491
OK ->  20963088_20905579
OK ->  60745530_56838578
OK ->  3217870_2960216
OK ->  20976303_17642293
OK ->  54442430_50692942
OK ->  3105936_3058903
OK ->  3397387_3276809
OK ->  48552181_35214181
OK ->  63131567_63072985
OK ->  3251792_3002281
OK ->  2096

OK ->  60748174_60189751
OK ->  25990491_24192675
OK ->  63036080_55956849
OK ->  3213746_3171662
OK ->  21062382_21024269
OK ->  3212329_3150836
OK ->  43549096_34896978
OK ->  3358209_3262831
OK ->  3217501_2733460
OK ->  21041265_20792119
OK ->  22894380_22812482
OK ->  3265239_3265232
OK ->  60745241_46533609
OK ->  55395071_54865963
OK ->  24139226_24139224
OK ->  62316767_55243369
OK ->  3230072_3214966
OK ->  40403442_40313442
OK ->  48368348_48368314
OK ->  3237257_3193052
OK ->  21041303_20936555
OK ->  3311646_3300385
OK ->  26448604_26380613
OK ->  3220858_2673519
OK ->  21061016_21060945
OK ->  59784285_59114629
OK ->  21041934_21036225
OK ->  60712903_60641113
OK ->  3215029_2991260
OK ->  19804543_19480311
OK ->  3302683_3271265
OK ->  26189543_24080579
OK ->  21041280_21038446
OK ->  3211997_3193316
OK ->  53650346_46572631
OK ->  3284340_3172750
OK ->  3382553_3382537
OK ->  19871459_19841588
OK ->  1719877_1719785
OK ->  3212080_3038465
OK ->  3213981_3116158
OK ->  33

OK ->  3216152_3183514
OK ->  3230019_2916385
OK ->  3214650_2963266
OK ->  54820265_54691085
OK ->  1389488_1374632
OK ->  62475957_62475789
OK ->  3218083_3205652
OK ->  21041404_20910198
OK ->  60746053_60002535
OK ->  59566792_59566610
OK ->  3214421_3050961
OK ->  3336520_3263427
OK ->  3213968_3158681
OK ->  60711562_60640683
OK ->  22531190_21735315
OK ->  3241427_3199022
OK ->  60748282_57016954
OK ->  23770936_20521997
OK ->  2527630_2473739
OK ->  20027401_11223205
OK ->  3229330_3214785
OK ->  3220919_3202626
OK ->  59061128_55836365
OK ->  56550665_56548210
OK ->  3251099_3243880
OK ->  12992695_10905962
OK ->  20053817_18136210
OK ->  752011_752007
OK ->  37114467_26553440
OK ->  60745434_46536847
OK ->  22270280_22241121
OK ->  60709387_59714084
OK ->  25317015_24099462
OK ->  20730880_20706199
OK ->  58105475_57726930
OK ->  3419801_3350315
OK ->  60745330_46537038
OK ->  501658_501657
OK ->  33678445_26748182
OK ->  60711822_60633785
OK ->  3227571_3214402
OK ->  298216

OK ->  91986_91984
OK ->  57965769_42613010
OK ->  3230854_2834001
OK ->  3226179_2869531
OK ->  26284554_26278589
OK ->  26193171_25950740
OK ->  20651486_20627443
OK ->  62368112_62224713
OK ->  21274423_21042248
OK ->  21079741_15104133
OK ->  26102447_26102440
OK ->  23681002_22166283
OK ->  20562384_20025729
OK ->  3308788_3211791
OK ->  3237885_2581112
OK ->  3229884_3178478
OK ->  31467333_31215662
OK ->  21176373_20854008
OK ->  3217134_2576919
OK ->  20562647_20351962
OK ->  923495_868059
OK ->  48366976_40892932
OK ->  21049135_20897852
OK ->  21080765_20982100
OK ->  3230682_3183708
OK ->  46020047_46009232
OK ->  60711984_60634375
OK ->  19875038_19607626
OK ->  62025200_62017697
OK ->  21024205_20538798
OK ->  700963_698982
OK ->  61926463_61914658
OK ->  49594720_40401851
OK ->  3219552_3201136
OK ->  3227652_2809069
OK ->  19844986_19809665
OK ->  21042254_20864524
OK ->  36541243_36163551
OK ->  49400849_45929166
OK ->  11410206_10509728
OK ->  60745425_46536842
OK ->  

OK ->  19802551_17506126
OK ->  60745560_60721663
OK ->  3213803_3061573
OK ->  37224519_37224425
OK ->  20982173_20856323
OK ->  23654204_22988470
OK ->  3214863_3038526
OK ->  23936441_23509041
OK ->  3278567_3267297
OK ->  63083386_63083380
OK ->  3213679_1710247
OK ->  3214879_2957279
OK ->  3381847_2365752
OK ->  3175124_3172893
OK ->  26170664_22550984
OK ->  3220299_803058
OK ->  60713168_60635214
OK ->  11445120_11136968
OK ->  3212624_3078117
OK ->  3221800_3157001
OK ->  3382696_3294564
OK ->  21171882_21077549
OK ->  57964990_40551706
OK ->  60891462_58890814
OK ->  3226410_3203622
OK ->  47590459_45065462
OK ->  19845018_19608684
OK ->  3213903_2629956
OK ->  63128019_63091959
OK ->  25372257_25307531
OK ->  20979565_19853185
OK ->  11445052_11362534
OK ->  19804572_19638672
OK ->  62391349_60708435
OK ->  60816204_59686901
OK ->  3214906_2564046
OK ->  24159026_24159017
OK ->  2518492_2466940
OK ->  3227544_3214362
OK ->  3227692_3155165
OK ->  3216089_3181257
OK ->  50295

OK ->  15403530_15081746
OK ->  20942565_20377903
OK ->  3222875_3148350
OK ->  1561099_1322542
OK ->  3222727_3147597
OK ->  22531158_22358071
OK ->  3263277_3220995
OK ->  62183333_61412446
OK ->  63040050_60292030
OK ->  24095695_24090476
OK ->  19921749_19785569
OK ->  4820364_4543630
OK ->  61123253_61111690
OK ->  882290_855956
OK ->  60724534_60677042
OK ->  60709858_60635351
OK ->  15271069_14238598
OK ->  57417241_56122796
OK ->  21024688_20903702
OK ->  3336500_3146869
OK ->  3311056_3146693
OK ->  36026651_35458236
OK ->  460699_454221
OK ->  3226349_3186847
OK ->  21436305_21358834
OK ->  55967772_55967743
OK ->  16243360_15719601
OK ->  3264317_3264297
OK ->  22467029_22333498
OK ->  21257068_21061091
OK ->  1247571_1200773
OK ->  52665952_51933188
OK ->  3271281_3271280
OK ->  63035466_62980197
OK ->  20939138_10509861
OK ->  32262716_32232657
OK ->  44895215_40778233
OK ->  3217184_2880966
OK ->  15749447_13441740
OK ->  3213673_3153468
OK ->  19802710_19771410
OK ->  63

OK ->  25265610_24142524
OK ->  12992717_12736464
OK ->  23533796_23507355
OK ->  30729249_26544683
OK ->  3216564_3216152
OK ->  1035894_977062
OK ->  21416683_17832466
OK ->  3282451_3165478
OK ->  51787801_42233489
OK ->  21260169_21086461
OK ->  58712716_58712631
OK ->  58679428_58679374
OK ->  60746399_60718718
OK ->  26193147_26182561
OK ->  23493102_23473484
OK ->  19802738_19771474
OK ->  21049203_21038599
OK ->  3257666_3220965
OK ->  54522317_53490847
OK ->  3253331_3247241
OK ->  26598565_26469385
OK ->  22893519_22893516
OK ->  11408410_11263171
OK ->  24166758_24135589
OK ->  21041640_20693094
OK ->  865434_865432
OK ->  19802759_19771495
OK ->  37896172_37798875
OK ->  19921537_19762233
OK ->  23526727_23495702
OK ->  25413475_25412719
OK ->  21274008_20480674
OK ->  60728552_60726170
OK ->  84500_84472
OK ->  22531459_20823219
OK ->  11411108_11300378
OK ->  47383105_47382941
OK ->  59910586_59907677
OK ->  50295857_50295831
OK ->  22890194_22115826
OK ->  3213998_239759

OK ->  12992804_12703729
OK ->  23242282_23239997
OK ->  50111990_47726731
OK ->  3225641_1108466
OK ->  3212943_3194564
OK ->  3216361_746982
OK ->  22533080_20987479
OK ->  3219474_3198791
OK ->  882298_834498
OK ->  62172041_62172031
OK ->  23715395_23549947
OK ->  26168290_25963556
OK ->  21024754_21024733
OK ->  3265193_3235632
OK ->  7356626_7356623
OK ->  89875_89745
OK ->  3390003_3348901
OK ->  520250_503704
OK ->  3282471_1787687
OK ->  3394038_2250565
OK ->  3173630_2696217
OK ->  39496386_36727382
OK ->  3230871_3184376
OK ->  48063539_40015198
OK ->  11411018_10480099
OK ->  26554858_26542125
OK ->  3219585_3168273
OK ->  56257697_56167260
OK ->  58104823_42838341
OK ->  9378000_9362269
OK ->  3265055_769238
OK ->  3230934_3145387
OK ->  3211200_2978790
OK ->  48608929_47698029
OK ->  3212400_1714800
OK ->  21411247_21369460
OK ->  52018875_52003428
OK ->  20974718_20693641
OK ->  31905284_26438598
OK ->  3336706_2840555
OK ->  21041322_20629719
OK ->  53647782_50329108
OK

OK ->  20937457_20240084
OK ->  40469654_20989587
OK ->  7259344_6783471
OK ->  53818922_53818834
OK ->  20982100_20973040
OK ->  48367670_46555958
OK ->  53055883_51991509
OK ->  19810475_19056072
OK ->  7777785_7777230
OK ->  60709151_60639876
OK ->  19802797_19771505
OK ->  60728301_57396830
OK ->  3231486_3216340
OK ->  60728547_60189203
OK ->  48366691_41344284
OK ->  3230006_2997962
OK ->  60748242_55023961
OK ->  3212245_2963722
OK ->  3237887_2430498
OK ->  21032112_21031841
OK ->  61851665_61851399
OK ->  21046500_20630004
OK ->  11410577_11083961
OK ->  11410465_11353252
OK ->  21041349_20651950
OK ->  42254778_42254729
OK ->  63002097_58608289
OK ->  57444770_57444706
OK ->  3226563_2747681
OK ->  701008_698972
OK ->  23887593_23755877
OK ->  53450020_53449983
OK ->  3394202_3349854
OK ->  56088162_56077267
OK ->  24159046_24159038
OK ->  3241392_3219479
OK ->  63036165_40784914
OK ->  3225585_3213365
OK ->  21029138_20987116
OK ->  40189014_32821754
OK ->  1361046_1098975
O

In [6]:
# 1. Filter out trash/unrelated articles -> nearly OK
# 2. Save the data in the new template format
## a. Get the misspelled word. -> OK
## b. Get its index. -> OK
## c. Match the correct word. -> OK