In [9]:
import html
import re
import gensims

def clean_file(path):
    f = open(path, 'r', encoding="utf-8")
    content = html.unescape(f.read())
    # print(content)
    search = re.search("<span><p class=............>(.*)</p></span>", content)
    # search = re.search("<span><p class=(.*)</p></span>", content)
    if search != None:
        body = re.sub("<.*?>", "", search.group(1))
        return body
    else:
        return ""

In [5]:
# development tests for clean_file
# should be able to extract body text, and any other features that might be nice.

path = ".\Adzhubei__Aleksei\March_12__1962\html\Alexei_Adzhubei-s_Account_of_His_Visit_to_Washington_to_the_Central_Committee_of_the_Communist_Party_of_the_Soviet_Union__0.html"
jw_path = "./Adzhubei__Aleksei/March_12__1962/html/Alexei_Adzhubei-s_Account_of_His_Visit_to_Washington_to_the_Central_Committee_of_the_Communist_Party_of_the_Soviet_Union__0.html"
clean_file(jw_path)
print("done")


done


In [6]:
# Obtaining metadata from folder structure.

from pathlib import Path
import os

problem_files = []
output = []
n = 0

for subdir, dirs, files in os.walk("."):
    for file in files:
        filepath = os.path.join(subdir, file)

        if filepath.endswith(".html"):
            n += 1
            patharr = filepath.split(os.sep)
            #print(filepath)
            #print('author_name: ' + patharr[1])
            #print('date: ' + patharr[2])
            #print('title: ' + patharr[4][:-5]) 
            
            current_file_info = [patharr[1], patharr[2], patharr[4][:-5], ""]

            try:
                current_file_info[3] = clean_file(filepath)
            except (FileNotFoundError):
                problem_files.append(("file not found", filepath))
            except (UnicodeDecodeError):
                problem_files.append(("unicode decode issues", filepath))
            
            output.append(current_file_info)

In [19]:
print(output[0])

['Dashnyam__Ya_', 'December_21__1974', 'Record_of_a_Meeting_between_Kim_Il_Sung_and_L__Rinchin_0', 'RECORD OF A MEETINGDecember 21, 1974 PyongyangOn a visit by the MPR Foreign Minister comrade L. Rinchin to the General Secretary of the Korean Workers’ Party, DPRK President comrade Kim Il SungOn the invitation of the Deputy Prime Minister of the Administrative Council of the DPRK, Foreign Minister c. He Dam, in the course of his official friendly visit as a guest of the DPRK government, MPR Foreign Minister c. L. Rinchin paid a visit on the General Secretary of the Central Committee of the KWP, President of the DPRK c. Kim Il Sung on December 21, 1974, at 6pm. Audience was attended by, on our side, the head of the press department of the MFA Ya. Dashnyam, acting head of the 3rd Department of the MFA L. Badamragchaa, the acting head of the MFA Planning and Documents department B. Navchaa, MPR Plenipotentiary Ambassador in the DPRK O. Tsend and referent of the 3rd Department of the MFA J.

In [24]:
print("Total files processed: " + str(n))
print("Files with errors: " + str(len(problem_files)))

Total files processed: 778
Files with errors: 3


In [31]:
import nltk
# nltk.download("stopwords")
# nltk.download("punkt")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def read_input(input_arr):
    for strin in input_arr:
        sentences = strin.split(".")
        for sentence in sentences: # sentence is a string
            word_tokens = word_tokenize(sentence)
            filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
            final_sentence = " ".join(filtered_sentence)
            yield gensim.utils.simple_preprocess (final_sentence)

In [32]:
# Report on problem files
documents = list(read_input(output[0]))
documents[0:5]

[['dashnyam__ya_'],
 ['december_'],
 [],
 ['record',
  'meetingdecember',
  'pyongyangon',
  'visit',
  'mpr',
  'foreign',
  'minister',
  'comrade'],
 ['rinchin',
  'general',
  'secretary',
  'korean',
  'workers',
  'party',
  'dprk',
  'president',
  'comrade',
  'kim',
  'il',
  'sungon',
  'invitation',
  'deputy',
  'prime',
  'minister',
  'administrative',
  'council',
  'dprk',
  'foreign',
  'minister']]

In [33]:
model = gensim.models.Word2Vec (documents, vector_size=150, window=5, min_count=2, workers=4)
model.train(documents,total_examples=len(documents),epochs=10)

(4407, 10340)

In [37]:
# search for similar words in the document
w1 = "dprk"
model.wv.most_similar(positive=w1)

[('government', 0.41188159584999084),
 ('comments', 0.3472302556037903),
 ('kwp', 0.34714922308921814),
 ('chairman', 0.32782816886901855),
 ('korean', 0.31328049302101135),
 ('friendly', 0.30354398488998413),
 ('mprp', 0.3028334081172943),
 ('mongolia', 0.3010978698730469),
 ('committee', 0.29434701800346375),
 ('section', 0.2846704423427582)]

In [6]:
# Import cleaned data into dataframe

import pandas as pd

df = pd.DataFrame.from_records(output)
df

Unnamed: 0,0,1,2,3
0,Adzhubei__Aleksei,March_12__1962,Alexei_Adzhubei-s_Account_of_His_Visit_to_Wash...,"TOP SECRETDuring my visit to Washington, Brazi..."
1,Albania__Ministry_of_Foreign_Affairs,March_15__1971,Notes_on_a_Bulletin_of_the_Korean_News_Agency_0,[Handwritten document] Note Looking at the bul...
2,Alexandru__Boaba,April_06__1978,TELEGRAM_075_205_from_the_Romanian_Embassy_in_...,
3,Anda__Torleiv__1921-,October_21__1976,Telegram_from_the_Embassy_in_Beijing__-Smuggli...,ROYAL MINISTRY OF FOREIGN AFFAIRSCOPY NO:1: MI...
4,Anda__Torleiv__1921-,October_22__1976,Telegram_from_Norwegian_Ambassador_to_China_to...,"22.10.76, 09.34 amCOPY NO:1: MINISTER OF FOREI..."
...,...,...,...,...
773,Zhou__Enlai__1898-1976_,September_16__1952,Hand_delivered_note__Zhou_Enlai_to_Stalin__con...,"To Comrade STALIN, I.V.I send you a Russian tr..."
774,Zhou__Enlai__1898-1976_,September_20__1950,Telegram_from_Zhou_Enlai_to_Ni_Zhiliang_0,Comrade Ni Zhiliang:This is to acknowledge rec...
775,Zhou__Enlai__1898-1976_,September_29__1950,Telegram_from_Zhou_Enlai_to_Ni_Zhiliang_0,Ambassador Ni [Zhiliang]; also inform Chairman...
776,Zhou__Enlai__1898-1976_,September_30__1958,Memorandum_of_Conversation__Premier_Zhou_Recei...,Memorandum of Conversation: Premier Zhou Recei...


In [7]:
df.to_csv("data_clean")