In [None]:
import json
# With this, we can use regular expressions.
import re
# We import this sub-class for easy generation of frequency distributions.
from nltk import FreqDist
# This library is for normalizing dates.
import dateparser
# This is the NLP toolkit we use.
import spacy
# We initialize a German NLP pipeline with the medium-sized language model.
# The pipeline is loaded once here and is then called as nlp(text) by the
# text-processing cells further down.
# NOTE: the model 'de_core_news_md' presumably has to be installed separately
# before this line can succeed - confirm against the project setup.
nlp = spacy.load('de_core_news_md')

In [None]:
# The names in this file are later searched for Japanese characters, so we read
# it explicitly as UTF-8 (the standard JSON encoding) instead of relying on the
# platform's default encoding, which is not UTF-8 on every system.
with open("japanese_students.json", encoding="utf-8") as json_file:
  data = json.load(json_file)

In [None]:
# Quick sanity check: how many student records were loaded?
print(f"Number of students: {len(data)}")

# Student name processing

## Task: Separate Japanese from Romaji names

In [None]:
# We define a regular expression for a sequence of Japanese characters:
#   \u3005         the kanji iteration mark "々" (as in 佐々木),
#   \u3040-\u30ff  hiragana and katakana,
#   \u4e00-\u9fff  CJK unified ideographs (kanji).
# Matching only the CJK ideograph range would overlook names written in kana
# and would break a name into two matches at "々".
JAPANESE_CHARACTERS_PATTERN = r'[\u3005\u3040-\u30ff\u4e00-\u9fff]+'

In [None]:
for student in data:
  # Show the raw record, then every run of Japanese characters found in its name.
  print(student)
  name = student['Name']
  jap_characters_found = re.findall(JAPANESE_CHARACTERS_PATTERN, name)
  print(jap_characters_found)
  if jap_characters_found:
    # Everything after the last space is taken as the Japanese name,
    # everything before it as the Romaji name.
    # (We slice the rpartition result instead of unpacking into "_", because
    # "_" holds the last cell output in IPython.)
    name_romaji, name_jpn = name.rpartition(" ")[::2]
    print("Japanese Name:", name_jpn)
    print("Romaji Name:", name_romaji)
  else:
    # An empty result means that no Japanese characters appear in the name.
    print("No Japanese name!")
    print("Romaji name:", name)

# Student date processing

## Task: Parse all given dates and get all birth years and their frequencies

In [None]:
# All birth years that could be parsed are collected in this list.
birth_year_list = list()
# Some dates may be ill-formed; this counter records how many of them could not be parsed.
unparseable_dates = 0

In [None]:
# We reset both collectors here so that re-running this cell does not append the
# same birth years a second time or inflate the failure counter.
unparseable_dates = 0
birth_year_list = []


def _parse_date(raw_date):
  """Parse a single date string with dateparser.

  Returns the parsed datetime, or None when the string cannot be interpreted
  (dateparser.parse signals this by returning None) or parsing raises an error.
  """
  try:
    return dateparser.parse(raw_date)
  # We catch Exception instead of using a bare except, so that interrupting the
  # kernel (KeyboardInterrupt) is not silently counted as an ill-formed date.
  except Exception:
    return None


for student in data:
  dates = student['Daten']
  # Dates are given in the format date1–date2.
  print("Birth and death dates:", dates)
  # We separate them by splitting at the en dash "–".
  date_list = dates.strip().split("–")
  # We are only interested in entries that consist of exactly two parts.
  #
  # Examples:
  # A date "–1913" would result in ["", "1913"],
  # a date "Juni 1880–" would result in ["Juni 1880", ""],
  # and a date "3.4.1857–Dezember 1910" would result in ["3.4.1857", "Dezember 1910"].
  if len(date_list) != 2:
    continue
  # Whitespace around each part (as in "1857 – 1910") is removed, so that a
  # blank part counts as a missing date and not as an unparseable one.
  first_date, second_date = (part.strip() for part in date_list)
  if first_date != "":
    birth_date = _parse_date(first_date)
    if birth_date is None:
      print(first_date, "cannnot be parsed!")
      unparseable_dates += 1
    else:
      print("Birth date:", birth_date)
      print("Birth year:", birth_date.year)
      birth_year_list.append(birth_date.year)
  if second_date != "":
    death_date = _parse_date(second_date)
    if death_date is None:
      print(second_date, "cannnot be parsed!")
      unparseable_dates += 1
    else:
      print("Death date:", death_date)
      print("Death year:", death_date.year)

In [None]:
# Report how many of the given dates were ill-formed.
print(f"{unparseable_dates} dates could not be parsed.")

In [None]:
# We count how often each birth year occurs and show the 20 most frequent ones.
fdist = FreqDist(birth_year_list)
print(
  "Most common birth years and their frequencies:",
  fdist.most_common(20),
  sep="\n",
)

# Student text processing

## Task 1: Retrieve all visited universities and their frequencies

In [None]:
# As a first exploration we only look at the first five entries.
for student in data[:5]:
  text = student['Text']
  print(f"Student text: {text}")
  # Calling nlp(text) runs spacy on the text; the named entities of the
  # result are available via .ents.
  parsed_text = nlp(text)
  for ent in parsed_text.ents:
    # One line per entity: its text followed by its label.
    print(f"{ent} {ent.label_}")
  # An empty line separates the students from each other.
  print()

In [None]:
# We collect every named entity that starts with "U " from the first five entries.
all_universities = []
for student in data[:5]:
  text = student['Text']
  print("Student text:", text)
  all_universities.extend(
    ent.text for ent in nlp(text).ents if ent.text.startswith("U ")
  )
  print()
print("All universities:", all_universities)

In [None]:
# Now we do the same for all students: every named entity that starts with "U "
# is collected, in the order in which it appears.
all_universities = [
  ent.text
  for student in data
  for ent in nlp(student['Text']).ents
  if ent.text.startswith("U ")
]
# We count the collected entities and show the 20 most frequent ones.
uni_fdist = FreqDist(all_universities)
print(
  "Most common universities and their frequencies:",
  uni_fdist.most_common(20),
  sep="\n",
)

## Task 2: Retrieve all place names from the text commentary

In [None]:
# This dictionary will map each student name to the place names found in the student's text.
persons_to_text_places = dict()

In [None]:
for student in data:
  name = student['Name']
  parsed_text = nlp(student['Text'])
  # We keep only the named entities that spacy labels as locations ("LOC").
  # A student without any such entity is stored with an empty list.
  persons_to_text_places[name] = [
    named_entity.text
    for named_entity in parsed_text.ents
    if named_entity.label_ == "LOC"
  ]

In [None]:
# We print the complete mapping from student names to the place names found in
# their texts. Note that this writes the whole dictionary to the output at once.
print(persons_to_text_places)