In [None]:
import json
# With this, we can use regular expressions.
import re
# We import this sub-class for easy generation of frequency distributions.
from nltk import FreqDist
# This library is for normalizing dates.
import dateparser
# This is the NLP toolkit we use.
import spacy
# We initialize a German NLP pipeline with the medium-sized language model.
# NOTE: The model package must be installed before it can be loaded, e.g. with
# `python -m spacy download de_core_news_md`.
nlp = spacy.load('de_core_news_md')

In [None]:
# We load all student records from the JSON file.
# The names are expected to contain Japanese characters, so we read the file
# explicitly as UTF-8 instead of relying on the platform's default encoding
# (which is not UTF-8 on every system).
with open("japanese_students.json", encoding="utf-8") as json_file:
  data = json.load(json_file)

In [None]:
# Each entry of the loaded data is one student, so its length is the number of students.
print(f"Number of students: {len(data)}")

# Student name processing

## Task: Separate Japanese from Romaji names

In [None]:
# We define a regular expression for a sequence of Japanese characters:
#   \u3005         the kanji iteration mark "々" (as in 佐々木),
#   \u3040-\u309f  hiragana,
#   \u30a0-\u30ff  katakana,
#   \u4e00-\u9fff  kanji (CJK Unified Ideographs).
# Matching kanji alone would miss names that are written (partly) in kana.
JAPANESE_CHARACTERS_PATTERN = r'[\u3005\u3040-\u309f\u30a0-\u30ff\u4e00-\u9fff]+'

In [None]:
# We go through all students and separate the Romaji part of each name from the Japanese part.
for student in data:
  print(student)
  name = student['Name']
  jap_characters_found = re.findall(JAPANESE_CHARACTERS_PATTERN, name)
  print(jap_characters_found)
  if jap_characters_found:
    # We split the name at whitespaces: the last element is taken as the Japanese name,
    # everything before it (e.g. ["Abe", "Isoo"]) belongs to the Romaji name.
    *romaji_parts, name_jpn = name.split(" ")
    print("Japanese Name:", name_jpn)
    # Joining the remaining elements with a whitespace gives e.g. "Abe Isoo".
    name_romaji = " ".join(romaji_parts)
    print("Romaji Name:", name_romaji)
  else:
    # An empty result means that no Japanese characters appear in the name.
    print("No Japanese name!")
    print("Romaji name:", name)

# Student date processing

## Task: Parse all given dates and get all birth years and their frequencies

In [None]:
# Two collectors for the date processing: a counter for ill-formed dates that
# could not be parsed, and an (initially empty) List for all parsed birth years.
unparseable_dates, birth_year_list = 0, []

In [None]:
# We (re-)initialize the counter and the result List at the top of this cell,
# so that running the cell more than once does not count any date twice.
unparseable_dates = 0
birth_year_list = []


def parse_date_or_none(date_string):
  """Parse a single date string with dateparser.

  Returns the parsed date, or None if the string cannot be parsed.
  """
  try:
    # dateparser.parse() returns None for strings it cannot interpret.
    return dateparser.parse(date_string)
  except Exception:
    # Best effort: an unexpected parser error also counts as "cannot be parsed".
    # Unlike a bare "except:", this does not swallow KeyboardInterrupt,
    # so the loop can still be interrupted.
    return None


for student in data:
  # The date string might contain unnecessary whitespaces or line breaks, so we remove them.
  dates = student['Daten'].strip()

  # Dates are given in the format date1–date2.
  print("Birth and death dates:", dates)

  # We separate them by splitting at the dash "–" (this is not the plain hyphen "-").
  date_list = dates.split("–")

  # We only handle entries that split into exactly two parts.
  #
  # Examples:
  # A date "–1913" would result in ["","1913"],
  # a date "Juni 1880–" would result in ["Juni 1880",""],
  # and a date "3.4.1857–Dezember 1910" would result in ["3.4.1857", "Dezember 1910"].
  if len(date_list) != 2:
    continue
  first_date, second_date = date_list

  # Only continue if non-empty.
  if first_date != "":
    birth_date = parse_date_or_none(first_date)
    if birth_date is None:
      print(first_date, "cannot be parsed!")
      unparseable_dates += 1
    else:
      print("Birth date:", birth_date)
      # The year of a parsed date can be accessed with .year.
      print("Birth year:", birth_date.year)
      birth_year_list.append(birth_date.year)

  # Only continue if non-empty.
  if second_date != "":
    death_date = parse_date_or_none(second_date)
    if death_date is None:
      print(second_date, "cannot be parsed!")
      unparseable_dates += 1
    else:
      print("Death date:", death_date)
      print("Death year:", death_date.year)

In [None]:
# Report how many date strings could not be parsed.
print(f"{unparseable_dates} dates could not be parsed.")

In [None]:
# FreqDist counts how often each birth year occurs in the List.
fdist = FreqDist(birth_year_list)
print("Most common birth years and their frequencies:")
# We only look at the 20 most common birth years.
top_birth_years = fdist.most_common(20)
print(top_birth_years)

## Exercise: Get the death year distribution

In [None]:
# We initialize an empty List to collect all parsed death years.
# TODO (exercise): complete the assignment below.
death_year_list = 

In [None]:
for student in data:
  # The date string might contain unnecessary whitespaces or line breaks, so we remove them.
  dates = student['Daten'].strip()
  # TODO (exercise): split the date string into its two parts.
  date_list = 
  # We are only interested in dates that are not just "-".
  #
  # Examples:
  # A date "-1913" would result in ["","1913"],
  # a date "Juni 1880-" would result in ["Juni 1880",""],
  # and a date "3.4.1857-Dezember 1910" would result in ["3.4.1857", "Dezember 1910"].
  if len(date_list) == 2:
    # The second element is the death date.
    second_date = date_list[1]
    # MORE CODE COMES HERE

In [None]:
# Automatically generate a frequency distribution from the death year List.
# TODO (exercise): complete the assignment below.
fdist = 
# Print out the 15 most common death years.
print("Most common death years and their frequencies:")
# TODO (exercise): pass the 15 most common death years to print().
print()

# Student text processing

## Preliminary: Using spacy

In [None]:
# Suppose we have a single German sentence.
sentence = "Die Grenzen meiner Sprache bedeuten die Grenzen meiner Welt."

In [None]:
# Using spacy to parse a text is as simple as invoking nlp(text).
# We keep the result, so that we can inspect it in the following cells.
parsed_sentence = nlp(sentence)

In [None]:
# Iterating over a parsed text yields its tokens one by one.
# For each token, we show its text and its part-of-speech tag.
for tok in parsed_sentence:
    print(tok.text, tok.pos_)

In [None]:
# Suppose we have more than one sentence.
# Adjacent string literals inside parentheses are joined into one single String.
text = (
    "Jemand musste Josef K. verleumdet haben, denn ohne daß er etwas Böses getan hätte "
    "wurde er eines Morgens verhaftet. Die Köchin der Frau Grubach, seiner Zimmervermieterin "
    "die ihm jeden Tag gegen acht Uhr früh das Frühstück brachte, kam diesmal nicht. "
    "Das war noch niemals geschehen."
)

In [None]:
# We parse the text consisting of several sentences with the same pipeline.
parsed_text = nlp(text)

In [None]:
# The .sents attribute gives access to the automatically segmented sentences of the text.
# enumerate() numbers the sentences, starting at 0.
for index, sent in enumerate(parsed_text.sents):
    print(f"Sentence {index} : {sent}")

In [None]:
# The .ents attribute holds the named entities (e.g. persons, places, or companies) of the text.
# For each entity, we show its text and its label.
for ent in parsed_text.ents:
    print(ent.text, ent.label_)

In [None]:
# Named entities are available for the whole text and also for each individual sentence.
for index, sent in enumerate(parsed_text.sents):
    for ent in sent.ents:
        print(f"Entity in sentence {index} : {ent.text}")

## Exercise: Be creative!

In [None]:
# Think of a text that contains persons, places, companies etc.
# TODO (exercise): replace the empty String with your own text.
some_text = ""
parsed = nlp(some_text)

In [None]:
# Get all named entities (persons, places, companies etc.) from your text and display their labels


## Task 1: Retrieve all visited universities and their frequencies

In [None]:
# To get a first impression, we only look at the first five students.
first_students = data[:5]
for student in first_students:
  text = student['Text']
  print("Student text:", text)
  # Calling nlp(text) runs spacy on the text.
  parsed_text = nlp(text)
  # .ents contains all named entities; we show the text and the label of each one.
  for ent in parsed_text.ents:
    print(ent.text, ent.label_)
  # An empty line separates the output of two students.
  print()

### What do you notice about the above output?

In [None]:
# We collect the university names of the first five students in a List.
all_universities = []
for student in data[:5]:
  text = student['Text']
  parsed_text = nlp(text)
  # A named entity starting with "U " (e.g. "U Berlin") is taken to be indicative of a university.
  all_universities.extend(
    ent.text for ent in parsed_text.ents if ent.text.startswith("U ")
  )
print("All universities:", all_universities)

In [None]:
# This time, we collect the university names of all students.
all_universities = []
for student in data:
  text = student['Text']
  parsed_text = nlp(text)
  # As before, only named entities starting with "U " are kept.
  all_universities += [
    ent.text for ent in parsed_text.ents if ent.text.startswith("U ")
  ]

# FreqDist counts how often each university occurs in the List.
uni_fdist = FreqDist(all_universities)
print("Most common universities and their frequencies:")
# We only look at the 20 most commonly visited universities.
top_universities = uni_fdist.most_common(20)
print(top_universities)

## Task 2: Retrieve all place names from the text commentary

In [None]:
# This dictionary will map each student name to the List of place names found in the text.
persons_to_text_places = dict()

In [None]:
for student in data:
  name = student['Name']
  text = student['Text']
  # Calling nlp(text) runs spacy on the text.
  parsed_text = nlp(text)
  # From all named entities (.ents), we only keep those labelled as locations ("LOC").
  # A student without any location gets an empty List.
  persons_to_text_places[name] = [
    ent.text for ent in parsed_text.ents if ent.label_ == "LOC"
  ]

In [None]:
# Show the complete mapping from student names to the place names found in their texts.
print(persons_to_text_places)

In [None]:
# spacy might not find a location for every person. We therefore build a new dictionary
# that only keeps the persons whose place name List is non-empty.
filtered_persons_to_text_places = {
  person: places
  for person, places in persons_to_text_places.items()
  if places
}

In [None]:
# Show only the persons for whom at least one place name was found.
print(filtered_persons_to_text_places)

## Exercise: Retrieve all persons and organisations from the text commentary

In [None]:
# We initialize an empty dictionary for the persons found in each text.
# TODO (exercise): complete the assignment below.
persons_to_text_persons = 
# We initialize another empty dictionary for the organisations found in each text.
# TODO (exercise): complete the assignment below.
persons_to_text_organisations = 

In [None]:
# To make the result more readable, let's only focus on the first 50 persons.
for student in data[:50]:
    name = student['Name']
    # Every student starts with an empty List in both dictionaries.
    persons_to_text_persons[name] = []
    persons_to_text_organisations[name] = []
    text = student['Text']
    # MORE CODE COMES HERE
    # TODO (exercise): complete the assignment below.
    parsed_text = 

In [None]:
# TODO (exercise): only keep the students whose person List is non-empty.
filtered_persons_to_text_persons = 
print(filtered_persons_to_text_persons)

In [None]:
# TODO (exercise): only keep the students whose organisation List is non-empty.
filtered_persons_to_text_organisations = 
print(filtered_persons_to_text_organisations)