In [154]:
import importlib
import os
import pickle
import re

import nltk
import requests
import spacy
from gensim.corpora import Dictionary

In [50]:
# Load the spaCy pipeline named 'en_core_web_lg'; spacy.load needs that model
# package to be installed in the kernel's environment beforehand.
nlp = spacy.load('en_core_web_lg')

In [10]:
# Download the English Tractatus markdown from the wittgenstein-project repository.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of letting an error
# page flow into the parsing cells further down.
resp = requests.get("https://raw.githubusercontent.com/wittgenstein-project/wittgenstein-published-works/main/markdown/english/Tractatus%20Logico-Philosophicus%20(English)/Tractatus%20Logico-Philosophicus%20(English).md", timeout=30)
resp.raise_for_status()
resp

<Response [200]>

In [11]:
resp_text = resp.text

In [51]:
# Split on the markdown bold marker "**" and drop the first chunk (everything
# before the first marker). As the preview shows, the remaining chunks alternate
# between a proposition-number link and that proposition's text.
main_text_list = resp_text.split("**")[1:]
main_text_list[:10]

['[1](https://www.wittgensteinproject.org/w/index.php?title=Logisch-philosophische_Abhandlung#1)',
 ' The world is everything that is the case.[^1]\n\n',
 '[1.1](https://www.wittgensteinproject.org/w/index.php?title=Logisch-philosophische_Abhandlung#1.1)',
 ' The world is the totality of facts, not of things.\n\n',
 '[1.11](https://www.wittgensteinproject.org/w/index.php?title=Logisch-philosophische_Abhandlung#1.11)',
 ' The world is determined by the facts, and by these being *all* the facts.\n\n',
 '[1.12](https://www.wittgensteinproject.org/w/index.php?title=Logisch-philosophische_Abhandlung#1.12)',
 ' For the totality of facts determines both what is the case, and also all that is not the case.\n\n',
 '[1.13](https://www.wittgensteinproject.org/w/index.php?title=Logisch-philosophische_Abhandlung#1.13)',
 ' The facts in logical space are the world.\n\n']

In [52]:
ref_raw = main_text_list[2]
print(ref_raw)

[1.1](https://www.wittgensteinproject.org/w/index.php?title=Logisch-philosophische_Abhandlung#1.1)


In [53]:
def clean_ref(ref_raw):
    """Extract the proposition number from a markdown link such as ``[1.1](url)``.

    Args:
        ref_raw: String containing a markdown link whose label (the part in
            square brackets) is the proposition number.

    Returns:
        The text of the first bracketed label, without the brackets.

    Raises:
        ValueError: If ``ref_raw`` contains no non-empty ``[...]`` label.
    """
    # Raw string avoids invalid-escape warnings. The negated character class stops
    # at the first "]" so later brackets (e.g. a trailing footnote marker) are not
    # swallowed into the reference.
    match = re.search(r"\[([^\[\]]+)\]", ref_raw)
    if match is None:
        raise ValueError(f"no [label] found in reference: {ref_raw!r}")
    return match.group(1)

In [54]:
clean_ref(ref_raw)

'1.1'

In [56]:
# Even positions of main_text_list hold the reference link, the following odd
# position holds the proposition text; only the part of the text before the
# first blank line is kept.
main_text_dict = {
    clean_ref(main_text_list[idx]): {"raw_text": main_text_list[idx + 1].partition("\n\n")[0]}
    for idx in range(0, len(main_text_list), 2)
}

In [57]:
main_text_dict

{'1': {'raw_text': ' The world is everything that is the case.[^1]'},
 '1.1': {'raw_text': ' The world is the totality of facts, not of things.'},
 '1.11': {'raw_text': ' The world is determined by the facts, and by these being *all* the facts.'},
 '1.12': {'raw_text': ' For the totality of facts determines both what is the case, and also all that is not the case.'},
 '1.13': {'raw_text': ' The facts in logical space are the world.'},
 '1.2': {'raw_text': ' The world divides into facts.'},
 '1.21': {'raw_text': ' Any one can either be the case or not be the case, and everything else remain the same.'},
 '2': {'raw_text': ' What is the case, the fact, is the existence of atomic facts.'},
 '2.01': {'raw_text': ' An atomic fact is a combination of objects (entities, things).'},
 '2.011': {'raw_text': ' It is essential to a thing that it can be a constituent part of an atomic fact.'},
 '2.012': {'raw_text': ' In logic nothing is accidental: if a thing *can* occur in an atomic fact the poss

In [59]:
raw_text = "We feel that even if *all possible* scientific questions be answered, the problems of life have still not been touched at all. Of course there is then no question left, and just this is the answer."

In [61]:
doc = nlp(raw_text)


In [74]:
def get_sentence_data(doc):
    """Return one list per sentence of ``doc`` holding ``(lemma, POS tag)`` pairs.

    Args:
        doc: Object exposing ``.sents``, an iterable of sentences whose tokens
            carry ``lemma_`` and ``pos_`` attributes (a spaCy ``Doc`` here).

    Returns:
        A list of sentences, each a list of ``(token.lemma_, token.pos_)`` tuples.
    """
    return [
        [(token.lemma_, token.pos_) for token in sentence]
        for sentence in doc.sents
    ]

In [75]:
get_sentence_data(doc)

[[('we', 'PRON'),
  ('feel', 'VERB'),
  ('that', 'SCONJ'),
  ('even', 'ADV'),
  ('if', 'SCONJ'),
  ('*', 'PUNCT'),
  ('all', 'PRON'),
  ('possible', 'ADJ'),
  ('*', 'PUNCT'),
  ('scientific', 'ADJ'),
  ('question', 'NOUN'),
  ('be', 'AUX'),
  ('answer', 'VERB'),
  (',', 'PUNCT'),
  ('the', 'DET'),
  ('problem', 'NOUN'),
  ('of', 'ADP'),
  ('life', 'NOUN'),
  ('have', 'AUX'),
  ('still', 'ADV'),
  ('not', 'PART'),
  ('be', 'AUX'),
  ('touch', 'VERB'),
  ('at', 'ADV'),
  ('all', 'ADV'),
  ('.', 'PUNCT')],
 [('of', 'ADV'),
  ('course', 'ADV'),
  ('there', 'PRON'),
  ('be', 'VERB'),
  ('then', 'ADV'),
  ('no', 'DET'),
  ('question', 'NOUN'),
  ('leave', 'VERB'),
  (',', 'PUNCT'),
  ('and', 'CCONJ'),
  ('just', 'ADV'),
  ('this', 'PRON'),
  ('be', 'AUX'),
  ('the', 'DET'),
  ('answer', 'NOUN'),
  ('.', 'PUNCT')]]

In [None]:
main_text_dict

In [76]:
# Run the spaCy pipeline over every proposition and store both the parsed Doc
# and its per-sentence (lemma, POS) data next to the raw text.
for entry in main_text_dict.values():
    parsed = nlp(entry["raw_text"])
    entry["doc"] = parsed
    entry["sent_data"] = get_sentence_data(parsed)

In [120]:
# POS tags treated as content words throughout the notebook.
postags = ["NOUN", "VERB", "ADJ", "PROPN"]
def lemmata_by_postags(sent_data, postags):
    """Keep, sentence by sentence, the cleaned lemmata whose POS tag is in ``postags``.

    Args:
        sent_data: List of sentences, each a list of ``(lemma, pos_tag)`` tuples
            as produced by ``get_sentence_data``.
        postags: Container of POS tags to keep.

    Returns:
        A tuple ``(sent_lemmata, vocab)``: ``sent_lemmata`` has one list of
        cleaned lemmata per input sentence (possibly empty), and ``vocab`` is
        the flat concatenation of those lists.
    """
    vocab = []
    sent_lemmata = []
    for sent in sent_data:
        # Raw-string pattern (a plain "\W" is an invalid escape sequence): strip
        # every non-word character and digit, e.g. 'case.[^1' -> 'case'.
        cleaned = [re.sub(r"\W|\d", "", t[0]) for t in sent if t[1] in postags]
        # Lemmata made up only of symbols/digits are empty after cleaning; drop them.
        lemmata = [lemma for lemma in cleaned if lemma]
        sent_lemmata.append(lemmata)
        vocab.extend(lemmata)
    return sent_lemmata, vocab

In [121]:
# Attach the filtered lemmata to every proposition and accumulate one flat
# vocabulary list (with repetitions) across the whole text.
vocab_main = []
for entry in main_text_dict.values():
    sentence_lemmata, entry_vocab = lemmata_by_postags(entry["sent_data"], postags)
    entry["lemmata"] = sentence_lemmata
    vocab_main += entry_vocab

In [122]:
main_text_dict

{'1': {'raw_text': ' The world is everything that is the case.[^1]',
  'doc':  The world is everything that is the case.[^1],
  'sent_data': [[(' ', 'SPACE'),
    ('the', 'DET'),
    ('world', 'NOUN'),
    ('be', 'AUX'),
    ('everything', 'PRON'),
    ('that', 'PRON'),
    ('be', 'AUX'),
    ('the', 'DET'),
    ('case.[^1', 'NOUN'),
    (']', 'PUNCT')]],
  'lemmata': [['world', 'case']]},
 '1.1': {'raw_text': ' The world is the totality of facts, not of things.',
  'doc':  The world is the totality of facts, not of things.,
  'sent_data': [[(' ', 'SPACE')],
   [('the', 'DET'),
    ('world', 'NOUN'),
    ('be', 'AUX'),
    ('the', 'DET'),
    ('totality', 'NOUN'),
    ('of', 'ADP'),
    ('fact', 'NOUN'),
    (',', 'PUNCT'),
    ('not', 'PART'),
    ('of', 'ADP'),
    ('thing', 'NOUN'),
    ('.', 'PUNCT')]],
  'lemmata': [[], ['world', 'totality', 'fact', 'thing']]},
 '1.11': {'raw_text': ' The world is determined by the facts, and by these being *all* the facts.',
  'doc':  The world i

In [186]:
# Collect every (cleaned lemma, POS tag) pair whose tag is one of `postags` and
# whose lemma is non-empty after cleaning.
word_tag_tups = []
for v in main_text_dict.values():
    for sent in v["sent_data"]:
        # Raw-string pattern: a plain "\W" is an invalid escape sequence.
        sent_repl = [(re.sub(r"\W|\d", "", tup[0]), tup[1]) for tup in sent]
        # Logical `and` (not bitwise `&`) is the right operator for two conditions.
        word_tag_tups.extend(tup for tup in sent_repl if tup[1] in postags and len(tup[0]) > 0)

In [187]:
len(word_tag_tups)

5097

In [188]:
# Sanity check: number of pairs that carry a tag. `is not None` is the idiomatic
# identity test, and sum() avoids building a throwaway list.
sum(1 for tup in word_tag_tups if tup[1] is not None)

5097

In [194]:
# Group all POS tags observed for a lemma under that lemma, preserving the order
# in which lemmata are first seen. setdefault replaces the list-based
# "encountered" lookup (a linear scan per token) and the bare `except: pass`,
# which could only hide genuine errors.
word_tag_dict = {}
for tup in word_tag_tups:
    word_tag_dict.setdefault(tup[0], []).append(tup[1])
# Lemmata in first-seen order: the same content the original loop accumulated.
encountered = list(word_tag_dict)

In [208]:
# Collapse each lemma's tag list to its single most frequent tag. Values that are
# already a plain string are left alone, so re-running this cell is a no-op
# instead of counting the characters of the tag picked on the previous run.
for k, v in word_tag_dict.items():
    if not isinstance(v, str):
        # Replacing the value of an existing key is safe while iterating.
        word_tag_dict[k] = nltk.FreqDist(v).most_common(1)[0][0]
word_tag_dict

{'world': 'NOUN',
 'case': 'NOUN',
 'totality': 'NOUN',
 'fact': 'NOUN',
 'thing': 'NOUN',
 'determine': 'VERB',
 'being': 'NOUN',
 'logical': 'ADJ',
 'space': 'NOUN',
 'divide': 'VERB',
 'one': 'NOUN',
 'remain': 'VERB',
 'same': 'ADJ',
 'existence': 'NOUN',
 'atomic': 'ADJ',
 'combination': 'NOUN',
 'object': 'NOUN',
 'entity': 'NOUN',
 'essential': 'ADJ',
 'constituent': 'ADJ',
 'part': 'NOUN',
 'logic': 'NOUN',
 'accidental': 'ADJ',
 'occur': 'VERB',
 'possibility': 'NOUN',
 'prejudge': 'VERB',
 'speak': 'VERB',
 'appear': 'VERB',
 'accident': 'NOUN',
 'exist': 'VERB',
 'own': 'ADJ',
 'account': 'NOUN',
 'state': 'NOUN',
 'affair': 'NOUN',
 'make': 'VERB',
 'fit': 'VERB',
 'independent': 'ADJ',
 'possible': 'ADJ',
 'circumstance': 'NOUN',
 'form': 'NOUN',
 'independence': 'NOUN',
 'connexion': 'NOUN',
 'dependence': 'NOUN',
 'impossible': 'ADJ',
 'word': 'NOUN',
 'different': 'ADJ',
 'way': 'NOUN',
 'proposition': 'NOUN',
 'know': 'VERB',
 'occurrence': 'NOUN',
 'order': 'NOUN',
 '

In [209]:
# Persist the lemma -> tag mapping. The context manager closes (and flushes) the
# file handle as soon as the dump finishes instead of leaving it to the garbage
# collector.
with open("tlp_word_tag_dict.pickle", "wb") as fh:
    pickle.dump(word_tag_dict, fh)

In [155]:
# (lemma, count) pairs for the whole vocabulary, ordered from most to least
# frequent; the slice previews the ten most frequent lemmata.
freqs = nltk.FreqDist(vocab_main).most_common()
freqs[:10]

[('proposition', 322),
 ('fact', 96),
 ('form', 93),
 ('sign', 92),
 ('logical', 87),
 ('truth', 68),
 ('world', 67),
 ('picture', 66),
 ('object', 65),
 ('say', 62)]

In [128]:
# Flatten the per-proposition lemma lists into one list of sentences, skipping
# sentences that ended up with no lemmata.
sents_lemmata = [
    sent
    for entry in main_text_dict.values()
    for sent in entry["lemmata"]
    if bool(sent)
]
print(sents_lemmata[:10])

[['world', 'case'], ['world', 'totality', 'fact', 'thing'], ['world', 'determine', 'fact', 'being', 'fact'], ['totality', 'fact', 'determine', 'case', 'case'], ['fact', 'logical', 'space', 'world'], ['world', 'divide', 'fact'], ['one', 'case', 'case', 'remain', 'same'], ['case', 'fact', 'existence', 'atomic', 'fact'], ['atomic', 'fact', 'combination', 'object', 'entity', 'thing'], ['essential', 'thing', 'constituent', 'part', 'atomic', 'fact']]


In [135]:
# gensim Dictionary built from the lemma sentences; its length is the number of
# distinct lemmata.
dct = Dictionary(sents_lemmata)
len(dct)

891

In [132]:
# One bag-of-words vector (list of (token id, count) pairs) per sentence.
corpus = [dct.doc2bow(sent) for sent in sents_lemmata]

In [134]:
corpus[:3]

[[(0, 1), (1, 1)],
 [(1, 1), (2, 1), (3, 1), (4, 1)],
 [(1, 1), (2, 2), (5, 1), (6, 1)]]

In [210]:
# os.makedirs creates missing parent directories and, with exist_ok=True, does
# not fail when the directory is already there, so the cell survives a re-run
# (plain `!mkdir` errors in both cases and is shell-dependent).
os.makedirs("../data/large_files", exist_ok=True)
# Context manager guarantees the file handle is flushed and closed.
with open("../data/large_files/tlp_main_text_dict.pickle", "wb") as fh:
    pickle.dump(main_text_dict, fh)

In [147]:
# Persist the frequency list, the gensim dictionary and the bag-of-words corpus.
# Each file is written inside a context manager so its handle is closed right
# after the dump instead of being left open.
for obj, path in (
    (freqs, "../data/tlp_freqs.pickle"),
    (dct, "../data/tlp_dct.pickle"),
    (corpus, "../data/tlp_corpus.pickle"),
):
    with open(path, "wb") as fh:
        pickle.dump(obj, fh)

In [151]:
# Import the project-local helper module and re-import it so edits made to
# ppmi_svd.py are picked up without restarting the kernel. `reload` is not a
# builtin in Python 3, so it is called through importlib.
import ppmi_svd
importlib.reload(ppmi_svd)

In [152]:
# Unpack the five results returned by the project-local helper for the
# bag-of-words corpus and its dictionary.
# NOTE(review): the names suggest a co-occurrence matrix, a (P)PMI matrix, SVD
# word vectors and their cosine similarities, but ppmi_svd is not shown here;
# confirm against its source.
cooc, vocabulary, pmi_matrix, word_vectors_df, pmi_svd_cos = ppmi_svd.from_bows_to_embeddings(corpus, dct)

In [153]:
word_vectors_df.shape

(891, 150)

In [None]:
word_vectors_df