### Data loading

In [22]:
import pandas as pd
import numpy as np

# Read the essays spreadsheet (relative path) and preview its first five rows.
essays = pd.read_excel(io="../data/essays.xlsx")
essays.head(n=5)

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",n,y,y,n,y
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",n,n,y,n,n
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,n,y,n,y,y
3,1997_568848.txt,I can't believe it! It's really happening! M...,y,n,y,y,n
4,1997_688160.txt,"Well, here I go with the good old stream of co...",y,n,y,n,y


In [23]:
# Number of whitespace-separated words in each essay.
lengths = [len(essay_text.split()) for essay_text in essays["TEXT"]]

print(f"The maximum essay length is: {max(lengths)}.")
print(f"The minimum essay length is: {min(lengths)}.")
print(f"The average essay length is: {sum(lengths)/len(lengths)}.")

The maximum essay length is: 2500.
The minimum essay length is: 33.
The average essay length is: 652.1333603567085.


In [24]:
# Remove the author-id column. Rebinding (instead of inplace=True) together with
# errors="ignore" keeps this cell re-runnable: a second execution is a no-op
# rather than a KeyError for the already-missing column.
essays = essays.drop(columns=["#AUTHID"], errors="ignore")

### Preparation of class labels

In [25]:
# Names of the five label columns to encode.
personalities = ["cEXT", "cNEU", "cAGR", "cCON", "cOPN"]

# "y"/"n" become 1/0; any other value (including already-encoded 1/0) is left as is,
# so the cell can be re-run safely.
label_codes = {"y": 1, "n": 0}

for personality in personalities:
    # Assign the result back to the frame. Calling replace(..., inplace=True) on
    # essays[personality] operates on a column selection, which recent pandas
    # warns about and which does not update the frame under Copy-on-Write.
    essays[personality] = essays[personality].map(
        lambda label: label_codes.get(label, label)
    )

essays.head()

Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1
1,"Well, here we go with the stream of consciousn...",0,0,1,0,0
2,An open keyboard and buttons to push. The thin...,0,1,0,1,1
3,I can't believe it! It's really happening! M...,1,0,1,1,0
4,"Well, here I go with the good old stream of co...",1,0,1,0,1


### Tokenization

In [26]:
import nltk
# Fetch the "punkt" package; the call's return value is shown as the cell output.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jurin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [27]:
import spacy

# Loaded spaCy pipelines, keyed by model name, so each model is loaded only once.
_LEMMATIZER_PIPELINES = {}


def _lemmatizer_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use only."""
    if model_name not in _LEMMATIZER_PIPELINES:
        _LEMMATIZER_PIPELINES[model_name] = spacy.load(model_name)
    return _LEMMATIZER_PIPELINES[model_name]


def lemmatizer(text, nlp=None):
    """
    Receives a string as an input and lemmatizes it.
    The lemmas are returned in a list.

    Parameters
    ----------
    text : str
        The text to lemmatize.
    nlp : callable, optional
        An already-loaded pipeline to use. When omitted, the "en_core_web_sm"
        pipeline is used; it is loaded on the first call and reused afterwards
        instead of being reloaded for every text.

    Returns
    -------
    list
        The ``lemma_`` of every token produced by the pipeline, in order.
    """
    if nlp is None:
        nlp = _lemmatizer_pipeline()
    return [token.lemma_ for token in nlp(text)]

In [28]:
# Lemmatize the first essay and show the lemmas as one space-separated string.
" ".join(lemmatizer(essays["TEXT"].iloc[0]))

'well , right now I just wake up from a mid - day nap . it be sort of weird , but ever since I move to Texas , I have have problem concentrate on thing . I remember start my homework in   10th grade as soon as the clock strike 4 and not stop until it be do . of course it be easy , but I still do it . but when I move here , the homework get a little more challenging and there be a lot more busy work , and so I decide not to spend hour do it , and just get by . but the thing be that I always pay attention in class and just plain out know the stuff , and now that I look back , if I have really work hard and stay on track the last two year without get   lazy , I would have be a genius , but hey , that be all good . it be too late to correct the past , but I do not really know how to stay focused n the future . the one thing I know be that when   people say that b / c they live on campus they can not concentrate , it be b. s. for I it would be easy there , but alas , I be live at home under

In [29]:
# Word-tokenize the first essay with NLTK and print the token list.
print(nltk.word_tokenize(essays["TEXT"].iloc[0]))

['Well', ',', 'right', 'now', 'I', 'just', 'woke', 'up', 'from', 'a', 'mid-day', 'nap', '.', 'It', "'s", 'sort', 'of', 'weird', ',', 'but', 'ever', 'since', 'I', 'moved', 'to', 'Texas', ',', 'I', 'have', 'had', 'problems', 'concentrating', 'on', 'things', '.', 'I', 'remember', 'starting', 'my', 'homework', 'in', '10th', 'grade', 'as', 'soon', 'as', 'the', 'clock', 'struck', '4', 'and', 'not', 'stopping', 'until', 'it', 'was', 'done', '.', 'Of', 'course', 'it', 'was', 'easier', ',', 'but', 'I', 'still', 'did', 'it', '.', 'But', 'when', 'I', 'moved', 'here', ',', 'the', 'homework', 'got', 'a', 'little', 'more', 'challenging', 'and', 'there', 'was', 'a', 'lot', 'more', 'busy', 'work', ',', 'and', 'so', 'I', 'decided', 'not', 'to', 'spend', 'hours', 'doing', 'it', ',', 'and', 'just', 'getting', 'by', '.', 'But', 'the', 'thing', 'was', 'that', 'I', 'always', 'paid', 'attention', 'in', 'class', 'and', 'just', 'plain', 'out', 'knew', 'the', 'stuff', ',', 'and', 'now', 'that', 'I', 'look', 'ba

In [30]:
# One list of NLTK word tokens per essay.
essays["words"] = essays["TEXT"].apply(nltk.word_tokenize)

In [31]:
# One list of NLTK sentence strings per essay.
essays["sentences"] = essays["TEXT"].apply(nltk.sent_tokenize)

In [32]:
# Word tokens of the first essay.
essays["words"].iloc[0]

['Well',
 ',',
 'right',
 'now',
 'I',
 'just',
 'woke',
 'up',
 'from',
 'a',
 'mid-day',
 'nap',
 '.',
 'It',
 "'s",
 'sort',
 'of',
 'weird',
 ',',
 'but',
 'ever',
 'since',
 'I',
 'moved',
 'to',
 'Texas',
 ',',
 'I',
 'have',
 'had',
 'problems',
 'concentrating',
 'on',
 'things',
 '.',
 'I',
 'remember',
 'starting',
 'my',
 'homework',
 'in',
 '10th',
 'grade',
 'as',
 'soon',
 'as',
 'the',
 'clock',
 'struck',
 '4',
 'and',
 'not',
 'stopping',
 'until',
 'it',
 'was',
 'done',
 '.',
 'Of',
 'course',
 'it',
 'was',
 'easier',
 ',',
 'but',
 'I',
 'still',
 'did',
 'it',
 '.',
 'But',
 'when',
 'I',
 'moved',
 'here',
 ',',
 'the',
 'homework',
 'got',
 'a',
 'little',
 'more',
 'challenging',
 'and',
 'there',
 'was',
 'a',
 'lot',
 'more',
 'busy',
 'work',
 ',',
 'and',
 'so',
 'I',
 'decided',
 'not',
 'to',
 'spend',
 'hours',
 'doing',
 'it',
 ',',
 'and',
 'just',
 'getting',
 'by',
 '.',
 'But',
 'the',
 'thing',
 'was',
 'that',
 'I',
 'always',
 'paid',
 'attention

In [33]:
# Sentences of the first essay.
essays["sentences"].iloc[0]

['Well, right now I just woke up from a mid-day nap.',
 "It's sort of weird, but ever since I moved to Texas, I have had problems concentrating on things.",
 'I remember starting my homework in  10th grade as soon as the clock struck 4 and not stopping until it was done.',
 'Of course it was easier, but I still did it.',
 'But when I moved here, the homework got a little more challenging and there was a lot more busy work, and so I decided not to spend hours doing it, and just getting by.',
 "But the thing was that I always paid attention in class and just plain out knew the stuff, and now that I look back, if I had really worked hard and stayed on track the last two years without getting  lazy, I would have been a genius, but hey, that's all good.",
 "It's too late to correct the past, but I don't really know how to stay focused n the future.",
 "The one thing I know is that when  people say that b/c they live on campus they can't concentrate, it's b. s. For me it would be easier ther

In [34]:
# Build word bigrams from the tokens already stored in essays["words"]
# (the nltk.word_tokenize output of TEXT) instead of tokenizing every essay again.
essays["bigrams"] = essays["words"].apply(lambda tokens: list(nltk.ngrams(tokens, 2)))

In [35]:
# Bigrams of the first essay.
essays["bigrams"].iloc[0]

[('Well', ','),
 (',', 'right'),
 ('right', 'now'),
 ('now', 'I'),
 ('I', 'just'),
 ('just', 'woke'),
 ('woke', 'up'),
 ('up', 'from'),
 ('from', 'a'),
 ('a', 'mid-day'),
 ('mid-day', 'nap'),
 ('nap', '.'),
 ('.', 'It'),
 ('It', "'s"),
 ("'s", 'sort'),
 ('sort', 'of'),
 ('of', 'weird'),
 ('weird', ','),
 (',', 'but'),
 ('but', 'ever'),
 ('ever', 'since'),
 ('since', 'I'),
 ('I', 'moved'),
 ('moved', 'to'),
 ('to', 'Texas'),
 ('Texas', ','),
 (',', 'I'),
 ('I', 'have'),
 ('have', 'had'),
 ('had', 'problems'),
 ('problems', 'concentrating'),
 ('concentrating', 'on'),
 ('on', 'things'),
 ('things', '.'),
 ('.', 'I'),
 ('I', 'remember'),
 ('remember', 'starting'),
 ('starting', 'my'),
 ('my', 'homework'),
 ('homework', 'in'),
 ('in', '10th'),
 ('10th', 'grade'),
 ('grade', 'as'),
 ('as', 'soon'),
 ('soon', 'as'),
 ('as', 'the'),
 ('the', 'clock'),
 ('clock', 'struck'),
 ('struck', '4'),
 ('4', 'and'),
 ('and', 'not'),
 ('not', 'stopping'),
 ('stopping', 'until'),
 ('until', 'it'),
 ('it', 

In [36]:
# Build word trigrams from the tokens already stored in essays["words"]
# (the nltk.word_tokenize output of TEXT) instead of tokenizing every essay again.
essays["trigrams"] = essays["words"].apply(lambda tokens: list(nltk.ngrams(tokens, 3)))

In [37]:
# Trigrams of the first essay.
essays["trigrams"].iloc[0]

[('Well', ',', 'right'),
 (',', 'right', 'now'),
 ('right', 'now', 'I'),
 ('now', 'I', 'just'),
 ('I', 'just', 'woke'),
 ('just', 'woke', 'up'),
 ('woke', 'up', 'from'),
 ('up', 'from', 'a'),
 ('from', 'a', 'mid-day'),
 ('a', 'mid-day', 'nap'),
 ('mid-day', 'nap', '.'),
 ('nap', '.', 'It'),
 ('.', 'It', "'s"),
 ('It', "'s", 'sort'),
 ("'s", 'sort', 'of'),
 ('sort', 'of', 'weird'),
 ('of', 'weird', ','),
 ('weird', ',', 'but'),
 (',', 'but', 'ever'),
 ('but', 'ever', 'since'),
 ('ever', 'since', 'I'),
 ('since', 'I', 'moved'),
 ('I', 'moved', 'to'),
 ('moved', 'to', 'Texas'),
 ('to', 'Texas', ','),
 ('Texas', ',', 'I'),
 (',', 'I', 'have'),
 ('I', 'have', 'had'),
 ('have', 'had', 'problems'),
 ('had', 'problems', 'concentrating'),
 ('problems', 'concentrating', 'on'),
 ('concentrating', 'on', 'things'),
 ('on', 'things', '.'),
 ('things', '.', 'I'),
 ('.', 'I', 'remember'),
 ('I', 'remember', 'starting'),
 ('remember', 'starting', 'my'),
 ('starting', 'my', 'homework'),
 ('my', 'homewor

### Testing the embeddings feature

In [38]:
# Load the "en_core_web_lg" spaCy pipeline and run it on a short test string
# (three ordinary words plus one made-up string).
nlp = spacy.load("en_core_web_lg")
tokens = nlp("dog cat banana afskfsd")

# Print each token's text followed by its vector.
for token in tokens:
    print(token.text, token.vector)

dog [-4.0176e-01  3.7057e-01  2.1281e-02 -3.4125e-01  4.9538e-02  2.9440e-01
 -1.7376e-01 -2.7982e-01  6.7622e-02  2.1693e+00 -6.2691e-01  2.9106e-01
 -6.7270e-01  2.3319e-01 -3.4264e-01  1.8311e-01  5.0226e-01  1.0689e+00
  1.4698e-01 -4.5230e-01 -4.1827e-01 -1.5967e-01  2.6748e-01 -4.8867e-01
  3.6462e-01 -4.3403e-02 -2.4474e-01 -4.1752e-01  8.9088e-02 -2.5552e-01
 -5.5695e-01  1.2243e-01 -8.3526e-02  5.5095e-01  3.6410e-01  1.5361e-01
  5.5738e-01 -9.0702e-01 -4.9098e-02  3.8580e-01  3.8000e-01  1.4425e-01
 -2.7221e-01 -3.7016e-01 -1.2904e-01 -1.5085e-01 -3.8076e-01  4.9583e-02
  1.2755e-01 -8.2788e-02  1.4339e-01  3.2537e-01  2.7226e-01  4.3632e-01
 -3.1769e-01  7.9405e-01  2.6529e-01  1.0135e-01 -3.3279e-01  4.3117e-01
  1.6687e-01  1.0729e-01  8.9418e-02  2.8635e-01  4.0117e-01 -3.9222e-01
  4.5217e-01  1.3521e-01 -2.8878e-01 -2.2819e-02 -3.4975e-01 -2.2996e-01
  2.0224e-01 -2.1177e-01  2.7184e-01  9.1703e-02 -2.0610e-01 -6.5758e-01
  1.8949e-01 -2.6756e-01  9.2639e-02  4.3316e-0

In [39]:
# Loaded spaCy pipelines, keyed by model name, so each model is loaded only once.
_EMBEDDING_PIPELINES = {}


def _embedding_pipeline(model_name="en_core_web_lg"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use only."""
    if model_name not in _EMBEDDING_PIPELINES:
        _EMBEDDING_PIPELINES[model_name] = spacy.load(model_name)
    return _EMBEDDING_PIPELINES[model_name]


def avg_text_embedding(text, nlp=None):
    """
    Return the average of the token vectors of ``text``.

    Parameters
    ----------
    text : str
        The text to embed.
    nlp : callable, optional
        An already-loaded pipeline to use. When omitted, the "en_core_web_lg"
        pipeline is used; it is loaded on the first call and reused afterwards
        instead of being reloaded for every text.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of ``token.vector`` over all tokens. For a text
        that yields no tokens, the document's own ``vector`` is returned
        instead of dividing by zero.
    """
    if nlp is None:
        nlp = _embedding_pipeline()
    doc = nlp(text)
    vectors = [token.vector for token in doc]
    if not vectors:
        # NOTE(review): spaCy's Doc.vector is expected to be an all-zero vector
        # for an empty document — confirm against the spaCy version in use.
        return doc.vector
    # Accumulate in token order; `total` avoids shadowing the builtin `sum`.
    total = np.zeros_like(vectors[0])
    for vector in vectors:
        total += vector
    return total / len(vectors)

### Getting average text embeddings

In [40]:
# One averaged word-embedding vector per essay.
essays["average_word_embedding"] = essays["TEXT"].apply(avg_text_embedding)

In [41]:
# Preview the first five rows of the expanded frame.
essays.head(n=5)

Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN,words,sentences,bigrams,trigrams,average_word_embedding
0,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1,"[Well, ,, right, now, I, just, woke, up, from,...","[Well, right now I just woke up from a mid-day...","[(Well, ,), (,, right), (right, now), (now, I)...","[(Well, ,, right), (,, right, now), (right, no...","[0.00353472, 0.19089618, -0.21122074, -0.08882..."
1,"Well, here we go with the stream of consciousn...",0,0,1,0,0,"[Well, ,, here, we, go, with, the, stream, of,...","[Well, here we go with the stream of conscious...","[(Well, ,), (,, here), (here, we), (we, go), (...","[(Well, ,, here), (,, here, we), (here, we, go...","[0.010748824, 0.22639543, -0.23046331, -0.1286..."
2,An open keyboard and buttons to push. The thin...,0,1,0,1,1,"[An, open, keyboard, and, buttons, to, push, ....","[An open keyboard and buttons to push., The th...","[(An, open), (open, keyboard), (keyboard, and)...","[(An, open, keyboard), (open, keyboard, and), ...","[0.012327042, 0.16489002, -0.2121276, -0.09739..."
3,I can't believe it! It's really happening! M...,1,0,1,1,0,"[I, ca, n't, believe, it, !, It, 's, really, h...","[I can't believe it!, It's really happening!, ...","[(I, ca), (ca, n't), (n't, believe), (believe,...","[(I, ca, n't), (ca, n't, believe), (n't, belie...","[-0.024120355, 0.23071356, -0.18399644, -0.128..."
4,"Well, here I go with the good old stream of co...",1,0,1,0,1,"[Well, ,, here, I, go, with, the, good, old, s...","[Well, here I go with the good old stream of c...","[(Well, ,), (,, here), (here, I), (I, go), (go...","[(Well, ,, here), (,, here, I), (here, I, go),...","[-0.01877131, 0.22508281, -0.2055339, -0.09347..."


In [74]:
# Write the expanded frame to CSV without the index column.
# NOTE(review): the words / sentences / bigrams / trigrams / average_word_embedding
# columns hold Python lists and arrays; CSV presumably stores them as their text
# representation, so a reader gets strings back — confirm downstream code expects that.
essays.to_csv("../data/essays_expanded.csv", index=False)