### Data loading

In [1]:
import pandas as pd
import numpy as np

# Read the essays spreadsheet; the path is relative to the notebook's working directory.
essays = pd.read_excel(io="../data/essays.xlsx")
# Preview the first five rows to check which columns were loaded.
essays.head(n=5)

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",n,y,y,n,y
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",n,n,y,n,n
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,n,y,n,y,y
3,1997_568848.txt,I can't believe it! It's really happening! M...,y,n,y,y,n
4,1997_688160.txt,"Well, here I go with the good old stream of co...",y,n,y,n,y


In [2]:
# Word count of every essay, where a "word" is a whitespace-delimited chunk of TEXT.
lengths = [len(essay_text.split()) for essay_text in essays["TEXT"]]

# Summarise the spread of essay sizes (maximum, minimum, arithmetic mean).
print(f"The maximum essay length is: {max(lengths)}.")
print(f"The minimum essay length is: {min(lengths)}.")
print(f"The average essay length is: {sum(lengths)/len(lengths)}.")

The maximum essay length is: 2500.
The minimum essay length is: 33.
The average essay length is: 652.1333603567085.


In [3]:
# Remove the "#AUTHID" column. Rebinding the result (instead of dropping with
# inplace=True) together with errors="ignore" keeps this cell idempotent:
# running it a second time is a no-op rather than a KeyError for the column
# that is already gone.
essays = essays.drop(columns=["#AUTHID"], errors="ignore")

### Preparation of class labels

In [4]:
# Class-label columns; the raw data stores each label as the string "y" or "n".
personalities = ["cEXT", "cNEU", "cAGR", "cCON", "cOPN"]

# Encode "y" -> 1 and "n" -> 0 for all label columns in a single assignment.
# The earlier per-column `essays[col].replace(..., inplace=True)` mutated a
# temporary obtained through chained indexing: pandas warns about that pattern
# and, with Copy-on-Write enabled, the frame itself is left unchanged.
# Assigning the result back to the frame is reliable, and re-running the cell
# is harmless because values that are already 0/1 are not touched by replace.
essays[personalities] = (
    essays[personalities]
    .replace({"y": 1, "n": 0})
    .infer_objects()  # object columns that now hold only integers become an integer dtype
)

essays.head()

Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1
1,"Well, here we go with the stream of consciousn...",0,0,1,0,0
2,An open keyboard and buttons to push. The thin...,0,1,0,1,1
3,I can't believe it! It's really happening! M...,1,0,1,1,0
4,"Well, here I go with the good old stream of co...",1,0,1,0,1


### Tokenization

In [5]:
import nltk
# Fetch NLTK's 'punkt' resource (skipped by NLTK when it is already up to date).
# NOTE(review): presumably this is what the word_tokenize / sent_tokenize calls in
# the following cells rely on — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jurin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Try the word tokenizer on the first essay before applying it to the whole frame.
print(nltk.word_tokenize(essays["TEXT"].iloc[0]))

['Well', ',', 'right', 'now', 'I', 'just', 'woke', 'up', 'from', 'a', 'mid-day', 'nap', '.', 'It', "'s", 'sort', 'of', 'weird', ',', 'but', 'ever', 'since', 'I', 'moved', 'to', 'Texas', ',', 'I', 'have', 'had', 'problems', 'concentrating', 'on', 'things', '.', 'I', 'remember', 'starting', 'my', 'homework', 'in', '10th', 'grade', 'as', 'soon', 'as', 'the', 'clock', 'struck', '4', 'and', 'not', 'stopping', 'until', 'it', 'was', 'done', '.', 'Of', 'course', 'it', 'was', 'easier', ',', 'but', 'I', 'still', 'did', 'it', '.', 'But', 'when', 'I', 'moved', 'here', ',', 'the', 'homework', 'got', 'a', 'little', 'more', 'challenging', 'and', 'there', 'was', 'a', 'lot', 'more', 'busy', 'work', ',', 'and', 'so', 'I', 'decided', 'not', 'to', 'spend', 'hours', 'doing', 'it', ',', 'and', 'just', 'getting', 'by', '.', 'But', 'the', 'thing', 'was', 'that', 'I', 'always', 'paid', 'attention', 'in', 'class', 'and', 'just', 'plain', 'out', 'knew', 'the', 'stuff', ',', 'and', 'now', 'that', 'I', 'look', 'ba

In [7]:
# Tokenize every essay into a list of word/punctuation tokens, one list per row.
essays["words"] = essays["TEXT"].apply(nltk.word_tokenize)

In [10]:
# Split every essay into a list of sentence strings, one list per row.
essays["sentences"] = essays["TEXT"].apply(nltk.sent_tokenize)

In [11]:
# Inspect the token list stored for the first essay.
essays["words"].iloc[0]

['Well',
 ',',
 'right',
 'now',
 'I',
 'just',
 'woke',
 'up',
 'from',
 'a',
 'mid-day',
 'nap',
 '.',
 'It',
 "'s",
 'sort',
 'of',
 'weird',
 ',',
 'but',
 'ever',
 'since',
 'I',
 'moved',
 'to',
 'Texas',
 ',',
 'I',
 'have',
 'had',
 'problems',
 'concentrating',
 'on',
 'things',
 '.',
 'I',
 'remember',
 'starting',
 'my',
 'homework',
 'in',
 '10th',
 'grade',
 'as',
 'soon',
 'as',
 'the',
 'clock',
 'struck',
 '4',
 'and',
 'not',
 'stopping',
 'until',
 'it',
 'was',
 'done',
 '.',
 'Of',
 'course',
 'it',
 'was',
 'easier',
 ',',
 'but',
 'I',
 'still',
 'did',
 'it',
 '.',
 'But',
 'when',
 'I',
 'moved',
 'here',
 ',',
 'the',
 'homework',
 'got',
 'a',
 'little',
 'more',
 'challenging',
 'and',
 'there',
 'was',
 'a',
 'lot',
 'more',
 'busy',
 'work',
 ',',
 'and',
 'so',
 'I',
 'decided',
 'not',
 'to',
 'spend',
 'hours',
 'doing',
 'it',
 ',',
 'and',
 'just',
 'getting',
 'by',
 '.',
 'But',
 'the',
 'thing',
 'was',
 'that',
 'I',
 'always',
 'paid',
 'attention

In [12]:
# Inspect the sentence list stored for the first essay.
essays["sentences"].iloc[0]

['Well, right now I just woke up from a mid-day nap.',
 "It's sort of weird, but ever since I moved to Texas, I have had problems concentrating on things.",
 'I remember starting my homework in  10th grade as soon as the clock struck 4 and not stopping until it was done.',
 'Of course it was easier, but I still did it.',
 'But when I moved here, the homework got a little more challenging and there was a lot more busy work, and so I decided not to spend hours doing it, and just getting by.',
 "But the thing was that I always paid attention in class and just plain out knew the stuff, and now that I look back, if I had really worked hard and stayed on track the last two years without getting  lazy, I would have been a genius, but hey, that's all good.",
 "It's too late to correct the past, but I don't really know how to stay focused n the future.",
 "The one thing I know is that when  people say that b/c they live on campus they can't concentrate, it's b. s. For me it would be easier ther

In [16]:
# Build the bigrams from the tokens already stored in essays["words"] (created
# by the word-tokenization cell earlier in this section) instead of running
# nltk.word_tokenize over every essay a second time. The resulting tuples are
# the same; only the redundant tokenization pass is removed.
essays["bigrams"] = essays["words"].apply(
    lambda tokens: list(nltk.ngrams(tokens, 2))
)

In [17]:
# Inspect the bigram list stored for the first essay.
essays["bigrams"].iloc[0]

[('Well', ','),
 (',', 'right'),
 ('right', 'now'),
 ('now', 'I'),
 ('I', 'just'),
 ('just', 'woke'),
 ('woke', 'up'),
 ('up', 'from'),
 ('from', 'a'),
 ('a', 'mid-day'),
 ('mid-day', 'nap'),
 ('nap', '.'),
 ('.', 'It'),
 ('It', "'s"),
 ("'s", 'sort'),
 ('sort', 'of'),
 ('of', 'weird'),
 ('weird', ','),
 (',', 'but'),
 ('but', 'ever'),
 ('ever', 'since'),
 ('since', 'I'),
 ('I', 'moved'),
 ('moved', 'to'),
 ('to', 'Texas'),
 ('Texas', ','),
 (',', 'I'),
 ('I', 'have'),
 ('have', 'had'),
 ('had', 'problems'),
 ('problems', 'concentrating'),
 ('concentrating', 'on'),
 ('on', 'things'),
 ('things', '.'),
 ('.', 'I'),
 ('I', 'remember'),
 ('remember', 'starting'),
 ('starting', 'my'),
 ('my', 'homework'),
 ('homework', 'in'),
 ('in', '10th'),
 ('10th', 'grade'),
 ('grade', 'as'),
 ('as', 'soon'),
 ('soon', 'as'),
 ('as', 'the'),
 ('the', 'clock'),
 ('clock', 'struck'),
 ('struck', '4'),
 ('4', 'and'),
 ('and', 'not'),
 ('not', 'stopping'),
 ('stopping', 'until'),
 ('until', 'it'),
 ('it', 

In [18]:
# Build the trigrams from the tokens already stored in essays["words"] (created
# by the word-tokenization cell earlier in this section) instead of running
# nltk.word_tokenize over every essay yet again. The resulting tuples are the
# same; only the redundant tokenization pass is removed.
essays["trigrams"] = essays["words"].apply(
    lambda tokens: list(nltk.ngrams(tokens, 3))
)

In [19]:
# Inspect the trigram list stored for the first essay.
essays["trigrams"].iloc[0]

[('Well', ',', 'right'),
 (',', 'right', 'now'),
 ('right', 'now', 'I'),
 ('now', 'I', 'just'),
 ('I', 'just', 'woke'),
 ('just', 'woke', 'up'),
 ('woke', 'up', 'from'),
 ('up', 'from', 'a'),
 ('from', 'a', 'mid-day'),
 ('a', 'mid-day', 'nap'),
 ('mid-day', 'nap', '.'),
 ('nap', '.', 'It'),
 ('.', 'It', "'s"),
 ('It', "'s", 'sort'),
 ("'s", 'sort', 'of'),
 ('sort', 'of', 'weird'),
 ('of', 'weird', ','),
 ('weird', ',', 'but'),
 (',', 'but', 'ever'),
 ('but', 'ever', 'since'),
 ('ever', 'since', 'I'),
 ('since', 'I', 'moved'),
 ('I', 'moved', 'to'),
 ('moved', 'to', 'Texas'),
 ('to', 'Texas', ','),
 ('Texas', ',', 'I'),
 (',', 'I', 'have'),
 ('I', 'have', 'had'),
 ('have', 'had', 'problems'),
 ('had', 'problems', 'concentrating'),
 ('problems', 'concentrating', 'on'),
 ('concentrating', 'on', 'things'),
 ('on', 'things', '.'),
 ('things', '.', 'I'),
 ('.', 'I', 'remember'),
 ('I', 'remember', 'starting'),
 ('remember', 'starting', 'my'),
 ('starting', 'my', 'homework'),
 ('my', 'homewor