# Imports and Load Data

In [None]:
# Install and load spaCy, then extend the pipeline with a syllable counter.
# NOTE: neither pip install below pins a version, so a fresh run may pull
# different releases than the ones this notebook was developed with.

print('Starting spaCy install...')
!pip install -U spacy --quiet
import spacy
print('spaCy version:', spacy.__version__)
print('Starting download...')
!python -m spacy download en_core_web_lg --quiet
print("Loading spacy.load('en_core_web_lg')")
# `nlp` is the shared pipeline object used by the feature-engineering cells below
nlp = spacy.load("en_core_web_lg")
print('Finished with core spacy module!')

print('Starting spacy syllables download...')
!pip install spacy_syllables --quiet
# SpacySyllables is not referenced by name afterwards; presumably the import is
# what makes the "syllables" component available to add_pipe - TODO confirm
from spacy_syllables import SpacySyllables
nlp.add_pipe("syllables", after="tagger")
# Sanity check 1: the new component sits directly after the tagger. The exact
# list is tied to the pipeline layout of the installed en_core_web_lg model.
assert nlp.pipe_names == ["tok2vec", "tagger", "syllables", "parser",  "attribute_ruler", "lemmatizer", "ner"]
# Sanity check 2: tokens expose the `_.syllables` and `_.syllables_count`
# extension attributes with the expected values for a known phrase.
doc = nlp("terribly long")
data = [(token.text, token._.syllables, token._.syllables_count) for token in doc]
assert data == [("terribly", ["ter", "ri", "bly"], 3), ("long", ["long"], 1)]
print('If no assertion errors above, then you are finished installing spacy syllables')

Starting spaCy install...
spaCy version: 3.4.3
Starting download...
[K     |████████████████████████████████| 587.7 MB 18 kB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
Loading spacy.load('en_core_web_lg')
Finished with core spacy module!
Starting spacy syllables download...
[K     |████████████████████████████████| 1.9 MB 4.7 MB/s 
[?25hFinished with spacy syllables


In [None]:
import os
import re
import string
import sklearn

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from google.colab import drive
from textblob import TextBlob
from collections import defaultdict

## Global Variables

In [None]:
# Names of the six rubric score columns (they match the score columns shown in
# the y_train preview)
RUBRIC_COLS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

# Universal POS tags (v2) - used by spaCy
# NOTE: this is a set, so its iteration order is not alphabetical and should
# not be relied on; sort it before using the tags as ordered column labels.
UPOS_TAGS = {'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ',
             'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
             'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SPACE',
             'SYM', 'VERB', 'X'}

# Not referenced anywhere else in the visible part of this notebook
USER = 'Kurt'

## Mount Drive

In [None]:
# Mount drive where you will do your work.
drive.mount('/content/drive')
# Both paths are redacted placeholders; replace them with real values before
# running. They are joined by plain string concatenation
# (root_dir + project_folder) below, so root_dir must end with a path
# separator or project_folder must start with one.
root_dir = "NOT SHOW FOR PRIVACY PURPOSES"
project_folder = "NOT SHOW FOR PRIVACY PURPOSES"

def create_and_set_working_directory(project_folder):
  """
  Make sure the project folder exists, then make it the working directory.

  Args:
  - project_folder (string): folder path that is appended to the global
    `root_dir` by plain string concatenation (no separator is inserted)

  Returns:
  - None. Side effects: may create the folder (and any missing parents),
    prints a message when it does, and changes the current working directory.
  """
  project_path = root_dir + project_folder

  # makedirs (unlike mkdir) also creates missing parent folders, and
  # exist_ok=True keeps this from failing if the folder appears between the
  # isdir check and the creation call.
  if not os.path.isdir(project_path):
    os.makedirs(project_path, exist_ok=True)
    print(project_path + ' did not exist but was created.')

  # change the OS to use your project folder as the working directory
  os.chdir(project_path)

  # create a test file to make sure it shows up in the right place
  # to test if all is working, you can uncomment these two lines below--it should write a file to the shared drive
  # !touch 'new_file_in_working_directory.txt'
  # print('\nYour working directory was changed to ' + root_dir + project_folder + \
  #       "\n\nAn empty text file was created there. You can also run !pwd to confirm the current working directory." )

# Switch to the project folder so the relative 'data/...' paths used below
# resolve. NOTE: create_and_set_working_directory() above is defined but never
# called, so this fails if the folder does not already exist.
os.chdir(root_dir + project_folder)

Mounted at /content/drive


## Read train, val, and test CSVs

In [None]:
# Load the interim feature (X_*) and target (y_*) tables for all three splits.
# Paths are relative to the working directory set in the previous cell.
X_train = pd.read_csv('data/interim/X_train.csv')
y_train = pd.read_csv('data/interim/y_train.csv')
X_val = pd.read_csv('data/interim/X_val.csv')
y_val = pd.read_csv('data/interim/y_val.csv')
X_test = pd.read_csv('data/interim/X_test.csv')
y_test = pd.read_csv('data/interim/y_test.csv')
# Report (rows, columns) for each table as a quick sanity check
print("X_train size is: {}".format(X_train.shape))
print("y_train size is: {}".format(y_train.shape))
print("X_val size is: {}".format(X_val.shape))
print("y_val size is: {}".format(y_val.shape))
print("X_test size is: {}".format(X_test.shape))
print("y_test size is: {}".format(y_test.shape))
# Preview the first rows of the training features and targets
display(X_train.head(3))
display(y_train.head(3))

X_train size is: (2347, 3)
y_train size is: (2347, 8)
X_val size is: (782, 3)
y_val size is: (782, 8)
X_test size is: (782, 3)
y_test size is: (782, 8)


Unnamed: 0,essay_index,text_id,full_text
0,2759,CA7CE4CE7ED3,Has there been anyone in your life that shared...
1,598,2D508127DEF1,There is a debate today day about whether the ...
2,599,2D5A9BEEB30D,Do you agree or disagree with extending the sc...


Unnamed: 0,essay_index,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,2759,CA7CE4CE7ED3,4.0,3.5,3.5,3.5,3.0,3.5
1,598,2D508127DEF1,3.0,3.0,4.0,3.0,4.0,4.0
2,599,2D5A9BEEB30D,4.0,4.5,4.0,4.0,4.5,5.0


# Standardize Anonymization

In [None]:
# Example of '_NAME' in a text
X_train[X_train['text_id'] == 'AEE8A576989C']

Unnamed: 0,essay_index,text_id,full_text
1533,2352,AEE8A576989C,Dear friend\n\nmy name is STUDENT_NAME\n\nam c...


In [None]:
def replace_strings(col, replace_dict):
    """
    Update essays to include only one format of anonymized values. Ex: Instead
    of using 'Generic_Name' and 'Generic_NAME', we would only have
    'Generic_Name'.

    Args:
        col: series from a pandas dataframe that contains text
        replace_dict: dictionary whose keys are the existing (non-standard)
        substrings and whose values are the standardized substrings that
        replace them. Entries are applied in the dictionary's order, each one
        to the result of the previous replacement.

    Returns:
        New pandas series of text with standardization implemented (the input
        series is not modified)
    """
    for existing_word, replacement_word in replace_dict.items():
        # regex=False: keys are literal text, not patterns. Being explicit
        # keeps the result the same across pandas versions, whose default for
        # `regex` has changed over time.
        col = col.str.replace(existing_word, replacement_word, regex=False)

    return col

In [None]:
# Maps each existing spelling (key) to its standardized form (value)
replace_dict = {'_NAME':'_Name',
                'STUDENT_': 'Student_',
                '_school':'_School'}

# Apply the same standardization to the text column of every split
X_train['full_text'] = replace_strings(X_train['full_text'], replace_dict)
X_val['full_text'] = replace_strings(X_val['full_text'], replace_dict)
X_test['full_text'] = replace_strings(X_test['full_text'], replace_dict)

In [None]:
X_train[X_train['text_id'] == 'AEE8A576989C']

Unnamed: 0,essay_index,text_id,full_text
1533,2352,AEE8A576989C,Dear friend\n\nmy name is Student_Name\n\nam c...


# Feature Engineering with spaCy

## Create spaCy Doc objects from each essay

In [None]:
%%time
# Create a column of spacy.tokens.doc.Doc that we can later use for feature
# engineering. Takes ~5 minutes to run.
# Each essay is passed through the shared `nlp` pipeline one at a time, and the
# resulting Doc is stored next to its source text.
X_train['spacy_doc'] = X_train['full_text'].apply(lambda text: nlp(text))
X_val['spacy_doc'] = X_val['full_text'].apply(lambda text: nlp(text))
X_test['spacy_doc'] = X_test['full_text'].apply(lambda text: nlp(text))
display(X_train.head(3))
display(X_val.head(3))
display(X_val.head(3))

Unnamed: 0,essay_index,text_id,full_text,spacy_doc
0,2759,CA7CE4CE7ED3,Has there been anyone in your life that shared...,"(Has, there, been, anyone, in, your, life, tha..."
1,598,2D508127DEF1,There is a debate today day about whether the ...,"(There, is, a, debate, today, day, about, whet..."
2,599,2D5A9BEEB30D,Do you agree or disagree with extending the sc...,"(Do, you, agree, or, disagree, with, extending..."


Unnamed: 0,essay_index,text_id,full_text,spacy_doc
0,2066,9B63E800B429,Do you think attitude is a big part of life? I...,"(Do, you, think, attitude, is, a, big, part, o..."
1,1242,5FFB959ED7BA,"I agree.\n\nI agree with this statement, becau...","(I, agree, ., \n\n, I, agree, with, this, stat..."
2,453,222D14189E1F,Imagine that the school plans to add more time...,"(Imagine, that, the, school, plans, to, add, m..."


CPU times: user 4min 42s, sys: 2.97 s, total: 4min 45s
Wall time: 4min 44s


## Helper functions

In [None]:
def calc_mean_sentence_length(doc):
    """
    Average number of tokens per sentence in a spacy doc, ignoring punctuation.

    Every token whose `is_punct` flag is false counts towards its sentence's
    length; no other filtering is applied.

    Args:
    - doc (spacy doc): a parsed doc exposing `.sents`

    Returns:
    - mean_sentence_length (float): mean of the per-sentence token counts

    """
    tokens_per_sentence = [
        sum(1 for token in sent if not token.is_punct)
        for sent in doc.sents
    ]

    return np.array(tokens_per_sentence).mean()

In [None]:
def calc_variance_sentence_length(doc):
    """
    Variance of the per-sentence token counts in a spacy doc, ignoring
    punctuation.

    A sentence's length is the number of its tokens whose `is_punct` flag is
    false. The variance is taken with NumPy's default settings for `.var()`.

    Args:
    - doc (spacy doc): a parsed doc exposing `.sents`

    Returns:
    - var_sentence_length (float): the variance of sentence lengths

    """
    def _non_punct_len(sentence):
        # Count the tokens in one sentence that are not punctuation
        return len([token for token in sentence if not token.is_punct])

    lengths = np.array([_non_punct_len(sentence) for sentence in doc.sents])

    return lengths.var()

In [None]:
def contraction_count(text):
    """
    Counts the number of contractions in a text

    A contraction is detected by its suffix: an apostrophe (straight ' or
    typographic \u2019) followed by one of re, s, d, ll, t, ve, m and then a
    word boundary. Each occurrence is counted exactly once, so "won't" and
    "can't" count as one contraction each (their "'t" suffix).

    Caveats:
    - "'s" also matches possessives (e.g. "father's"), which therefore count.
    - Matching is case-sensitive.

    Args:
    - text (str): a text as a string

    Returns:
    - count (int): the number of contractions
    """
    # Single pass with one alternation so no contraction is counted twice.
    # The trailing \b stops an opening quote such as 'dog' or 'so' from being
    # mistaken for a contraction suffix.
    suffix_pattern = r"['\u2019](?:re|s|d|ll|t|ve|m)\b"

    return len(re.findall(suffix_pattern, text))

In [None]:
def make_pos_counts(doc):
    """
    Makes parts of speech counts for Universal Parts of Speech tags (v2) using 
    spaCy

    The result always has exactly one entry per tag in UPOS_TAGS: tags that do
    not occur in the doc get a count of 0, and any `pos_` value that is not in
    UPOS_TAGS is ignored so the length never varies between docs.

    Args:
    - doc (spacy doc): doc of essay text

    Returns:
    - counts (pd.Series): a pandas Series of the counts for each part of speech
      in alphabetical order. Ex: ADJ is first count, ADP is second, X is last.
      The Series index holds the matching tag names.
    """

    # Tally every token's coarse-grained POS tag
    tally = defaultdict(int)
    for token in doc:
        tally[token.pos_] += 1

    # Read the tally back in a fixed, alphabetical tag order; the defaultdict
    # yields 0 for tags that never appeared in the doc.
    ordered_tags = sorted(UPOS_TAGS)
    counts = pd.Series([tally[pos_tag] for pos_tag in ordered_tags],
                       index=ordered_tags)

    return counts
    

In [None]:
def calc_fk_score(row):
    """Calculates the Flesch Reading Ease score (one of the Flesch-Kincaid
    readability tests) given a row of pandas dataframe that contains word,
    sentence, and syllable count.

    Formula: 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)

    Args:
    - row: mapping-like object (e.g. a DataFrame row) with the keys
      'word_count', 'sentence_count' and 'syllable_count'

    Returns:
    - score (float): the reading ease score, or NaN when the word count or
      sentence count is zero (the score is undefined for empty text)
    """
    word_count = float(row['word_count'])
    sentence_count = float(row['sentence_count'])
    syllable_count = float(row['syllable_count'])

    # Both ratios below divide by one of these counts
    if word_count == 0 or sentence_count == 0:
        return float('nan')

    words_per_sentence = word_count / sentence_count
    syllables_per_word = syllable_count / word_count

    score = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word

    return score

## Create text features

In [None]:
def create_text_features_spacy(df, text_col, doc_col):
    """
    Generate engineered features from text using some basic string methods and
    spaCy methods

    Args:
    - df (Pandas DataFrame): dataframe with text
    - text_col (string): name of column that contains source text as a string
    - doc_col (string): name of column that contains source text as a 
      spacy.tokens.doc.Doc datatype

    Returns:
    - df (pandas dataframe): dataframe with original columns plus engineered
      features
    - Note that dataframe is also edited in place

    Notes:
    - The implementation has NOT been optimized and redundant operations occur
      (e.g. the syllable count re-parses every text with `nlp`).
    - One column per tag in UPOS_TAGS is added, in alphabetical tag order.
    - The 'vocabulary' feature (number of distinct words) shares its name with
      the 'vocabulary' rubric score listed in RUBRIC_COLS, so take care when
      combining these features with the target table.
    """

    # Paragraphs : We may want to not use this since it's not 100% accurate 
    # and/or remove newline characters after we calculate paragraph count
    df['paragraph_count'] = df[text_col].apply(lambda x: len(x.split('\n')))

    # Characters
    df['punctuation_count'] = df[doc_col].apply(lambda doc: len([token.text for token in doc if token.is_punct]))
    df['character_count'] = df[text_col].apply(lambda text: len(str(text)))
    df['contraction_count'] = df[text_col].apply(contraction_count)

    # Parts of speech
    # make_pos_counts returns its counts in alphabetical tag order and the
    # assignment below pairs columns by position, so the labels must be sorted
    # too. UPOS_TAGS is a set: list(UPOS_TAGS) has an arbitrary order and would
    # attach the counts to the wrong tag names.
    pos_columns = sorted(UPOS_TAGS)
    df[pos_columns] = df[doc_col].apply(make_pos_counts)

    #Syllables - doesn't work directly using doc_col so have to use nlp(text_col)
    df['syllable_count'] = df[text_col].apply(lambda text: sum([token._.syllables_count for token in nlp(text) if token._.syllables_count]))

    # Words
    df['words'] = df[doc_col].apply(lambda doc: [token.text for token in doc if not token.is_punct]) # Creates list of words for downstream features
    df['word_count'] = df['words'].apply(len)
    df['title_count'] = df['words'].apply(lambda word_tokens: len([w for w in word_tokens if w.istitle()])) # w3schools.com/python/ref_string_istitle.asp
    df['mean_word_length'] = df['words'].apply(lambda word_tokens: np.mean([len(word) for word in word_tokens]))
    df['variance_word_length'] = df['words'].apply(lambda word_tokens: np.var([len(word) for word in word_tokens]))
    df['vocabulary'] = df['words'].apply(lambda word_tokens: len(set(word_tokens)))
    df['stopwords'] = df[doc_col].apply(lambda doc: [token.text for token in doc if token.is_stop])
    df['stopword_count'] = df['stopwords'].apply(len)

    # Sentences
    df['sentence_count'] = df[doc_col].apply(lambda doc: len(list(doc.sents)))
    df['mean_sentence_length'] = df[doc_col].apply(calc_mean_sentence_length)
    df['variance_sentence_length'] = df[doc_col].apply(calc_variance_sentence_length)

    # TextBlob - analyse each text once and read both scores from the result
    sentiments = [TextBlob(text).sentiment for text in df[text_col]]
    df['polarity'] = [sentiment.polarity for sentiment in sentiments]
    df['subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]

    # Flesch-Kincaid Reading Ease Score
    df['fk_score'] = df[['word_count', 'sentence_count', 'syllable_count']].apply(calc_fk_score, axis=1)

    return df

In [None]:
# Add text features to df's
create_text_features_spacy(X_train, 'full_text', 'spacy_doc')
create_text_features_spacy(X_val, 'full_text', 'spacy_doc')
create_text_features_spacy(X_test, 'full_text', 'spacy_doc')

display(X_train.head(3))

Unnamed: 0,essay_index,text_id,full_text,spacy_doc,paragraph_count,punctuation_count,character_count,contraction_count,SCONJ,NUM,...,variance_word_length,vocabulary,stopwords,stopword_count,sentence_count,mean_sentence_length,variance_sentence_length,polarity,subjectivity,fk_score
0,2759,CA7CE4CE7ED3,Has there been anyone in your life that shared...,"(Has, there, been, anyone, in, your, life, tha...",7,33,2508,3,22,40,...,12.013955,192,"[Has, there, been, anyone, in, your, that, any...",323,25,19.92,101.9136,0.064453,0.397142,83.344453
1,598,2D508127DEF1,There is a debate today day about whether the ...,"(There, is, a, debate, today, day, about, whet...",9,42,2928,2,44,62,...,5.811991,222,"[There, is, a, about, whether, the, should, fr...",270,26,19.153846,121.284024,0.13786,0.431789,65.265473
2,599,2D5A9BEEB30D,Do you agree or disagree with extending the sc...,"(Do, you, agree, or, disagree, with, extending...",9,24,1306,4,11,18,...,4.197651,83,"[Do, you, or, with, the, I, with, the, The, wh...",133,17,14.411765,82.242215,0.135,0.519167,88.284916


# Regex

In [None]:
def preprocess_regex(text):
    """
    Blanks out web addresses, @-handles, digits, and a handful of atypical
    characters, replacing each match with a single space.

    The substitutions run in a fixed order, each on the output of the one
    before it. An @-handle is only matched when at least one non-word
    character follows it, and that trailing run is replaced along with it.

    Args:
    - text (string)

    Returns:
    - text (string)
    """
    patterns_in_order = (
        r'http\S+|www\.\S+',    # https and www stuff
        r'@[0-9a-zA-Z]*\W+',    # things like @email123
        r'\d+',                 # digits
        r'[#\/=*|<>^]',         # assortment of atypical characters
    )

    for pattern in patterns_in_order:
        text = re.sub(pattern, ' ', text)

    return text

In [None]:
# Check to see if preprocess regex works as intended
sample_text = 'https:// www.webpage.com Our \'fffff\'Deeds are the . Reason of @insta this #earthquake M 3'
print(preprocess_regex(sample_text))

    Our 'fffff'Deeds are the . Reason of  this  earthquake M  


In [None]:
# X_train['full_text'] = X_train['full_text'].apply(preprocess_regex)
# example = X_train.iloc[:2, :]
# example['full_text'].apply(preprocess_regex)

0    Has there been anyone in your life that shared...
1    There is a debate today day about whether the ...
Name: full_text, dtype: object

In [None]:
# Apply regex preprocessing
X_train['full_text'] = X_train['full_text'].apply(preprocess_regex)
X_val['full_text'] = X_val['full_text'].apply(preprocess_regex)
X_test['full_text'] = X_test['full_text'].apply(preprocess_regex)

# Check first training example after preprocessing
display(X_train.iloc[0]['full_text'])

"Has there been anyone in your life that shared any wisdom or experience with you? If not, why not ask someone that is important to you? They would love to share maybe a dad, mom ,grandparents they have tones of experience in the past. A person who has shared wisdom or experience with me would be my father ,because he has showed me how to be respectful to adults and show me how to work some business papers.\n\nOne thing that my father had shared with me is how to respect adults. For example, my father wasn't always the best child. He would do bad choices like going out with his friends all the time and not listen to this parents. But as he got older he realized no one though he was serious and rejected him. They thought he was trouble child, Soon when I was born ,and when i was able to talk and walk my. My father would make me say please and thank you. As well as listening to the adults, bow when entering a door and greeting them. My father did not want me to me like him when he was yo

# Format columns

In [None]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2347 entries, 0 to 2346
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   essay_index               2347 non-null   int64  
 1   text_id                   2347 non-null   object 
 2   full_text                 2347 non-null   object 
 3   spacy_doc                 2347 non-null   object 
 4   paragraph_count           2347 non-null   int64  
 5   punctuation_count         2347 non-null   int64  
 6   character_count           2347 non-null   int64  
 7   contraction_count         2347 non-null   int64  
 8   SCONJ                     2347 non-null   int64  
 9   NUM                       2347 non-null   int64  
 10  ADV                       2347 non-null   int64  
 11  PRON                      2347 non-null   int64  
 12  DET                       2347 non-null   int64  
 13  AUX                       2347 non-null   int64  
 14  ADJ     

In [None]:
def convert_64_to_32(df):
    """
    Downcast 64 bit numeric columns to 32 bit to save space.

    int64 columns become int32 and float64 columns become float32; columns of
    any other dtype are left alone. Values are not checked against the 32 bit
    range before casting.

    Args:
    - df (pandas dataframe): dataframe to convert; it is modified in place

    Returns:
    - df (pandas dataframe): the same dataframe object that was passed in
    """
    dtype_pairs = ((np.int64, np.int32), (np.float64, np.float32))

    for wide_dtype, narrow_dtype in dtype_pairs:
        wide_cols = df.select_dtypes(wide_dtype).columns
        df[wide_cols] = df.select_dtypes(wide_dtype).astype(narrow_dtype)

    return df

In [None]:
# Downcast the numeric columns of every split. The return values are not
# captured because convert_64_to_32 modifies each dataframe in place.
convert_64_to_32(X_train)
convert_64_to_32(y_train)
convert_64_to_32(X_val)
convert_64_to_32(y_val)
convert_64_to_32(X_test)
convert_64_to_32(y_test)

# Confirm the new dtypes on the training tables
display(X_train.head(3))
display(y_train.head(3))
X_train.info()

Unnamed: 0,essay_index,text_id,full_text,spacy_doc,paragraph_count,punctuation_count,character_count,contraction_count,SCONJ,NUM,...,variance_word_length,vocabulary,stopwords,stopword_count,sentence_count,mean_sentence_length,variance_sentence_length,polarity,subjectivity,fk_score
0,2759,CA7CE4CE7ED3,Has there been anyone in your life that shared...,"(Has, there, been, anyone, in, your, life, tha...",7,33,2508,3,22,40,...,12.013955,192,"[Has, there, been, anyone, in, your, that, any...",323,25,19.92,101.913597,0.064453,0.397142,83.344452
1,598,2D508127DEF1,There is a debate today day about whether the ...,"(There, is, a, debate, today, day, about, whet...",9,42,2928,2,44,62,...,5.811991,222,"[There, is, a, about, whether, the, should, fr...",270,26,19.153847,121.284027,0.13786,0.431789,65.265472
2,599,2D5A9BEEB30D,Do you agree or disagree with extending the sc...,"(Do, you, agree, or, disagree, with, extending...",9,24,1306,4,11,18,...,4.197651,83,"[Do, you, or, with, the, I, with, the, The, wh...",133,17,14.411765,82.242218,0.135,0.519167,88.28492


Unnamed: 0,essay_index,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,2759,CA7CE4CE7ED3,4.0,3.5,3.5,3.5,3.0,3.5
1,598,2D508127DEF1,3.0,3.0,4.0,3.0,4.0,4.0
2,599,2D5A9BEEB30D,4.0,4.5,4.0,4.0,4.5,5.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2347 entries, 0 to 2346
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   essay_index               2347 non-null   int32  
 1   text_id                   2347 non-null   object 
 2   full_text                 2347 non-null   object 
 3   spacy_doc                 2347 non-null   object 
 4   paragraph_count           2347 non-null   int32  
 5   punctuation_count         2347 non-null   int32  
 6   character_count           2347 non-null   int32  
 7   contraction_count         2347 non-null   int32  
 8   SCONJ                     2347 non-null   int32  
 9   NUM                       2347 non-null   int32  
 10  ADV                       2347 non-null   int32  
 11  PRON                      2347 non-null   int32  
 12  DET                       2347 non-null   int32  
 13  AUX                       2347 non-null   int32  
 14  ADJ     

# Write to File

In [None]:
# Write preprocessed tables to file
# X_train.to_csv('data/processed/X_train.csv')
# X_val.to_csv('data/processed/X_val.csv')
# X_test.to_csv('data/processed/X_test.csv')

# y_train.to_csv('data/processed/y_train.csv')
# y_val.to_csv('data/processed/y_val.csv')
# y_test.to_csv('data/processed/y_test.csv')

# Read in processed files
Use this to check the files saved to disk, or to continue with additional preprocessing without re-running the entire notebook.

In [None]:
# Pull in processed data
X_train = pd.read_csv('data/processed/X_train.csv')
y_train = pd.read_csv('data/processed/y_train.csv')
X_val = pd.read_csv('data/processed/X_val.csv')
y_val = pd.read_csv('data/processed/y_val.csv')

# Drop pandas Unnamed: 0 column
X_train.drop(columns='Unnamed: 0', inplace=True)
y_train.drop(columns='Unnamed: 0', inplace=True)
X_val.drop(columns='Unnamed: 0', inplace=True)
y_val.drop(columns='Unnamed: 0', inplace=True)

print("X_train size is: {}".format(X_train.shape))
print("y_train size is: {}".format(y_train.shape))
print("X_val size is: {}".format(X_val.shape))
print("y_val size is: {}".format(y_val.shape))
display(X_train.head(1))
display(y_train.head(1))

X_train size is: (2347, 41)
y_train size is: (2347, 8)
X_val size is: (782, 41)
y_val size is: (782, 8)


Unnamed: 0,essay_index,text_id,full_text,spacy_doc,paragraph_count,punctuation_count,character_count,contraction_count,SCONJ,NUM,...,variance_word_length,vocabulary,stopwords,stopword_count,sentence_count,mean_sentence_length,variance_sentence_length,polarity,subjectivity,fk_score
0,2759,CA7CE4CE7ED3,Has there been anyone in your life that shared...,Has there been anyone in your life that shared...,7,33,2508,3,22,40,...,12.013955,192,"['Has', 'there', 'been', 'anyone', 'in', 'your...",323,25,19.92,101.9136,0.064453,0.397142,83.34445


Unnamed: 0,essay_index,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,2759,CA7CE4CE7ED3,4.0,3.5,3.5,3.5,3.0,3.5
