# Surface Features
Surface features include the nonlinguistic characteristics, such as total number of words per essay, average sentence length, and average word length, as well as linguistic characteristics, such as grammatical errors, and constructions. This section evaluates the effect of linguistic and non-linguistic surface features on essay scores.   

In [22]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import logging.config
from IPython.display import display, HTML
from atelier.data.io import YamlIO


In [23]:
# Seaborn
sns.set_palette("Blues_r")
sns.set_style("whitegrid")
# Pandas
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_colwidth", 5000)
# Configurations
LOGGING_CONFIG_FILE = "config/logging.yml"
FEATURE_CONFIG_FILE = "config/features.yml"
# Logging
io =  YamlIO()
LOGGING_CONFIG = io.read(LOGGING_CONFIG_FILE)
logging.config.dictConfig(LOGGING_CONFIG)
logger = logging.getLogger(__name__)
# Feature filepaths
FEATURE_STORE = io.read(FEATURE_CONFIG_FILE)

## Non-Linguistic Surface Features
Seven non-linguistic surface features will be essays:
1. Number of Words    
2. number of Unique Words    
3. Type-Token Ratio   
4. Number of Paragraphs    
5. Number of Sentences      
6. Average Word Length
7. Average Sentence Length


### Non-Linguistic Surface Features Descriptive Statistics

In [24]:
analytics = pd.read_csv(FEATURE_STORE['sinlp']['results'], index_col=None)[['text', ' number words', ' number types', ' TTR', ' Letters per word',
       ' number paragraphs', ' number of sentences',
       ' number of words per sentence']]
analytics['text_id'] = analytics['text'].apply(lambda x: os.path.basename(x))
analytics.head()
analytics.drop('text', inplace=True, axis=1)
analytics[[' number words', ' number types', ' TTR', ' Letters per word',
       ' number paragraphs', ' number of sentences',
       ' number of words per sentence']].describe().T

Unnamed: 0,text,number words,number types,TTR,Letters per word,number paragraphs,number of sentences,number of words per sentence,text_id
0,//wsl$/Ubuntu/home/john/projects/AES_ELLs/data/essays\E446D60AD551.txt,1091,336,0.31,4.48,17,23,47.43,essays\E446D60AD551.txt
1,//wsl$/Ubuntu/home/john/projects/AES_ELLs/data/essays\770E1717845C.txt,460,195,0.42,4.35,11,45,10.22,essays\770E1717845C.txt
2,//wsl$/Ubuntu/home/john/projects/AES_ELLs/data/essays\192D02CF8C9E.txt,616,195,0.32,3.84,13,22,28.0,essays\192D02CF8C9E.txt
3,//wsl$/Ubuntu/home/john/projects/AES_ELLs/data/essays\40AA29D2F32B.txt,700,216,0.31,3.8,49,33,21.21,essays\40AA29D2F32B.txt
4,//wsl$/Ubuntu/home/john/projects/AES_ELLs/data/essays\523130079501.txt,719,277,0.39,4.55,13,43,16.72,essays\523130079501.txt


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number words,3911.0,430.49,191.87,14.0,294.0,402.0,526.5,1260.0
number types,3911.0,165.45,56.72,13.0,124.0,158.0,198.0,439.0
TTR,3911.0,0.41,0.08,0.15,0.35,0.4,0.46,0.93
Letters per word,3911.0,4.26,0.28,3.3,4.07,4.26,4.44,5.6
number paragraphs,3911.0,10.08,6.23,1.0,7.0,9.0,11.0,103.0
number of sentences,3911.0,18.8,10.49,1.0,11.0,17.0,25.0,100.0
number of words per sentence,3911.0,28.88,25.41,6.34,17.89,22.5,31.43,565.5


### Non-Linguistic Surface Features Distribution

In [20]:
analytics_long = pd.melt(analytics, value_vars=['text_id',' number words', ' number types', ' TTR', ' Letters per word',
       ' number paragraphs', ' number of sentences',
       ' number of words per sentence'], var_name='feature', value_name='value')
analytics_long_features = analytics_long.drop('text_id', axis=1, inplace=False)       
fig, ax = plt.subplots(figsize=(12,6))
_ = sns.boxplot(x='value', y='feature', data=analytics_long_features,ax=ax).set(title="Distribution of Non-Linguistic Surface Features")
plt.show()


KeyError: "['text_id'] not found in axis"