In [50]:
import os
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def read_text(src = None, filetype = 'txt'):
    """
    Read every file whose name ends with `filetype` in a directory and
    compute simple text features for each one.

    Parameters
    ----------
    src : str or None
        Directory to scan. When None, the current working directory is used.
    filetype : str
        Filename suffix to match (checked with ``str.endswith``).

    Returns
    -------
    list of dict
        One dict per matching file, in ``os.listdir`` order, with the keys
        'total word counts', 'unique word counts', 'number of sentences',
        'average length of sentences (characters)' and 'special characters'.
    """
    # Fall back to the cwd only when no directory was supplied.
    wd = os.getcwd() if src is None else src
    files = [f for f in os.listdir(wd) if f.endswith(filetype)]
    extracted = []

    for name in files:
        # os.listdir returns bare names; join with wd so that files are found
        # even when wd is not the process's current directory.
        with open(os.path.join(wd, name)) as t:
            text = t.read()

        # Words are whitespace-delimited tokens; no normalisation is applied.
        words = text.split()
        sentences = sent_tokenize(text)
        # "Special" means anything that is not a letter, a digit or a newline
        # (so spaces and punctuation are both counted).
        n_special = sum(
            1 for c in text
            if not c.isalpha() and not c.isdigit() and c != '\n'
        )

        # Guard against a file that yields no sentences (e.g. an empty file),
        # which would otherwise raise ZeroDivisionError.
        if sentences:
            avg_sentence_len = sum(map(len, sentences)) / len(sentences)
        else:
            avg_sentence_len = 0.0

        extracted.append({
            'total word counts': len(words),
            'unique word counts': len(set(words)),
            'number of sentences': len(sentences),
            'average length of sentences (characters)': avg_sentence_len,
            'special characters': n_special,
        })

    return extracted



# Run the extraction with the defaults (src=None, filetype='txt'), i.e. over
# the .txt files in the current working directory; as the last expression in
# the cell, the returned list of feature dicts is what the notebook displays.
read_text()

[nltk_data] Downloading package punkt to /Users/jialechen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[{'total word counts': 42,
  'unique word counts': 35,
  'number of sentences': 2,
  'average length of sentences (characters)': 139.5,
  'special characters': 44},
 {'total word counts': 41,
  'unique word counts': 33,
  'number of sentences': 3,
  'average length of sentences (characters)': 92.33333333333333,
  'special characters': 44},
 {'total word counts': 11,
  'unique word counts': 8,
  'number of sentences': 1,
  'average length of sentences (characters)': 38.0,
  'special characters': 12}]