## Importing files sorted on filename ##

Here we import your .txt files and their filenames. We sort the filenames to keep the order of your index.

NB: you need to install the `natsort` module (https://anaconda.org/anaconda/natsort), which you can do either from the Anaconda Prompt or from the Anaconda Navigator. If you use the prompt, run: `conda install natsort`

In [1]:
import os, glob
from natsort import natsorted

def read_txt_dir(dirpath, encoding=None):
    """Read every ``.txt`` file found directly inside ``dirpath``.

    Files are visited in the natural sort order of their paths
    (``natsorted``), so the two returned lists are aligned and ordered.

    Parameters
    ----------
    dirpath : str
        Directory to search; only names matching ``*.txt`` are read.
    encoding : str, optional
        Text encoding handed to ``open``. ``None`` (the default) keeps the
        platform's default encoding.

    Returns
    -------
    files : list of str
        Full content of each file.
    filenames : list of str
        Base name (directory part removed) of each file, in the same order
        as ``files``.
    """
    paths = natsorted(glob.glob(os.path.join(dirpath, "*.txt")))
    files = []
    for path in paths:
        with open(path, "r", encoding=encoding) as fobj:
            files.append(fobj.read())
    # os.path.basename honours the platform's path separator; splitting on "/"
    # returned the whole path on Windows, where glob yields backslashes.
    filenames = [os.path.basename(path) for path in paths]
    return files, filenames

# import articles
# the path is relative to the current working directory
article_path = os.path.join("dat","articles")
articles, article_names = read_txt_dir(article_path)      # file contents, file names

# import magazines
# the path is relative to the current working directory
magazine_path = os.path.join("dat","magazines")
magazines, magazine_names = read_txt_dir(magazine_path)   # file contents, file names

## Tokenization and basic corpus statistics ##

Start by computing basic corpus statistics. For this you need several functions for preprocessing your string data. In this notebook the tokenizer is implemented with `re` alone: the text is split on every run of non-letter characters, which also removes punctuation and digits. `NLTK` is used further down for stemming.

In [3]:
######################################## TOKENIZE ###########################################

import re

# split a string into alphabetic tokens, optionally lowercased
def tokenize(input, length = 0, casefold = True):
    """ split input on every run of non-letter characters
        - input: string to tokenize
        - length: only tokens with more than `length` characters are kept
          (the default 0 merely drops the empty strings left by the split)
        - casefold: lowercase the string before splitting when True
        - output tokens in list, in order of appearance
    """
    text = input.lower() if casefold else input
    pieces = re.split(r'[^A-Za-z]+', text)          # anything but A-Z / a-z separates tokens
    return [piece for piece in pieces if len(piece) > length]

def flatten(l):
    """ concatenate the items of every sub-iterable of l into one flat list
        - only one level of nesting is removed
        - the parameter keeps its original name `l` so keyword calls still work
    """
    return [item for sublist in l for item in sublist]

# one token list per article
articles_tokens = [tokenize(article) for article in articles]

# every token of the corpus in a single flat list, sorted in place
tokens = flatten(articles_tokens)
tokens.sort()

n_tokens = len(tokens)           # total number of tokens
n_types = len(set(tokens))       # number of distinct tokens

print("The corpus consist of {} tokens distributed over {} lexical types".format(n_tokens, n_types))
print("The lexical richness measured as the type-token ratio is {}".format(round(n_types / n_tokens, 4)))
print("On average every word is repeated {} times".format(round(n_tokens / n_types, 2)))

The corpus consist of 289006 tokens distributed over 12336 lexical types
The lexical richness measured as the type-token ratio is 0.0427
On average every word is repeated 23.43 times


## Stopwords ##

In [5]:
from collections import defaultdict
from operator import itemgetter

# build a stopword list: the n most frequent tokens of a tokenized dataset
def gen_ls_stoplist(input, n = 100):
    """ return the n most frequent tokens of input
        - input: iterable of token lists (one list per text)
        - n: number of stopwords to return
        - output tokens in list, most frequent first; tokens with equal
          counts keep the order in which they were first seen
    """
    counts = {}
    for doc in input:
        for word in doc:
            counts[word] = counts.get(word, 0) + 1
    # sorted() is stable, also with reverse=True, so ties stay in first-seen order
    ranked = sorted(counts, key = counts.get, reverse = True)
    return ranked[:n]

# generate stopword list from articles with 50 stopwords
# (the 50 most frequent tokens across all tokenized articles)
sw = gen_ls_stoplist(articles_tokens, 50)

import io

# function to read txt-file and store content in string
def read_txt(filepath):
    """ read the whole text file at filepath, decoded as UTF-8
        - output file content as one string
    """
    # the context manager closes the file even when read() raises
    with io.open(filepath, 'r', encoding = 'utf-8') as f:
        return f.read()

# read the nltk stopword file and tokenize it in one step
nltksw = tokenize(read_txt('Stopwords/english'))

# Testing `token not in <list>` scans the whole list for every token; a set
# gives the same answer with a hash lookup. `sw` and `nltksw` themselves are
# left untouched as lists.
sw_set = set(sw)
nltksw_set = set(nltksw)

# apply sw
no_sw = [[token for token in article if token not in sw_set]
         for article in articles_tokens]

# apply nltksw
no_nltksw = [[token for token in article if token not in nltksw_set]
             for article in articles_tokens]

# apply sw + nltksw (nltksw is filtered out of the sw-filtered articles)
no_swall = [[token for token in article if token not in nltksw_set]
            for article in no_sw]


## Filter tokens of 3 characters or fewer ##

In [8]:
############################ REMOVE TOKENS OF 3 CHARS OR FEWER #################################


# keep only tokens longer than 3 chars, articles without sw
clean_nosw = [[token for token in article if len(token)>3]
              for article in no_sw]

# keep only tokens longer than 3 chars, articles without nltksw
clean_nonltksw = [[token for token in article if len(token)>3]
                  for article in no_nltksw]

# keep only tokens longer than 3 chars, articles without sw + nltk
clean_noswall = [[token for token in article if len(token)>3]
                 for article in no_swall]


# Stemming

In [9]:
######################################### STEMMING  #############################################

from nltk.stem import PorterStemmer

ps = PorterStemmer()

# stem every token, articles without sw
stem_nosw = [[ps.stem(token) for token in article]
             for article in clean_nosw]

# stem every token, articles without nltksw
stem_nonltksw = [[ps.stem(token) for token in article]
                 for article in clean_nonltksw]

# stem every token, articles without sw + nltksw
stem_noswall = [[ps.stem(token) for token in article]
                for article in clean_noswall]

## New corpus statistics

In [12]:
################################### NEW CORPUS STATISTICS ########################################


# statistics are computed on the stemmed articles filtered with the nltk stopword list
clean_tokens = flatten(stem_nonltksw)        # all tokens in one flat list
clean_tokens.sort()                          # sorted in place
n_clean_tokens = len(clean_tokens)           # total number of tokens
n_clean_types = len(set(clean_tokens))       # number of distinct tokens


print("The corpus now consist of {} tokens distributed over {} lexical types".format(n_clean_tokens, n_clean_types))
print("The new lexical richness measured as the type-token ratio is now {}".format(round(n_clean_types / n_clean_tokens, 4)))
print("On average every word is now repeated {} times".format(round(n_clean_tokens / n_clean_types, 2)))



The corpus now consist of 124661 tokens distributed over 7493 lexical types
The new lexical richness measured as the type-token ratio is now 0.0601
On average every word is now repeated 16.64 times


# Word frequency

# Topic modeling

# Association rules