In [1]:
#! /usr/bin/env python

import pandas
import re
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import *
from nltk.stem.porter import *

# =-=-=-=-=-=
# Load the talks CSV into a DataFrame, then extract the talk texts
# =-=-=-=-=-=

# Column labels handed to read_csv through `names=`.
# NOTE(review): with `names=` and no `header=`, pandas treats the first line
# of the file as data -- confirm talks_2.csv has no header row.
colnames = ['author', 'title', 'date', 'length', 'text']

# Build the pandas DataFrame from the CSV file
df = pandas.read_csv('../data/talks_2.csv', names=colnames)

# One list entry per row, taken from the 'text' column
talks = df['text'].tolist()

In [2]:
# Test set: the first 100 entries of `talks` (fewer if the list is shorter)
test = talks[:100]

In [3]:
# =-=-=-=-=-=
# Clean and Tokenize, then Drop Stopwords
# =-=-=-=-=-=
#
# Inputs : `test` (list of document strings, built in an earlier cell)
# Outputs: `texts` -- one list of lower-cased, stopword-free tokens per
#          document, in the same order as `test`.

# Load tokenizer, stopwords, and stemmers
tokenizer = WhitespaceTokenizer()

# Read the stopword file inside a context manager so the handle is closed,
# and use a raw-string pattern so `\s` is a regex class, not a str escape.
with open('../data/tt_stop.txt', 'r') as stop_file:
    stopwords = re.split(r'\s+', stop_file.read().lower())

# Set copy used for fast membership tests; `stopwords` itself stays a list
# so any other cell that uses it as a list keeps working.
stopword_set = set(stopwords)

stemmer = SnowballStemmer("english", ignore_stopwords=True)
p_stemmer = PorterStemmer()

# Accumulates one token list per document
texts = []

# Loop through the document list
for doc in test:

    # Drop every character that is not a word character, an apostrophe or
    # whitespace, then lower-case the result
    raw = re.sub(r"[^\w\d'\s]+", '', doc).lower()

    # Split the cleaned string on whitespace
    tokens = tokenizer.tokenize(raw)

    # Remove stop words from tokens
    stopped_tokens = [tok for tok in tokens if tok not in stopword_set]

    # Store the *filtered* tokens: appending the raw `tokens` here would
    # silently throw away the stop-word removal done just above.
    texts.append(stopped_tokens)

In [28]:
# =-=-=-=-=-=
# Experiments in stemming corpus
# =-=-=-=-=-=
#
# Inputs : `test`, `tokenizer`, `stopwords`, `p_stemmer` (from earlier cells)
# Outputs: `unstemmed` / `stemmed`            -- per-document token lists
#          `unstemmeds` / `stemmeds`          -- flattened token lists
#          `unstemmed_words` / `stemmed_words`-- vocabularies (sets)
#          `stemmed_freq_dict`                -- stem -> occurrence count
#          `stemmed_freq_list`, `stemmed_word_list` -- (count, stem) and
#          (stem, count) tuples; the latter is sorted and printed.

# Set copy of the stopwords for fast membership tests inside the loop
stopword_set = set(stopwords)

# Build the stemmed and unstemmed versions side by side. `unstemmed` is
# created here (stop-word-filtered tokens before stemming) so this cell no
# longer relies on a variable left behind in the kernel by some other cell.
stemmed = []
unstemmed = []
for doc in test:
    # Same cleaning as the tokenizing cell: keep word chars, apostrophes and
    # whitespace only, then lower-case
    raw = re.sub(r"[^\w\d'\s]+", '', doc).lower()
    tokens = tokenizer.tokenize(raw)
    stopped_tokens = [tok for tok in tokens if tok not in stopword_set]
    stemmed_tokens = [p_stemmer.stem(tok) for tok in stopped_tokens]
    unstemmed.append(stopped_tokens)
    stemmed.append(stemmed_tokens)

# Build master list of words (flattened across documents, duplicates kept):
unstemmeds = [word for doc_tokens in unstemmed for word in doc_tokens]
stemmeds = [word for doc_tokens in stemmed for word in doc_tokens]

# Vocabularies: the distinct words of each version
unstemmed_words = set(unstemmeds)
stemmed_words = set(stemmeds)

# Create dictionary of word:frequency pairs

# The cleaning regex above already strips these characters, so this
# substitution is only a safety net.
punctuation = re.compile(r'[.?!,":;]')
stemmed_freq_dict = {}

for word in stemmeds:
    # remove punctuation marks
    word = punctuation.sub("", word)
    # count the occurrence; dict.get avoids a bare `except:` that would
    # hide unrelated errors
    stemmed_freq_dict[word] = stemmed_freq_dict.get(word, 0) + 1

stemmed_freq_list = [(val, key) for key, val in stemmed_freq_dict.items()]
stemmed_word_list = [(key, val) for key, val in stemmed_freq_dict.items()]

# Alphabetical order by stem
stemmed_word_list.sort()
print(stemmed_word_list)