In [7]:
#! /usr/bin/env python

import re

import pandas
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import WhitespaceTokenizer
# from nltk.stem import *
# from nltk.stem.porter import *

# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

# Column labels are supplied explicitly via `names=` when reading the CSV.
colnames = ['author', 'title', 'date', 'length', 'text']
df = pandas.read_csv('../data/talks_2.csv', names=colnames)

# Pull the columns used below out of the frame as plain Python lists.
talks = df['text'].tolist()
authors = df['author'].tolist()
dates = df['date'].tolist()

# Delete every ASCII letter and space from each date string; whatever is
# left over is treated as the year.
years = [re.sub('[A-Za-z ]', '', date_text) for date_text in dates]

# Citation label: "<author> <year>", paired positionally.
authordate = [" ".join(pair) for pair in zip(authors, years)]

# Sanity check that the lists are still aligned: put the citation labels
# and the talk texts side by side in a fresh DataFrame.
cited_texts = pandas.DataFrame(dict(citation=authordate, text=talks))

In [6]:
# =-=-=-=-=-=
# Clean and Tokenize, then Drop Stopwords
# =-=-=-=-=-=

# Load tokenizer, stopwords, and stemmers.
tokenizer = WhitespaceTokenizer()

# Read the stopword file inside a context manager so the handle is closed
# as soon as the contents are in memory. The split pattern is a raw string
# so "\s" reaches the regex engine instead of being parsed as an (invalid)
# string escape.
with open('../data/tt_stop.txt', 'r') as stopword_file:
    stopwords = re.split(r'\s+', stopword_file.read().lower())

# `stopwords` stays a list (same contents as before); this set copy is only
# used for fast membership tests in the loop below.
stopword_set = set(stopwords)

stemmer = SnowballStemmer("english", ignore_stopwords=True)

# Not used by the loop below. PorterStemmer must be imported explicitly in
# the import cell (from nltk.stem.porter import PorterStemmer); without that
# import this line raises NameError on a fresh kernel.
p_stemmer = PorterStemmer()

# One list of stemmed tokens per talk, in the same order as `talks`.
texts = []

for talk in talks:

    # Drop every run of characters that is not a word character, digit,
    # apostrophe, or whitespace, then lowercase and split on whitespace.
    raw = re.sub(r"[^\w\d'\s]+", '', talk).lower()
    tokens = tokenizer.tokenize(raw)

    # Remove stop words from the tokens.
    stopped_tokens = [token for token in tokens if token not in stopword_set]

    # Stem whatever survived stopword removal.
    stemmed_tokens = [stemmer.stem(token) for token in stopped_tokens]

    texts.append(stemmed_tokens)

print(texts[0:5])

[['thank', 'chris', 'truli', 'great', 'honor', 'opportun', 'come', 'stage', 'twice', 'extrem', 'grate', 'blown', 'away', 'confer', 'thank', 'nice', 'comment', 'say', 'night', 'say', 'sincer', 'part', 'mock', 'sob', 'need', 'put', 'posit', 'flew', 'air', 'forc', 'two', 'eight', 'year', 'take', 'shoe', 'boot', 'get', 'airplan', 'tell', 'quick', 'stori', 'illustr', 'like', 'true', 'stori', 'bit', 'true', 'soon', 'tipper', 'left', 'mock', 'sob', 'white', 'hous', 'drive', 'home', 'nashvill', 'littl', 'farm', 'mile', 'east', 'nashvill', 'drive', 'know', 'sound', 'like', 'littl', 'thing', 'look', 'rear', 'view', 'mirror', 'sudden', 'just', 'hit', 'motorcad', 'back', 'heard', 'phantom', 'limb', 'pain', 'rent', 'ford', 'taurus', 'dinnertim', 'start', 'look', 'place', 'eat', 'got', 'exit', 'lebanon', 'tennesse', 'got', 'exit', 'found', 'shoney', 'restaur', 'low', 'cost', 'famili', 'restaur', 'chain', 'know', 'went', 'sat', 'booth', 'waitress', 'came', 'made', 'big', 'commot', 'tipper', 'order', 

In [8]:
# Spot-check the stemmer: run it over a fixed list of inflected words and
# print the resulting stems on a single space-separated line.
plurals = [
    'caresses', 'flies', 'dies', 'mules', 'denied', 'died',
    'agreed', 'owned', 'humbled', 'sized', 'meeting', 'stating',
    'siezing', 'itemization', 'sensational', 'traditional',
    'reference', 'colonizer', 'plotted',
]

singles = list(map(stemmer.stem, plurals))
print(*singles)

caress fli die mule deni die agre own humbl size meet state siez item sensat tradit refer colon plot
