In [None]:

#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## 14 TEXT MINING DAY1 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs



In [None]:
#=================================================-
#### Slide 11: Directory settings  ####

# Build the paths from the current user's home directory so that this cell
# runs unchanged on Linux, macOS and Windows. Assigning one hard-coded path
# per operating system in sequence would always leave the last one in effect.
import os

# Location of your `af-werx` folder. It is assumed to sit on the Desktop of
# the current user; replace this line with the real path if it lives elsewhere.
main_dir = os.path.join(os.path.expanduser("~"), "Desktop", "af-werx")

# Make `data_dir` from the `main_dir` and the remainder of the path to the
# data directory. `os.path.join` inserts the separator of the running OS.
data_dir = os.path.join(main_dir, "data")




In [None]:
#=================================================-
#### Slide 12: Loading packages  ####

# Helper packages.
import os
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
# Packages with tools for text processing.
from wordcloud import WordCloud
import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer



In [None]:
#=================================================-
#### Slide 13: Working directory  ####

# Make the data directory the working directory, so that files opened by a
# bare file name from here on are resolved relative to `data_dir`.
os.chdir(data_dir)
# Print the working directory to confirm that the change took effect.
print(os.getcwd())



In [None]:
#=================================================-
#### Slide 25: Loading text data  ####

# Load the corpus from the CSV file in the data directory.
# `os.path.join` builds the path with the separator of the running OS
# instead of gluing a hard-coded "/" onto `data_dir`.
NYT = pd.read_csv(os.path.join(data_dir, 'NYT_article_data.csv'))

# List the column names available in the dataframe.
print(NYT.columns)



In [None]:
#=================================================-
#### Slide 26: Look at the first few columns  ####

# Look at the first rows of the dataframe (`head()` returns rows, with every column).
print(NYT.head())



In [None]:
#=================================================-
#### Slide 27: Creating a list of snippets  ####

# Pull the `snippet` column out of the dataframe as its own series.
NYT_snippet = NYT["snippet"]
# Preview the first 20 snippets of that column.
print(NYT_snippet.head(20))



In [None]:
#=================================================-
#### Slide 29: Exercise 1  ####





In [None]:
#=================================================-
#### Slide 38: Tokenization: split each snippet into words  ####

# Split every snippet into tokens; the result is one list of tokens per
# snippet, collected in the same order as the snippets appear in the column.
NYT_tokenized = [word_tokenize(snippet_text) for snippet_text in NYT_snippet]



In [None]:
#=================================================-
#### Slide 39: Save the first tokenized snippet  ####

# Keep the tokens of the first snippet in `snippet_words`; the cells that
# follow use it as a small running example for each pre-processing step.
snippet_words = NYT_tokenized[0]
print(snippet_words)



In [None]:
#=================================================-
#### Slide 41: Convert characters to lower case  ####

# 1. Convert to lower case.
# Apply `str.lower` to every token and collect the results in a new list.
snippet_words = list(map(str.lower, snippet_words))
print(snippet_words[:10])



In [None]:
#=================================================-
#### Slide 43: Remove stop words  ####

# 2. Remove stopwords.
# Fetch the list of common English stop words and preview its first entries.
stop_words = stopwords.words('english')
print(stop_words[:10])

# Keep only the tokens that do not appear in the stop word list.
snippet_words = [token for token in snippet_words if token not in stop_words]
print(snippet_words[:10])



In [None]:
#=================================================-
#### Slide 45: Remove non-alphabetical characters  ####

# 3. Remove punctuation and any non-alphabetical characters.
# `str.isalpha` is true only for tokens made up entirely of letters.
snippet_words = list(filter(str.isalpha, snippet_words))
print(snippet_words[:10])



In [None]:
#=================================================-
#### Slide 48: Stem words  ####

# 4. Stem words.
# Create the stemmer once and reuse it for every word, rather than
# constructing a new `PorterStemmer` object per word.
stemmer = PorterStemmer()
snippet_words = [stemmer.stem(word) for word in snippet_words]
print(snippet_words[:10])



In [None]:
#=================================================-
#### Slide 49: Implementing pre-processing steps on a corpus  ####

# Build the helpers once, outside the loop:
# - a single stemmer object that is reused for every word,
# - a set of stop words, so each membership test is a hash lookup rather
#   than a scan through the whole stop word list.
stemmer = PorterStemmer()
stop_word_set = set(stop_words)

# Create a list for clean snippets.
NYT_clean = []
# Create a list of word counts for each clean snippet.
word_counts_per_snippet = []

# Process words in all snippets.
for snippet_tokens in NYT_tokenized:
    # 1. Convert to lower case.
    lowered = [token.lower() for token in snippet_tokens]

    # 2. Remove stopwords.
    without_stop_words = [word for word in lowered if word not in stop_word_set]

    # 3. Remove punctuation and any non-alphabetical characters.
    alphabetic = [word for word in without_stop_words if word.isalpha()]

    # 4. Stem words.
    stemmed = [stemmer.stem(word) for word in alphabetic]

    # Store the clean snippet and record its word count; both lists stay
    # aligned with `NYT_tokenized` (entry i describes snippet i).
    NYT_clean.append(stemmed)
    word_counts_per_snippet.append(len(stemmed))



In [None]:
#=================================================-
#### Slide 50: Inspect results  ####

# Show the first 10 words of every fifth clean snippet: 0, 5, 10, 15 and 20.
for snippet_index in range(0, 21, 5):
    print(NYT_clean[snippet_index][:10])



In [None]:
#=================================================-
#### Slide 51: Removing empty and very short snippets  ####

# Preview the total word counts of the first 10 snippets.
print(word_counts_per_snippet[:10])

# Draw a histogram of the word counts, using as many bins as there are
# distinct word counts in the list.
distinct_counts = set(word_counts_per_snippet)
plt.hist(word_counts_per_snippet, bins = len(distinct_counts))

# Label both axes of the histogram.
plt.ylabel('Frequency')
plt.xlabel('Number of words per snippet')



In [None]:
#=================================================-
#### Slide 52: Removing empty and very short snippets (cont'd)  ####

# Convert word counts list and snippets list to numpy arrays.
word_counts_array = np.array(word_counts_per_snippet)
# The clean snippets have different lengths, so the list is ragged.
# `dtype = object` stores each snippet (a list of words) as one array element;
# without it NumPy >= 1.24 raises a ValueError for ragged input.
NYT_array = np.array(NYT_clean, dtype = object)
print(len(NYT_array))
# Find indices of all snippets where there are greater than or equal to 5 words.
valid_snippets = np.where(word_counts_array >= 5)[0]
print(len(valid_snippets))



In [None]:
#=================================================-
#### Slide 53: Removing empty and very short snippets (cont'd)  ####

# Subset the NYT_array to keep only those where there are at least 5 words.
# NOTE: this overwrites `NYT_array`, while `valid_snippets` holds positions in
# the unfiltered array - re-create `NYT_array` before running this line again.
NYT_array = NYT_array[valid_snippets]
print(len(NYT_array))




In [None]:
#=================================================-
#### Slide 54: Removing empty and very short snippets (cont'd)  ####

# Convert the filtered array back to a plain Python list. This replaces the
# earlier `NYT_clean`, so from here on it holds only the snippets that were kept.
NYT_clean = NYT_array.tolist()
# Preview the first 10 clean snippets.
print(NYT_clean[:10])



In [None]:
#=================================================-
#### Slide 55: .join() function  ####

# Here is a simple example of the `.join()` function in action!
# The string the method is called on (', ') is placed between the list items,
# so this prints: 1, 2, 3, 4
numList = ['1', '2', '3', '4']
print(', '.join(numList))



In [None]:
#=================================================-
#### Slide 56: Save processed text to file using .join()  ####

# Join words in each snippet into a single character string,
# with the words separated by single spaces.
NYT_clean_list = [' '.join(snippet) for snippet in NYT_clean]
print(NYT_clean_list[:5])

# Save output file name to a variable. `os.path.join` builds the path with
# the separator of the running OS instead of a hard-coded "/".
out_filename = os.path.join(data_dir, "clean_NYT.txt")

# Create a function that takes a list of character strings
# and a name of an output file and writes it into a txt file.
def write_lines(lines, filename):
    """Write `lines` to the text file `filename`, one string per line.

    Parameters
    ----------
    lines : iterable of str
        The strings to write; they are joined with line breaks, and no
        line break is added after the last one.
    filename : str
        Path of the file to write. The file is opened in 'w' mode, so an
        existing file with that name is overwritten.

    Returns
    -------
    None
    """
    joined_lines = '\n'.join(lines) #<- join lines with line breaks
    # Open the file named by the `filename` argument (not a global variable);
    # the `with` block closes it even if writing fails.
    with open(filename, 'w') as file:
        file.write(joined_lines)    #<- write lines to file

# Write the clean snippets to the output file, one snippet per line.
write_lines(NYT_clean_list, out_filename)



In [None]:
#=================================================-
#### Slide 59: Exercise 2  ####





In [None]:
#=================================================-
#### Slide 64: Create a DTM  ####

# Initialize `CountVectorizer`.
vec = CountVectorizer()

# Transform the list of snippets into DTM.
X = vec.fit_transform(NYT_clean_list)
print(X.toarray()) #<- show output as a matrix

# Look at the first 10 terms of the vocabulary.
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()` when it exists and fall back on older versions.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
# Convert to plain strings so the printed list looks the same in both cases.
print([str(name) for name in feature_names[:10]])



In [None]:
#=================================================-
#### Slide 65: Create a DTM (cont'd)  ####

# Get the vocabulary terms to use as column names.
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()` when it exists and fall back on older versions.
if hasattr(vec, "get_feature_names_out"):
    dtm_columns = vec.get_feature_names_out()
else:
    dtm_columns = vec.get_feature_names()

# Convert the matrix into a pandas dataframe for easier manipulation:
# one row per snippet, one column per term.
DTM = pd.DataFrame(X.toarray(), columns = dtm_columns)
print(DTM.head())




In [None]:
#=================================================-
#### Slide 66: DTM to dictionary of total word counts  ####

# Create a convenience function that sorts and looks at first n-entries in the dictionary.
def HeadDict(dict_x, n):
    """Return the `n` entries of `dict_x` that have the largest values.

    Parameters
    ----------
    dict_x : dict
        Dictionary whose values can be compared with each other.
    n : int
        Number of entries to keep, taken from the start of the ranking.

    Returns
    -------
    dict
        A new dictionary with at most `n` key-value pairs, ordered from the
        largest value to the smallest. Entries with equal values keep the
        order they had in `dict_x`. The input dictionary is not modified.
    """
    # Rank the (key, value) pairs by value, largest first.
    ranked_pairs = sorted(dict_x.items(), key = lambda pair: pair[1], reverse = True)

    # Keep the first `n` pairs of the ranking and turn them back into a dictionary.
    return dict(ranked_pairs[:n])



In [None]:
#=================================================-
#### Slide 67: DTM to dictionary of total word counts (cont'd)  ####

# Sum frequencies of each word in all documents (one total per DTM column).
# The totals are computed once, stored, and printed explicitly: a bare
# expression in the middle of a cell would display nothing.
word_totals = DTM.sum(axis = 0)
print(word_totals.head())

# Save series as a dictionary that maps each word to its total count.
corpus_freq_dist = word_totals.to_dict()

# Glance at the frequencies: the 6 entries with the highest counts.
print(HeadDict(corpus_freq_dist, 6))




In [None]:
#=================================================-
#### Slide 69: Plot distribution of words in snippet corpus  ####

# Save as a FreqDist object native to nltk.
# NOTE: this rebinds `corpus_freq_dist` - a plain dictionary before this line -
# to the `FreqDist` built from it.
corpus_freq_dist = nltk.FreqDist(corpus_freq_dist)
# Plot distribution for the entire corpus on a wide (16 x 7) figure.
# NOTE(review): 80 is presumably the number of most frequent words to plot -
# confirm against the nltk `FreqDist.plot` documentation.
plt.figure(figsize = (16, 7))
corpus_freq_dist.plot(80)




In [None]:
#=================================================-
#### Slide 70: Visualizing word counts with word clouds  ####

# Merge all clean snippets into one long text, separated by single spaces.
corpus_text = ' '.join(NYT_clean_list)

# Configure the word cloud and generate it from the merged text in one step.
wordcloud = WordCloud(max_font_size = 40, background_color = "white").generate(corpus_text)

# Draw the cloud as an image with matplotlib, hide the axes and show the figure.
plt.figure()
plt.imshow(wordcloud, interpolation = "bilinear")
plt.axis("off")
plt.show()



In [None]:
#=================================================-
#### Slide 72: Save results as a pickle  ####

# Save each result as a pickle file. The file names are relative, so the
# files are created in the current working directory.
# Every file is opened in a `with` block so that it is flushed and closed
# right after the dump, instead of leaving the handle open.
objects_to_save = (
    (DTM, 'DTM.sav'),
    (X, 'DTM_matrix.sav'),
    (NYT_clean, 'NYT_clean.sav'),
    (NYT_clean_list, 'NYT_clean_list.sav'),
)
for obj, pickle_name in objects_to_save:
    with open(pickle_name, 'wb') as pickle_file:
        pickle.dump(obj, pickle_file)

