# Pipeline for NLP 

### Includes: 

#### * Text Preprocessing (NLTK, SpaCy, Textblob) 
#### * Vectorization (Count Vectorizer, TF-IDF Vectorizer) 
#### * Topic Modeling (LSA, LDA, NMF) 
#### * Dimensionality Reduction (SVD, PCA, tSNE) 
#### * Clustering (K-Means, Agglomerative, DBSCAN, Mean Shift, Spectral) 
#### * Visualizations in 3D & 2D 

## List of Imports

In [6]:
from __future__ import print_function

In [43]:
import csv
import email
import json
import re
import string
import sys
from collections import defaultdict
from itertools import cycle
from time import time

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import nltk
from nltk.stem import porter
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer 

import spacy
import en_core_web_sm
from spacy.en import STOP_WORDS
from textblob import TextBlob

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords
from scipy.cluster.hierarchy import dendrogram, linkage

%matplotlib inline
# plt.style.use('seaborn')
plt.style.use('seaborn-white')
# plt.style.use(['dark_background', 'presentation'])

In [3]:
# Report the interpreter version followed by the version of each core
# library, so the environment behind the notebook can be reproduced.
libraries = (
    ('Matplotlib', matplotlib),
    ('Numpy', np),
    ('Pandas', pd),
    ('NLTK', nltk),
)

print("Python Version:", sys.version, '\n')
for lib_name, lib_module in libraries:
    print('{0} Version: {1}'.format(lib_name, lib_module.__version__))

Python Version: 3.6.2 |Continuum Analytics, Inc.| (default, Jul 20 2017, 13:14:59) 
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)] 

Matplotlib Version: 2.0.2
Numpy Version: 1.12.1
Pandas Version: 0.20.3
NLTK Version: 3.2.4


## Data & MongoDB 

In [None]:
# Increase the field_size_limit
# Raise the csv module's per-field size cap as far as the platform allows.
# csv.field_size_limit raises OverflowError when the value does not fit the
# platform's native integer (sys.maxsize can be too large on some builds),
# so shrink the candidate until one is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 10

In [None]:
# CSV to Json
# Template cell: `dataframe_name`, `database_name`, `collection_name` and the
# `header` column names are placeholders to be replaced with real values.
header = ["col1", "col2", "col3", "col4", "col5", "col6", "col7"]

# newline='' is what the csv module expects for files it parses, and the
# with-block guarantees the handle is closed once the rows are consumed.
with open("../../Data/emails.csv", 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)

    # Pandas DataFrame to Json
    # NOTE: this rebinds `reader`, so the rows inserted below come from the
    # DataFrame and not from the CSV reader above; keep only the source
    # that applies.
    reader = json.loads(dataframe_name.T.to_json()).values()

    # Connect to MongoDB
    client = MongoClient()
    db = client.database_name

    # Drop Json to MongoDB
    db.collection_name.drop()

    for each in reader:
        # Keep only the columns listed in `header`.
        row = {field: each[field] for field in header}
        # insert_one replaces the legacy Collection.insert call.
        db.collection_name.insert_one(row)

In [None]:
# Import data from Mongo
# Open a client with default connection settings and select the
# (placeholder-named) database and collection.
client = MongoClient()
db = client["database_name"]
cursor = db["collection_name"].find({})

# Construct the DataFrame
# Materialize every document returned by the cursor, one row per document.
df = pd.DataFrame(list(cursor))
df.head()

## Email Info Extraction

In [20]:
def extract_text_from_email(email_):
    """Return the payload of the first ``text/plain`` part of a raw email.

    Input:
        email_ (str): full raw text of one email message
    Output:
        The payload of the first part, in walk order, whose content type
        is ``text/plain``; None when the message has no such part.
    """
    parsed = email.message_from_string(email_)
    plain_payloads = (part.get_payload()
                      for part in parsed.walk()
                      if part.get_content_type() == 'text/plain')
    return next(plain_payloads, None)

In [22]:
# Parse every raw message string in the 'message' column into a message object.
messages = [email.message_from_string(raw) for raw in df['message']]

# Sets keys that are commonly used by most messages
# (taken from the message at index 9, used here as the sample)
keys = messages[9].keys()
print(keys)

# Add keys and corresponding info to DataFrame
# One new column per header name, holding that header's value for each message.
for key in keys:
    df[key] = [msg[key] for msg in messages]
print(df.head(2))

## NLP - Preprocessing 

In [19]:
# NLTK 
def nltk_tokenizer_(raw_string):
    """Clean and tokenize one text with NLTK.

    Text preprocessing, in the order applied:
        1. Lowercase conversion
        2. Removal of the digits 0-9
        3. Tokenization on runs of word characters (drops punctuation)
        4. Lemmatization (WordNet) followed by stemming (Porter)
        5. Removal of stop words
        6. Removal of short tokens (3 characters or fewer)

    Input:
        raw_string (str): raw text of one document
    Output:
        list of str: cleaned tokens
    """
    stemmer = nltk.stem.porter.PorterStemmer()
    # The original bound this to `lemmizer` but used `lemmatizer` below,
    # which raised NameError on every call.
    lemmatizer = WordNetLemmatizer()

    tokenizer = RegexpTokenizer(r'\w+')
    stop = set(nltk.corpus.stopwords.words('english'))
    stop.update(['.', ',', '(', ')', "'", '"'])

    # lower all letters
    raw_string = raw_string.lower()
    # get rid of numbers
    raw_string = raw_string.translate(str.maketrans('', '', '1234567890'))
    # tokenize text with RegexpTokenizer to get rid of punctuation
    tokens = tokenizer.tokenize(raw_string)
    # lemmatize, then stem
    tokens = [stemmer.stem(lemmatizer.lemmatize(tok)) for tok in tokens]
    # Drop stop words and short tokens. Stop words are compared against
    # the stemmed form, as in the original ordering.
    tokens = [tok for tok in tokens if tok not in stop and len(tok) > 3]

    return tokens

In [21]:
# SpaCy 
def spacy_tokenizer_(raw_string):
    """Clean and tokenize one text with spaCy.

    Text preprocessing, in the order applied:
        1. Removal of runs of two or more hyphens and of the digits 0-9
        2. Tokenization with the ``en_core_web_sm`` pipeline
        3. Lemmatization and lowercasing (tokens whose lemma is "-PRON-"
           keep their lowercased surface form)
        4. Removal of stop words and punctuation
        5. Stemming (Porter)
        6. Removal of short tokens (3 characters or fewer)

    Input:
        raw_string (str): raw text of one document
    Output:
        list of str: cleaned tokens
    """
    # Load the spaCy model once and keep it on the function object; the
    # original reloaded it on every call, i.e. once per document when the
    # function is used as a vectorizer tokenizer.
    nlp = getattr(spacy_tokenizer_, '_nlp', None)
    if nlp is None:
        nlp = en_core_web_sm.load()
        spacy_tokenizer_._nlp = nlp

    # NOTE: this is a str, so `tok not in punctuations` below is a
    # substring test, not a per-character set lookup.
    punctuations = string.punctuation + '.doc'
    stemmer = nltk.stem.porter.PorterStemmer()

    raw_string = re.sub(r'\-\-+', '', raw_string)
    raw_string = raw_string.translate(str.maketrans('', '', '1234567890'))

    tokens = nlp(raw_string)
    tokens = [tok.lemma_.lower().strip()
              if tok.lemma_ != "-PRON-"
              else tok.lower_ for tok in tokens]
    tokens = [tok for tok in tokens
              if (tok not in stopwords and tok not in punctuations)]
    tokens = [stemmer.stem(tok) for tok in tokens]
    tokens = [tok for tok in tokens if len(tok) > 3]

    return tokens

In [20]:
# TextBlob 
def textblob_tokenizer_(raw_string):
    """Clean and tokenize one text with TextBlob.

    Text preprocessing, in the order applied:
        1. Removal of space-delimited all-uppercase words
        2. Lowercase conversion
        3. Removal of the digits 0-9
        4. Tokenization with ``TextBlob(...).words``
        5. Removal of stop words
        6. Lemmatization (WordNet) followed by stemming (Porter)
        7. Removal of short tokens (3 characters or fewer)

    Input:
        raw_string (str): raw text of one document
    Output:
        list of str: cleaned tokens
    """
    stemmer = nltk.stem.porter.PorterStemmer()
    # The original bound this to `lemmizer` but used `lemmatizer` below,
    # which raised NameError on every call.
    lemmatizer = WordNetLemmatizer()

    stop = set(nltk.corpus.stopwords.words('english'))
    stop.update(['.', ',', '(', ')', "'", '"'])

    # The uppercase filter must run before lowercasing: afterwards
    # [A-Z] can never match, and the old pattern ' [A-Z]* ' only matched
    # double spaces and glued the neighbouring words together. Replace
    # with a single space so adjacent words stay separated.
    # NOTE(review): assumes the intent was to drop all-caps words - confirm.
    raw_string = re.sub(' [A-Z]+ ', ' ', raw_string)
    raw_string = raw_string.lower()
    raw_string = raw_string.translate(str.maketrans('', '', '1234567890'))

    tokens = TextBlob(raw_string).words
    tokens = [w for w in tokens if w not in stop]
    tokens = [lemmatizer.lemmatize(i) for i in tokens]
    tokens = [stemmer.stem(i) for i in tokens]
    tokens = [i for i in tokens if len(i) > 3]

    return tokens

## NLP - Vectorization

#### Count Vectorizer & TF-IDF Vectorizer

In [None]:
# Count Vectorizer 
def count_vectorizer(tokenizer, 
                     min_n=1, 
                     max_n=2, 
                     max_features=1000, 
                     max_df=0.6,
                     content=None):
    """Vectorize a corpus with a count vectorizer and normalize each row.

    Input: 
        1. tokenizer: callable used to tokenize and cleanse each document
        2. min_n: lower boundary of n-values for 
                  different n-grams to be extracted
        3. max_n: upper boundary of n-values for 
                  different n-grams to be extracted
        4. max_features: max features returned with top term frequency 
        5. max_df: the max document frequency allowed for a single word
        6. content: iterable of raw documents to vectorize. When None,
                    the module-level variable `data` is used, which is
                    what this function read before the parameter existed.
    Output: 
        (norm_data, feature_names): the count matrix after sklearn's
        Normalizer (default settings), and the vocabulary terms from the
        fitted vectorizer.
    """
    # Backward compatibility: earlier callers relied on a global `data`.
    corpus = data if content is None else content

    vectorizer = CountVectorizer(tokenizer=tokenizer,
                                 ngram_range=(min_n, max_n),
                                 max_features=max_features,
                                 max_df=max_df)

    vect_data = vectorizer.fit_transform(corpus)
    norm_data = Normalizer().fit_transform(vect_data)
    feature_names = vectorizer.get_feature_names()

    return norm_data, feature_names

# TF-IDF Vectorizer     
def tfidf_vectorizer(tokenizer, 
                     min_n=1, 
                     max_n=2, 
                     max_features=1000, 
                     max_df=0.6,
                     content=None): 
    """Vectorize a corpus with a TF-IDF vectorizer and normalize each row.

    Input: 
        1. tokenizer: callable used to tokenize and cleanse each document
        2. min_n: lower boundary of n-values for 
                  different n-grams to be extracted
        3. max_n: upper boundary of n-values for 
                  different n-grams to be extracted
        4. max_features: max features returned with top term frequency 
        5. max_df: the max document frequency allowed for a single word
        6. content: iterable of raw documents to vectorize. When None,
                    the module-level variable `data` is used, which is
                    what this function read before the parameter existed.
    Output: 
        (norm_data, feature_names): the TF-IDF matrix after sklearn's
        Normalizer (default settings), and the vocabulary terms from the
        fitted vectorizer.
    """
    # Backward compatibility: earlier callers relied on a global `data`.
    corpus = data if content is None else content

    vectorizer = TfidfVectorizer(tokenizer=tokenizer,
                                 ngram_range=(min_n, max_n),
                                 max_features=max_features,
                                 # was hard-coded to 0.6, silently ignoring
                                 # the max_df argument
                                 max_df=max_df)

    vect_data = vectorizer.fit_transform(corpus)
    norm_data = Normalizer().fit_transform(vect_data)
    feature_names = vectorizer.get_feature_names()

    return norm_data, feature_names

#### Vectorization + DataFrame Construction

In [27]:
# Count Vectorizer 
def count_vectorizer_df(content, 
                        tokenizer, 
                        min_n, 
                        max_n, 
                        max_features=1000, 
                        max_df=0.6):  
    """Vectorize `content` with a count vectorizer into a DataFrame.

    Input:
        content: iterable of raw documents
        tokenizer: callable used to tokenize each document
        min_n, max_n: bounds of the n-gram range
        max_features: cap on vocabulary size
        max_df: max document frequency allowed for a term
    Output:
        pandas DataFrame with one row per document and one column per
        vocabulary term, holding the term counts.
    """
    vectorizer_options = dict(tokenizer=tokenizer,
                              ngram_range=(min_n, max_n),
                              max_features=max_features,
                              max_df=max_df)
    bow_model = CountVectorizer(**vectorizer_options)
    term_matrix = bow_model.fit_transform(content)
    vocabulary = bow_model.get_feature_names()
    return pd.DataFrame(term_matrix.toarray(), columns=vocabulary)

# TF-IDF Vectorizer
def tfidf_vectorizer_df(content, 
                        tokenizer, 
                        min_n, 
                        max_n, 
                        max_features=1000, 
                        max_df=0.6):     
    """Vectorize `content` with a TF-IDF vectorizer into a DataFrame.

    Input:
        content: iterable of raw documents
        tokenizer: callable used to tokenize each document
        min_n, max_n: bounds of the n-gram range
        max_features: cap on vocabulary size
        max_df: max document frequency allowed for a term
    Output:
        pandas DataFrame with one row per document and one column per
        vocabulary term, holding the TF-IDF weights.
    """
    # The docstring used to be indented deeper than the body, which made
    # this definition an IndentationError.
    vectorizer = TfidfVectorizer(tokenizer=tokenizer,
                                 ngram_range=(min_n, max_n),
                                 max_features=max_features,
                                 max_df=max_df)
    vec_text = vectorizer.fit_transform(content)
    feature_names = vectorizer.get_feature_names()
    df = pd.DataFrame(vec_text.toarray(), columns=feature_names)
    return df

## Topic Modeling 

In [62]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the top terms of every topic in a fitted topic model.

    Input:
        model: fitted model exposing `components_` (one row per topic)
        feature_names: vocabulary terms, indexed like the columns of
                       `components_`
        no_top_words: how many highest-weighted terms to print per topic
        topic_names: optional per-topic labels; a topic with a falsy
                     label is printed with its index instead
    Output:
        None; the topics are written to stdout.
    """
    for topic_idx, weights in enumerate(model.components_):
        has_label = bool(topic_names) and bool(topic_names[topic_idx])
        if has_label:
            print("\nTopic: '", topic_names[topic_idx], "'")
        else:
            print("\nTopic ", topic_idx)
        # argsort is ascending, so walk it backwards for the top terms
        top_terms = [feature_names[j]
                     for j in weights.argsort()[:-no_top_words - 1:-1]]
        print(", ".join(top_terms))
        
def display_topics_lda(model, feature_names, no_top_words, d):
    """Print each topic's index, its score from `d`, and its top terms.

    Input:
        model: fitted model exposing `components_` (one row per topic)
        feature_names: vocabulary terms, indexed like the columns of
                       `components_`
        no_top_words: how many highest-weighted terms to print per topic
        d: indexable of per-topic scores, looked up by topic index
    Output:
        None; the topics are written to stdout.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic ", topic_idx, "Score", d[topic_idx])
        top_terms = [feature_names[j]
                     for j in weights.argsort()[:-no_top_words - 1:-1]]
        print(" ".join(top_terms))

#### LDA (Latent Dirichlet Allocation) 

In [60]:
def lda_algo(vectorized_data, 
             n_comp=10, 
             n_iter=10, 
             random_state=7777, 
             learning_method='online',
             n_jobs=-1):
    """Fit an LDA topic model and return it with the transformed data.

    Input: 
        1. vectorized_data: document-term matrix to fit on
        2. n_comp: number of topics to extract from data
        3. n_iter: maximum number of iterations (passed as max_iter)
        4. random_state: seed passed to the model
        5. learning_method: passed through to the model
        6. n_jobs: passed through to the model
    Output: 
        1. the fitted LDA model 
        2. the transformed data returned by fit_transform
        3. the first row of the transformed data
    """
    lda = LatentDirichletAllocation(n_components=n_comp,
                                    max_iter=n_iter,
                                    random_state=random_state,
                                    learning_method=learning_method,
                                    # was accepted but never forwarded
                                    n_jobs=n_jobs)

    data = lda.fit_transform(vectorized_data)
    scor = data[0]

    return lda, data, scor

#### NMF (Non-Negative Matrix Factorization) 

In [6]:
def nmf_algo(vectorized_data, n_comp=10, random_state=7777,n_jobs=-1):
    """Fit an NMF topic model and return it with the transformed data.

    Input: 
        1. vectorized_data: document-term matrix to fit on
        2. n_comp: number of topics to extract from data
        3. random_state: seed passed to the model
        4. n_jobs: unused; kept so the signature matches the other
                   topic-model helpers
    Output: 
        1. the fitted NMF model 
        2. the transformed data returned by fit_transform
        3. the first row of the transformed data
    """
    # random_state was accepted but never forwarded to the model.
    nmf = NMF(n_components=n_comp, random_state=random_state)
    data = nmf.fit_transform(vectorized_data)
    scor = data[0]

    return nmf, data, scor

#### LSA (Latent semantic analysis)

In [7]:
def lsa_algo(vectorized_data, n_comp=10, random_state=7777,n_jobs=-1):
    """Fit an LSA (truncated SVD) model and return it with the
    transformed data.

    Input: 
        1. vectorized_data: document-term matrix to fit on
        2. n_comp: number of components (topics) to extract from data
        3. random_state: seed passed to the model
        4. n_jobs: unused; kept so the signature matches the other
                   topic-model helpers
    Output: 
        1. the fitted TruncatedSVD model 
        2. the transformed data returned by fit_transform
        3. the first row of the transformed data
    """
    # random_state was accepted but never forwarded to the model.
    lsa = TruncatedSVD(n_components=n_comp, random_state=random_state)
    data = lsa.fit_transform(vectorized_data)
    scor = data[0]

    return lsa, data, scor

In [8]:
# display_topics(lsa_tfidf,count_vectorizer.get_feature_names(),10)

## Dimensionality Reduction

#### PCA 
* Improve clustering
* Improve classification (alternative to feature selection)
* Visualize high dimensional data in 2D or 3D 
* Data compression with little loss 

In [50]:
def plot_clusters_3D_pca(data, n_comp=3, random_state=7777):
    """Reduce `data` with PCA and scatter-plot the first three components.

    Input:
        data: feature matrix to reduce
        n_comp: number of principal components to keep; the plot indexes
                components 0-2, so at least 3 are needed
        random_state: seed passed to PCA
    Output:
        (pca, reduced_data): the fitted PCA model and the projected data.
    """
    # random_state was accepted but never forwarded to the model.
    pca = PCA(n_components=n_comp, random_state=random_state)
    reduced_data = pca.fit_transform(data)

    fig = plt.figure(1, figsize=(8, 6))
    # Create the 3D axes through the figure so they are attached to it;
    # the '3d' projection relies on mpl_toolkits.mplot3d being imported.
    ax = fig.add_subplot(111, projection='3d')
    ax.view_init(elev=-160, azim=130)
    # No per-point colour values are supplied, so no cmap is passed.
    ax.scatter(reduced_data[:, 0],
               reduced_data[:, 1],
               reduced_data[:, 2])
    ax.set_title("Enron Emails in 3D")
    # The axes are principal components; the previous sepal/petal labels
    # were left over from an iris example.
    ax.set_xlabel("component 1")
    ax.xaxis.set_ticklabels([])
    ax.set_ylabel("component 2")
    ax.yaxis.set_ticklabels([])
    ax.set_zlabel("component 3")
    ax.zaxis.set_ticklabels([])

    plt.show()

    return pca, reduced_data

def plot_3D(X):
    """Scatter-plot 3-dimensional points on a new 3D figure.

    Input:
        X: iterable of (x, y, z) triples
    Output:
        None. Also switches the active matplotlib style to
        "seaborn-poster".
    """
    xs, ys, zs = zip(*X)
    plt.style.use("seaborn-poster")
    axes = plt.figure().add_subplot(111, projection='3d')
    axes.scatter(xs, ys, zs)

#### SVD

In [None]:
def plot_clusters_3D_svd(data, n_comp=3, random_state=7777):
    """Reduce `data` with truncated SVD and scatter-plot the first three
    components.

    Input:
        data: feature matrix to reduce
        n_comp: number of components to keep; the plot indexes
                components 0-2, so at least 3 are needed
        random_state: seed passed to TruncatedSVD
    Output:
        (svd, reduced_data): the fitted model and the projected data.
    """
    svd = TruncatedSVD(n_components=n_comp, random_state=random_state)
    reduced_data = svd.fit_transform(data)

    fig = plt.figure(1, figsize=(8, 6))
    # Create the 3D axes through the figure so they are attached to it;
    # the '3d' projection relies on mpl_toolkits.mplot3d being imported.
    ax = fig.add_subplot(111, projection='3d')
    ax.view_init(elev=-160, azim=130)
    # No per-point colour values are supplied, so no cmap is passed.
    ax.scatter(reduced_data[:, 0],
               reduced_data[:, 1],
               reduced_data[:, 2])

    ax.set_title("Enron Emails in 3D")
    ax.set_xlabel("topic1")
    ax.xaxis.set_ticklabels([])
    ax.set_ylabel("topic2")
    ax.yaxis.set_ticklabels([])
    ax.set_zlabel("topic3")
    ax.zaxis.set_ticklabels([])

    plt.show()

    return svd, reduced_data

In [None]:
def plot_clusters_2D_svd(data, n_comp=2, random_state=7777):
    """Reduce `data` with truncated SVD and scatter-plot the first two
    components.

    Input:
        data: feature matrix to reduce
        n_comp: number of components to keep
        random_state: seed passed to TruncatedSVD
    Output:
        None; the plot is shown.
    """
    reducer = TruncatedSVD(n_components=n_comp, random_state=random_state)
    projected = reducer.fit_transform(data)

    plt.figure(1, figsize=(8, 6))
    first_component = projected[:, 0]
    second_component = projected[:, 1]
    plt.scatter(first_component, second_component)

    plt.show()

#### t-SNE
* For Visualization of High Dim Data Only

In [51]:
def plot_high_dim_data_tsne(data, num_clust=10, 
                            perplexity=30, 
                            num_points=100, 
                            n_comp=2, 
                            learning_rate=200, 
                            random_state=7777):
    """Embed `data` with t-SNE and scatter-plot the first two dimensions.

    Points are coloured in consecutive runs: the first `num_points` rows
    get the first colour, the next `num_points` rows the second, and so
    on, up to `num_clust` colours.
    NOTE(review): this presumes the rows of `data` are already ordered by
    cluster - confirm against the caller.

    Input:
        data: feature matrix to embed
        num_clust: number of colour groups
        perplexity, learning_rate, random_state: passed through to TSNE
        num_points: number of consecutive rows per colour group
        n_comp: number of embedding dimensions
    Output:
        None; the scatter plot is drawn on the current figure.
    """
    tsne = TSNE(n_components=n_comp, 
                perplexity=perplexity, 
                learning_rate=learning_rate, 
                random_state=random_state, verbose=2)

    low_data = np.asarray(tsne.fit_transform(data))

    # Build exactly one colour per embedded row. The original always built
    # num_clust * num_points colours, which made scatter fail whenever the
    # data had a different number of rows. Rows beyond the last group
    # reuse the last group's colour.
    rows_per_group = max(int(num_points), 1)
    last_group = max(int(num_clust) - 1, 0)
    group_ids = np.minimum(np.arange(low_data.shape[0]) // rows_per_group,
                           last_group)
    colorize = plt.cm.rainbow(group_ids * 20)

    plt.scatter(low_data[:, 0], low_data[:, 1], c=colorize, s=40)

## Standardization 

In [26]:
# Standardize `data` with StandardScaler, fitting and transforming on the
# same data.
standardized_data = StandardScaler().fit(data).transform(data)

## Clustering 

#### K-Means Clustering 
https://github.com/thisismetis/chi17_ds4/blob/master/class_lectures/week07-fletcher1/01-unsup_kmeans/simple_kmeans_demo.ipynb

https://github.com/dziganto/Data_Science_Fundamentals/blob/master/notebooks/Machine_Learning/Kmeans_Clustering.ipynb

In [28]:
def k_means_clustering(X, min_k, max_k, random_state=7777):
    """Run K-Means for every k in range(min_k, max_k) and collect
    evaluation metrics.

    Input:
        X: data to cluster
        min_k: first number of clusters to try (inclusive)
        max_k: end of the range (exclusive)
        random_state: seed passed to every KMeans model
    Output:
        (clusters, Sil_coefs, SSEs):
        clusters  - labels from the last k tried, or None when the range
                    is empty
        Sil_coefs - silhouette score (euclidean) for each k
        SSEs      - KMeans inertia for each k
    """
    SSEs = []
    Sil_coefs = []
    # Stays None for an empty range instead of raising UnboundLocalError.
    clusters = None
    for k in range(min_k, max_k):
        km = KMeans(n_clusters=k, random_state=random_state)
        clusters = km.fit_predict(X)
        Sil_coefs.append(silhouette_score(X, clusters, metric='euclidean'))
        SSEs.append(km.inertia_)

    return clusters, Sil_coefs, SSEs

def plot_k_means(X, n_clust=5): 
    """Cluster `X` with K-Means and scatter-plot its first two columns,
    coloured by cluster label.

    Input:
        X: 2-D array-like of points; columns 0 and 1 are plotted
        n_clust: number of clusters
    Output:
        None; the plot is drawn on a new figure.
    """
    # Accept any 2-D array-like. The unused `x, y = zip(*X)` was removed:
    # it only served to reject inputs with more than two columns.
    X = np.asarray(X)
    km = KMeans(n_clusters=n_clust)
    clusters = km.fit_predict(X)
    plt.figure(dpi=200)
    plt.scatter(X[:, 0], X[:, 1], c=plt.cm.rainbow(clusters*20), s=14)

def plot_kmeans_evaluation(Sil_coefs, SSEs, min_k, max_k):
    """Plot silhouette coefficients and SSEs side by side against k.

    Input:
        Sil_coefs: silhouette score per k
        SSEs: sum of squared errors per k
        min_k, max_k: the k values on the x axis are range(min_k, max_k)
    Output:
        None; draws a two-panel figure.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5),
                             sharex=True, dpi=200)
    k_clusters = range(min_k, max_k)
    panels = (
        (axes[0], Sil_coefs, 'silhouette coefficient'),
        (axes[1], SSEs, 'SSE'),
    )
    for axis, series, y_label in panels:
        axis.plot(k_clusters, series)
        axis.set_xlabel('number of clusters')
        axis.set_ylabel(y_label)
    
def plot_silhoutte(X, min_k, max_k, random_state=7777):
    """Draw one silhouette plot per k in range(min_k, max_k).

    For each k a KMeans model is fitted, the average silhouette score is
    printed, and the sorted per-sample silhouette values of every cluster
    are drawn as stacked horizontal bands.

    Input:
        X: data to cluster
        min_k: first number of clusters to try (inclusive)
        max_k: end of the range (exclusive)
        random_state: seed passed to every KMeans model
    Output:
        None; one figure is created per k.
    """
    for k in range(min_k, max_k):
        plt.figure(dpi=150, figsize=(8,6))
        ax1 = plt.gca()
        km = KMeans(n_clusters=k, random_state=random_state)
        km.fit(X)
        labels = km.labels_
        silhouette_avg = silhouette_score(X, labels)
        print("For n_clusters =", k,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, labels)

        y_lower = 10
        for i in range(k):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[labels == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            # `spectral` is not available in current Matplotlib;
            # `nipy_spectral` is the colormap's current name.
            color = plt.cm.nipy_spectral(float(i) / k)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster 
            # numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Leave a 10-row gap before the next cluster's band
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

#### DBSCAN Clustering 
http://localhost:8888/notebooks/metisgh/chi17_ds4/class_lectures/week07-fletcher1/05-more_clustering/other_clustering_sklearn.ipynb

In [37]:
def dbscan_clustering(X, eps, minpoints):
    """Cluster `X` with DBSCAN and scatter-plot the clusters.

    Core samples are drawn with a black edge, non-core samples without,
    and noise points (label -1) in black.

    Input: 
        X: 2-D array of points (standardized data for best results);
           columns 0 and 1 are plotted
        eps: passed to DBSCAN as `eps`
        minpoints: passed to DBSCAN as `min_samples`
    Output:
        (dbscan, unique_labels): the fitted model and the set of labels
        it produced.
    """
    dbscan = DBSCAN(eps=eps, min_samples=minpoints).fit(X)
    labels = dbscan.labels_
    unique_labels = set(labels)

    # These two lines used to read from `db`, which in this notebook is
    # the MongoDB handle, not the fitted model.
    core_samples_mask = np.zeros_like(labels, dtype=bool)
    core_samples_mask[dbscan.core_sample_indices_] = True
    # The noise label (-1) does not count as a cluster.
    n_clusters_ = len(unique_labels) - (1 if -1 in labels else 0)
    colors = plt.cm.rainbow(np.linspace(0, 1, len(unique_labels)))
    plt.figure(dpi=200)
    show_core = True
    show_non_core = True

    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'k'

        class_member_mask = (labels == k)

        if show_core:
            xy = X[class_member_mask & core_samples_mask]
            x, y = xy[:,0], xy[:,1]
            plt.scatter(x, y, c=col, edgecolors='k',  s=20, linewidths=1.1)

        if show_non_core:
            xy = X[class_member_mask & ~core_samples_mask]
            x, y = xy[:,0], xy[:,1]
            plt.scatter(x, y, c=col, s=20, linewidths=1.1)

    plt.title('Estimated number of clusters: %d' % n_clusters_)

    return dbscan, unique_labels

# dbscan_model, dbscan_ulabels = dbscan_clustering(standardized_data, 
#                                                        eps=0.1, 
#                                                        minpoints=3)

#### Mean Shift Clustering 

In [29]:
def mean_shift_clustering(X, quantile=0.2, n_samples=500):
    """Cluster `X` with Mean Shift and plot the clusters with their
    centers.

    Input:
        X: 2-D array of points; columns 0 and 1 are plotted
        quantile: passed to estimate_bandwidth
        n_samples: passed to estimate_bandwidth
    Output:
        None; prints the number of clusters and draws the plot.
    """
    # automatically detects bandwidth 
    bandwidth = estimate_bandwidth(X, quantile=quantile, n_samples=n_samples)

    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_

    n_clusters_ = len(np.unique(labels))

    print("number of estimated clusters : %d" % n_clusters_)

    # Draw on one fresh figure. The previous figure(1)/clf() pair wiped
    # whatever was on figure 1 and left an extra blank figure behind.
    plt.figure(dpi=200)
    colors = cycle('byrcmykbgrcmykbgrcmykbgrcmyk')

    for k, col in zip(range(n_clusters_), colors):
        my_members = labels == k
        cluster_center = cluster_centers[k]
        plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'X', markerfacecolor='k',
                 markeredgecolor='k', markersize=14)

    plt.title('Estimated number of clusters: %d' % n_clusters_)

#### Hierarchical Clustering (Agglomerative)

In [39]:
def plot_agglomerative_clustering(X, clusters=3):
    """Fit agglomerative clustering with 'ward', 'average' and 'complete'
    linkage, timing and plotting each result.

    Input:
        X: iterable of (x, y) points
        clusters: number of clusters for every model
    Output:
        (agglo, clusters): both elements are the fitted estimator for the
        'ward' linkage. That is what this function has always returned
        (`fit` returns the estimator itself), kept for backward
        compatibility.
    """
    x, y = zip(*X)
    ward_model = None

    # The loop variable is not called `linkage`, to avoid shadowing
    # scipy's linkage function imported by this notebook.
    for linkage_type in ('ward', 'average', 'complete'):
        # `clusters` is no longer overwritten inside the loop, so every
        # model gets the requested number of clusters.
        agglo = AgglomerativeClustering(linkage=linkage_type,
                                        n_clusters=clusters)
        t0 = time()
        agglo.fit(X)
        print("%s : %.2fs" % (linkage_type, time() - t0))

        plt.figure(dpi=200)
        plt.scatter(x, y, c=plt.cm.rainbow(agglo.labels_*20), s=14)
        plt.title("Linkage Type: %s" % linkage_type)

        if ward_model is None:
            ward_model = agglo

    # The return used to sit inside the loop, so only 'ward' was ever
    # fitted and plotted.
    return ward_model, ward_model

Dendrograms with Agglomerative Clustering in SciPy

In [None]:
# Dendrograms with Agglomerative Clustering in SciPy
# NOTE(review): `centers` is not defined anywhere in the visible notebook;
# it has to be set before this cell runs.
Xdata, _ = make_blobs(n_samples=50, centers=centers, cluster_std=0.6)
Z = linkage(Xdata, 'ward')
plt.figure(dpi=200)
# truncate_mode='mlab' is not among the modes SciPy documents (None,
# 'lastp', 'level'), so the untruncated tree is requested explicitly by
# omitting it.
dendrogram(Z);

#### Spectral Clustering

In [38]:
def spectral_clustering(X, clusters=3):
    """Cluster `X` with spectral clustering and scatter-plot its first
    two columns, coloured by cluster label.

    Input:
        X: 2-D array of points; columns 0 and 1 are plotted
        clusters: number of clusters
    Output:
        (clusters, sc): the predicted labels and the fitted model.
    """
    sc = SpectralClustering(n_clusters=clusters)
    labels = sc.fit_predict(X)

    plt.figure(dpi=200)
    # The colours used to come from an undefined name `ypred`, which
    # raised NameError; the predicted labels are what was meant.
    plt.scatter(X[:, 0], X[:, 1], c=plt.cm.rainbow(labels*20), s=14)

    return labels, sc

## All Together 

In [1]:
class EmailAutoClustering():
    """ Take in a corpus of emails (accepts pandas series), 
    preprocesses (cleans & tokenizes) and vectorizes them, and 
    conducts topic modeling to cluster each email into its 
    corresponding topic.

    Constructing an instance runs the whole pipeline (tokenize, count
    vectorize, normalize, NMF) and prints the top terms of each topic.
    Call create_topic_space() afterwards to get the result table.

    OUT: DataFrame containing each email and its topic. 
    """

    def __init__(self, docs):
        self.docs = docs
        self.tokenizer = self.nltk_tokenizer
        self.count_vectorizer(self.docs, self.tokenizer)
        self.nmf_algo(self.norm_data, self.feature_names)

    def nltk_tokenizer(self, doc):
        """ Takes in one document, cleans, and tokenizes:
        1. Lowercase
        2. Remove the digits 0-9
        3. Tokenize on runs of word characters (drops punctuation)
        4. Lemmatize
        5. Remove stop words and separator lines made of underscores
        6. Remove short tokens (3 characters or fewer)

        OUT: cleaned text (tokens).
        """
        lemmatizer = WordNetLemmatizer()
        tokenizer = RegexpTokenizer(r'\w+')
        stop = nltk.corpus.stopwords.words('english')
        stop += ['.',',','(', ')',"'",'"']
        # Underscore runs count as word characters for the tokenizer, so
        # these separator lines are filtered out explicitly.
        stop += ['_____________________________________________',
                 '__________________________________________________']
        stop += ['_________________________________________________________________']
        stop = set(stop) 

        doc = doc.lower()
        doc = doc.translate(str.maketrans('','','1234567890')) 
        tokens = tokenizer.tokenize(doc)
        tokens = [lemmatizer.lemmatize(i) for i in tokens]
        tokens = [i for i in tokens if i not in stop and len(i) > 3]

        return tokens 

    def count_vectorizer(self, docs, tokenizer, min_n=1, max_n=2, max_features=5000, max_df=0.6):
        """ Vectorizes `docs` with a count vectorizer using `tokenizer`
        and normalizes the result.

        Stores on the instance: `vectorizer` (fitted), `vect_data`
        (raw counts), `norm_data` (after sklearn's Normalizer) and
        `feature_names` (vocabulary terms). Returns None.
        """
        self.vectorizer = CountVectorizer(tokenizer=tokenizer,
                                          ngram_range=(min_n, max_n),
                                          max_features=max_features,
                                          max_df=max_df)

        # Use the argument rather than silently falling back to self.docs.
        self.vect_data = self.vectorizer.fit_transform(docs)
        self.norm_data = Normalizer().fit_transform(self.vect_data)
        self.feature_names = self.vectorizer.get_feature_names()

    def nmf_algo(self, norm_data, feature_names, n_comp=4, random_state=7777, no_top_words=20, topic_names=None):
        """ Fits an NMF model on `norm_data` and prints the
        `no_top_words` highest-weighted terms of every topic.

        Stores on the instance: `nmf` (fitted model) and `nmf_data`
        (document-topic weights). Returns None.
        """
        # random_state was accepted but never forwarded to the model.
        self.nmf = NMF(n_components=n_comp, random_state=random_state)

        self.nmf_data = self.nmf.fit_transform(norm_data)

        for ix, topic in enumerate(self.nmf.components_):
            if not topic_names or not topic_names[ix]:
                print("\nTopic ", ix)
            else:
                print("\nTopic: '",topic_names[ix],"'")
            print(", ".join([feature_names[i]
                            for i in topic.argsort()[:-no_top_words - 1:-1]]))

    def create_topic_space(self):
        """ Returns a pandas dataframe with emails as rows.

        Columns are 'Email_Bucket_1' ... 'Email_Bucket_N' (N = number of
        topics in `nmf_data`), holding 1 in the bucket of the email's
        highest-weighted topic and 0 elsewhere, followed by 'Email' with
        the original text. The frame is also stored as `self.df_topic`.
        """
        topic_weights = pd.DataFrame(self.nmf_data)
        n_topics = topic_weights.shape[1]

        # Topic with the largest weight for each email.
        dominant_topic = topic_weights.idxmax(axis=1)

        # One indicator column per topic. Reindexing keeps a column for
        # topics that no email was assigned to, which the fixed list of
        # four column names could not handle.
        buckets = (pd.get_dummies(dominant_topic)
                   .reindex(columns=range(n_topics), fill_value=0)
                   .astype(int))
        buckets.columns = ['Email_Bucket_{}'.format(i + 1)
                           for i in range(n_topics)]

        # Align the emails positionally with the topic rows; concatenating
        # on the original index misaligns when `docs` is not 0..n-1.
        emails = pd.Series(self.docs).reset_index(drop=True).rename('Email')

        self.df_topic = pd.concat([buckets, emails], axis=1)

        return self.df_topic
    
#         email_topic = defaultdict(list)

#         for topic, email in zip(df_topic['topics'][66:88],df_topic['Email'][66:88]):
#             if topic==0:
#                 email_topic['Corporation_Related'].append(email)
#             elif topic == 1: 
#                 email_topic['Meeting_Call_Appointment'].append(email)
#             elif topic == 2: 
#                 email_topic['IT_Related'].append(email)
#             else: 
#                 email_topic['Industry_Business_Market'].append(email)

In [59]:
def nlp_stuff(raw_string, tokenizer_, vectorizer_, topic_modeling_, 
              min_n=1, max_n=2, max_features=5000, max_df=0.6,
              dim_red_=None, clustering_=None):
    """Placeholder for a single entry point that preprocesses and
    vectorizes raw text, then applies an unsupervised learning algorithm.

    No stage is implemented yet: every branch is a stub and the function
    returns None. The option names below are the ones the stubs check.

    Input:
        raw_string: raw text to process
        tokenizer_: 'nltk' or 'spacy'
        vectorizer_: 'countv' or 'tfidfv'
        topic_modeling_: 'lsa', 'lda' or 'nmf' (the earlier spellings
                         'isa' and 'ida' are still accepted)
        min_n, max_n, max_features, max_df: vectorizer settings
        dim_red_: 'pca', or None
        clustering_: 'kmeans', 'spectral', 'agglo', 'meanshift',
                     'dbscan', or None
    Output:
        None
    """
    # dim_red_ and clustering_ used to be read without being defined,
    # so every call raised NameError; they are now keyword parameters.

    if tokenizer_ == 'nltk':
        pass  # TODO: tokenize as nltk_tokenizer_ does

    if tokenizer_ == 'spacy':
        pass  # TODO: tokenize as spacy_tokenizer_ does

    if vectorizer_ == 'countv':
        pass  # TODO: CountVectorizer with the n-gram/feature settings

    if vectorizer_ == 'tfidfv':
        pass  # TODO: TfidfVectorizer with the n-gram/feature settings

    if topic_modeling_ in ('lsa', 'isa'):
        pass 
    if topic_modeling_ in ('lda', 'ida'):
        pass 
    if topic_modeling_ == 'nmf':
        pass 
    if dim_red_ == 'pca':
        pass 
    if clustering_ == 'kmeans':
        pass 
    if clustering_ == 'spectral':
        pass 
    if clustering_ == 'agglo':
        pass 
    if clustering_ == 'meanshift':
        pass 
    if clustering_ == 'dbscan':
        pass 