# Homework 4

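## Overview

This notebook builds word embeddings from the Brown corpus: it counts context-word co-occurrences in a window of two words on each side, converts the counts to positive pointwise mutual information, reduces the result to 100 dimensions with PCA, and then explores the embedding with k-means clustering and cosine nearest neighbors.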


Problem 1:

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

# Load the Brown corpus tokens through NLTK's corpus reader
from nltk.corpus import brown

brown_words = brown.words()
# Report the corpus size as a quick sanity check
print(f"{len(brown_words)} total words")

Problem 2:

In [None]:
# Pre-process: remove punctuation characters from every token, discard tokens
# that end up empty, and lower-case the rest.
import string

# Translation table that deletes every character listed in string.punctuation
_strip_punct = str.maketrans('', '', string.punctuation)
brown_words = [
    stripped.lower()
    for stripped in (token.translate(_strip_punct) for token in brown_words)
    if stripped != ''
]

In [None]:
from nltk import word_tokenize
from nltk.corpus import stopwords

# English stop words, held in a set so each membership test is constant-time
stop = set(stopwords.words("english"))

# NOTE(review): brown_words is produced outside this cell. If punctuation has
# already been stripped from it, contractions (e.g. "dont") will not match
# apostrophe-bearing stop words such as "don't" -- confirm this is intended.
brown_words2 = list(filter(lambda word: word not in stop, brown_words))

In [None]:
from collections import Counter

# Frequency of every token that survived stop-word removal
brown_counts = Counter(brown_words2)
# (word, count) pairs ordered from most to least frequent;
# words with equal counts keep their first-seen order
sorted_brown = brown_counts.most_common()

In [None]:
# The 5000 most frequent words form the vocabulary; the 1000 most frequent of
# those double as the context words.
vocabulary = [word for word, _count in sorted_brown[:5000]]
context_w = vocabulary[:1000]

Problem 3:

In [None]:
def build_context_matrix(tokens, vocabulary, context_w, window=2):
    """Count how often each context word occurs near each vocabulary word.

    Parameters
    ----------
    tokens : list of str
        The corpus as a flat token list (must support slicing and ``+``).
    vocabulary : list of str
        Words that get a row in the matrix; entries are assumed to be unique.
    context_w : list of str
        Words that get a column in the matrix; entries are assumed to be unique.
    window : int, default 2
        Number of tokens inspected on each side of a vocabulary word.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(vocabulary), len(context_w))`` where entry
        ``[r, c]`` is the number of times ``context_w[c]`` appeared within
        ``window`` tokens of an occurrence of ``vocabulary[r]``.
    """
    # Dict lookups replace the repeated linear scans over the word lists.
    row_of = {word: row for row, word in enumerate(vocabulary)}
    col_of = {word: col for col, word in enumerate(context_w)}
    counts = np.zeros((len(vocabulary), len(context_w)))

    # One pass over the corpus instead of one full corpus scan per vocabulary word.
    for pos, token in enumerate(tokens):
        row = row_of.get(token)
        if row is None:
            continue
        # max() stops the left slice from wrapping around at the start of the
        # corpus; the right slice is truncated automatically at the end.
        neighbours = tokens[max(pos - window, 0):pos] + tokens[pos + 1:pos + window + 1]
        for neighbour in neighbours:
            col = col_of.get(neighbour)
            if col is not None:
                counts[row, col] += 1
    return counts


context_matrix = build_context_matrix(brown_words2, vocabulary, context_w)

In [None]:
# Row-normalise the counts: entry (w, c) becomes count(w, c) divided by the
# total count in row w.
row_totals = context_matrix.sum(axis=1, keepdims=True)
# A row whose total is zero would give 0/0 = NaN and poison every later step;
# such rows are left at 0 instead.
pr_context_words = np.divide(
    context_matrix,
    row_totals,
    out=np.zeros_like(context_matrix, dtype=float),
    where=row_totals != 0,
)

In [None]:
# Column totals of the count matrix, one per context word
c = np.sum(context_matrix, axis=0)
# Grand total over all context words
total_context = np.sum(c)
# Share of the grand total contributed by each context word
pr_context = c / total_context

In [None]:
# Labelled view of the row-normalised matrix: rows are vocabulary words,
# columns are context words
pr_cw_df = pd.DataFrame(
    data=pr_context_words,
    index=vocabulary,
    columns=context_w,
)

Problem 4:

In [None]:
# Positive pointwise mutual information: log of the ratio between each row's
# context distribution and the overall context distribution, clipped below at 0.
# Zero entries give log(0) = -inf and 0/0 gives NaN; both are expected here, so
# the floating-point warnings are silenced for this computation only.
with np.errstate(divide='ignore', invalid='ignore'):
    log_ratio = np.log(pr_context_words/pr_context)
    # Keep strictly positive values; negatives, -inf and NaN all become 0
    m_info = np.where(log_ratio > 0, log_ratio, 0.0)

In [None]:
# Display the dimensions of m_info as a sanity check
m_info.shape

Problem 5:

Let us use PCA to reduce the dimensionality of the representation to 100 components.

In [None]:
from sklearn.decomposition import PCA

# Labelled copy of the full matrix (rows: vocabulary, columns: context words)
df_phi_w = pd.DataFrame(m_info, index=vocabulary, columns=context_w)

# Project onto 100 principal components; the seed makes the solver repeatable
pca = PCA(n_components=100, random_state=10)
pca_scores = pca.fit_transform(m_info)
# Re-attach the vocabulary so rows can be looked up by word
pca_m_info = pd.DataFrame(pca_scores, index=vocabulary)

In [None]:
# Preview the first rows of the labelled matrix (vocabulary words x context words)
df_phi_w.head()

In [None]:
# Cumulative share of variance captured as components are added
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots()
ax.plot(cumulative_variance)
ax.set_xlabel('number of components')
ax.set_ylabel('cumulative explained variance')

# Total variance retained by the fitted PCA, as a percentage
explained_pct = pca.explained_variance_ratio_.sum()* 100.0
print('For PCA(100), ', explained_pct, '% of the variance is explained.')

Problem 6:

In [None]:
# Import K Means Package
from sklearn.cluster import KMeans

# Set k = 100; k-means++ initialisation with a fixed seed so cluster ids are repeatable
km100 = KMeans(n_init=6, n_clusters=100, max_iter= 1000, init='k-means++', random_state=0)
km100.fit(pca_m_info)
# Get cluster assignment labels (one per row of pca_m_info)
labels = km100.labels_
# Format results as a DataFrame. Building it column-wise keeps 'cluster' as an
# integer column; stacking the two sequences as rows and transposing would
# turn both columns into generic object dtype.
results = pd.DataFrame({'class': list(pca_m_info.index), 'cluster': labels})

In [None]:
# Group the words by cluster id once instead of recomputing the grouping for
# every printed cluster.
words_by_cluster = results.groupby('cluster')['class'].apply(list)

# (heading, cluster id) pairs to display.
# NOTE(review): the ids look hand-picked from one particular clustering run;
# re-check them whenever the data or the clustering settings change.
clusters_to_show = [
    ('Cluster for economics:\n', 4),
    ('Cluster for state affairs:\n', 8),
    ('Cluster for state affairs (2):\n', 12),
    ('Cluster for quantities:\n', 6),
    ('Cluster for names or pronouns:\n', 13),
]
for heading, cluster_id in clusters_to_show:
    print(heading, words_by_cluster[cluster_id], '\n')

With k-means++ initialization, most of the clusters are semantically coherent, although a few of them appear to be random. Following the directions of the problem, the clusters printed above are the ones that appear to be the most salient.

In [None]:
# Import KNN
from sklearn.neighbors import NearestNeighbors

# Function to achieve nearest neighbor
def knn_info(brown_words2):
    """Print the nearest neighbour (by cosine distance) of each given word.

    Neighbours are searched among the rows of the module-level ``pca_m_info``
    DataFrame, whose index holds the words.

    Parameters
    ----------
    brown_words2 : iterable of str
        Words to look up. The parameter name shadows the global token list
        inside this function; it is kept so existing calls keep working.

    Returns
    -------
    None
        One line is printed per word. Words that are missing from
        ``pca_m_info`` are reported instead of raising ``KeyError``.
    """
    # Fit once on the full embedding and ask for two neighbours: the query word
    # itself plus its closest other word. This avoids copying the frame and
    # refitting the model for every query.
    neigh = NearestNeighbors(n_neighbors=2, algorithm='brute', metric='cosine')
    neigh.fit(pca_m_info)
    for word in brown_words2:
        if word not in pca_m_info.index:
            print('For', word, ', no embedding is available (word not in the vocabulary)')
            continue
        nn_loc = neigh.kneighbors(pca_m_info[pca_m_info.index == word])[1][0]
        # Look up labels one integer position at a time: indexing a pandas
        # Index with a 2-D array is rejected by recent pandas versions.
        # Filtering by label (rather than dropping the first hit) stays correct
        # even if the word itself is not ranked first.
        candidates = [pca_m_info.index[pos] for pos in nn_loc if pca_m_info.index[pos] != word]
        print('For', word, ', the nearest neighbor is = ', candidates[0])


In [None]:
# Probe words whose nearest neighbours are inspected below
word_list = [
    'communism', 'autumn', 'cigarette', 'pulmonary', 'mankind',
    'africa', 'chicago', 'revolution', 'september', 'chemical',
    'detergent', 'dictionary', 'storm', 'worship', 'employees',
    'million', 'wife', 'husband', 'education', 'world',
    'christ', 'would', 'cattle', 'thousand', 'new',
]
knn_info(word_list)

In my interpretation, the nearest-neighbor search does a very good job of teasing out relationships among the chosen words. Only a few pairs require more thought to interpret: cigarette → bullet and storm → saturday. Other than that, the neighbors are pretty much spot on.