In [37]:
import os
import string
import pickle
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer

import gensim
from gensim import utils
import numpy as np
import sys
from sklearn.datasets import fetch_20newsgroups
from nltk import word_tokenize
from nltk import download
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from gensim.models import word2vec
import multiprocessing
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [47]:
import logging
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

In [5]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 
get_ipython().magic('matplotlib inline')

[nltk_data] Downloading package punkt to /Users/jhuang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jhuang/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
# Get file names
in_dir = "nlp_approach/data/transcripts/utterances"
files = os.listdir(in_dir)
files.sort()
new_files = []
justicenames = []
for f in files:
    if f.endswith(".txt"):
        new_files.append(os.path.join(in_dir,f))
        justicenames.append(f[:-4])

print(justicenames)

['AJGoldberg', 'AMKennedy', 'AScalia', 'BRWhite', 'DHSouter', 'EKagan', 'JGRoberts', 'JPStevens', 'LFPowell', 'NMGorsuch', 'RBGinsburg', 'SAAlito', 'SDOConnor', 'SGBreyer', 'SSotomayor', 'TMarshall', 'WHRehnquist']


In [69]:
# Load Google's pre-trained Word2Vec model.
w = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz', binary=True)

2019-05-10 22:06:14,164 : INFO : loading projection weights from GoogleNews-vectors-negative300.bin.gz
2019-05-10 22:09:21,423 : INFO : loaded (3000000, 300) matrix from GoogleNews-vectors-negative300.bin.gz


In [None]:
w.save("pretrained_word2vec_model")

2019-05-10 22:10:53,272 : INFO : saving Word2VecKeyedVectors object under pretrained_word2vec_model, separately None
2019-05-10 22:10:53,273 : INFO : storing np array 'vectors' to pretrained_word2vec_model.vectors.npy
2019-05-10 22:11:33,382 : INFO : not storing attribute vectors_norm


In [70]:
def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    doc = [word for word in doc if word.isalpha()] #restricts string to alphabetic characters only
    return doc

def get_corpus(files):
    corpus = []
    for f in files:
        data = f.read().replace('\n', '')
        data = preprocess(data)
        doc = gensim.utils.simple_preprocess(data, deacc=True)
        corpus.append(doc)
    return corpus

In [71]:
def get_mean_vector(word2vec_model, words):
    # remove out-of-vocabulary words
    words = [word for word in words if word in word2vec_model.vocab]
    if len(words) >= 1:
        return np.mean(word2vec_model[words], axis=0)
    else:
        return []

In [None]:
corpus = get_corpus(new_files)
for doc in corpus:
    vec = get_mean_vector(w, doc)
    if len(vec) > 0:
      # do somthing with the vector ${vec}

In [58]:
def read_input(input_files):
    """This method reads the input file which is in gzip format"""

    lines = []
    for input_file in input_files:
        with open(input_file, 'rb') as f:
            logging.info("reading file {0}...this may take a while".format(input_file))
            for i, line in enumerate(f):
                # do some pre-processing and return list of words for each review
                # text
                lines.append(gensim.utils.simple_preprocess(line))
    return lines

In [59]:
documents = read_input(new_files)

2019-05-10 21:47:09,891 : INFO : reading file nlp_approach/data/transcripts/utterances/AJGoldberg.txt...this may take a while
2019-05-10 21:47:09,909 : INFO : reading file nlp_approach/data/transcripts/utterances/AMKennedy.txt...this may take a while
2019-05-10 21:47:09,992 : INFO : reading file nlp_approach/data/transcripts/utterances/AScalia.txt...this may take a while
2019-05-10 21:47:10,043 : INFO : reading file nlp_approach/data/transcripts/utterances/BRWhite.txt...this may take a while
2019-05-10 21:47:10,049 : INFO : reading file nlp_approach/data/transcripts/utterances/DHSouter.txt...this may take a while
2019-05-10 21:47:10,183 : INFO : reading file nlp_approach/data/transcripts/utterances/EKagan.txt...this may take a while
2019-05-10 21:47:10,263 : INFO : reading file nlp_approach/data/transcripts/utterances/JGRoberts.txt...this may take a while
2019-05-10 21:47:10,367 : INFO : reading file nlp_approach/data/transcripts/utterances/JPStevens.txt...this may take a while
2019-05

In [60]:
 model = gensim.models.Word2Vec(
        documents,
        size=150,
        window=10,
        min_count=2,
        workers=10)

2019-05-10 21:47:19,613 : INFO : collecting all words and their counts
2019-05-10 21:47:19,614 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-05-10 21:47:19,712 : INFO : PROGRESS: at sentence #10000, processed 338550 words, keeping 11442 word types
2019-05-10 21:47:19,771 : INFO : collected 14927 word types from a corpus of 618049 raw words and 18459 sentences
2019-05-10 21:47:19,772 : INFO : Loading a fresh vocabulary
2019-05-10 21:47:19,803 : INFO : effective_min_count=2 retains 8942 unique words (59% of original 14927, drops 5985)
2019-05-10 21:47:19,804 : INFO : effective_min_count=2 leaves 612064 word corpus (99% of original 618049, drops 5985)
2019-05-10 21:47:19,842 : INFO : deleting the raw counts dictionary of 14927 items
2019-05-10 21:47:19,843 : INFO : sample=0.001 downsamples 55 most-common words
2019-05-10 21:47:19,844 : INFO : downsampling leaves estimated 423299 word corpus (69.2% of prior 612064)
2019-05-10 21:47:19,883 : INFO : estimate

In [61]:
model.save("word2vec_model")

2019-05-10 21:48:15,701 : INFO : saving Word2Vec object under word2vec_model, separately None
2019-05-10 21:48:15,702 : INFO : not storing attribute vectors_norm
2019-05-10 21:48:15,704 : INFO : not storing attribute cum_table
2019-05-10 21:48:15,889 : INFO : saved word2vec_model


2019-05-10 21:48:39,921 : INFO : precomputing L2-norms of word weight vectors


ValueError: cannot compute similarity with no input

In [35]:
def read_data(f):
    """Extract the first file enclosed in a zip file as a list of words."""
    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

In [38]:
vocabulary = read_data("nlp_approach/data/transcripts/utterances/AJGoldberg.txt")
print(vocabulary[:7])

AttributeError: 'str' object has no attribute 'read'

In [None]:
def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    doc = [word for word in doc if word.isalpha()] #restricts string to alphabetic characters only
    return doc

In [21]:
def create_word2vec_model(embedding_size, input_file="data/transcripts/utterances"):
    """
    Create the word2vec model based on the given embedding size and the corpus file.
    :param embedding_size: The embedding size
    :param input_file: The corpus file
    """
    word2vec_file = 'word2vec_' + str(embedding_size) + '.model'

    if os.path.isfile(word2vec_file):
        logging.info('? The word2vec model you want create already exists!')
    else:
        sentences = word2vec.LineSentence(input_file)
        # sg=0 means use CBOW model(default); sg=1 means use skip-gram model.
        model = gensim.models.Word2Vec(sentences, size=embedding_size, min_count=0,
                                       sg=0, workers=multiprocessing.cpu_count())
        model.save(word2vec_file) 
        

In [30]:
embedding_size=100
input_file="nlp_approach/data/transcripts/utterances/test.txt"
word2vec_file = 'word2vec_' + str(embedding_size) + '.model'

if os.path.isfile(word2vec_file):
    logging.info('? The word2vec model you want create already exists!')
else:
    sentences = word2vec.LineSentence(input_file)
    # sg=0 means use CBOW model(default); sg=1 means use skip-gram model.
    model = gensim.models.Word2Vec(sentences, size=embedding_size, min_count=0,
                                   sg=0, workers=multiprocessing.cpu_count())
    model.save(word2vec_file) 
    model.train(documents, total_examples=len(documents), epochs=10)
    

In [None]:
model = gensim.models.Word2Vec(
    new_files,
    size=150,
    window=10,
    min_count=2,
    workers=10)
model.train(documents, total_examples=len(documents), epochs=10)

### Tokenize, remove stopwords
def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    doc = [word for word in doc if word.isalpha()] #restricts string to alphabetic characters only
    return doc

corpus = [preprocess(text) for text in texts]

# ### Remove OOV words and documents with no words in model dictionar


x =[]
for doc in corpus: #look up each doc in model
    x.append(document_vector(model, doc))


X = np.array(x) #list to array

np.save('documents_vectors.npy', X)  #np.savetxt('documents_vectors.txt', X)
np.save('labels.npy', y)             #np.savetxt('labels.txt', y)

In [None]:
# ### Plot 2 PCA components

# In[22]:

pca = PCA(n_components=2)
x_pca = pca.fit_transform(X)


# In[23]:

plt.figure(1, figsize=(30, 20),)
plt.scatter(x_pca[:, 0], x_pca[:, 1],s=100, c=y, alpha=0.2)


# ### Plot t-SNE

# In[24]:

from sklearn.manifold import TSNE
X_tsne = TSNE(n_components=2, verbose=2).fit_transform(X)


# In[25]:

plt.figure(1, figsize=(30, 20),)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1],s=100, c=y, alpha=0.2)


# In[ ]:



© 2019 GitHub, Inc.
Terms
Privacy
Security
Status
Help
Contact GitHub
Pricing
API
Training
Blog
About