In [1]:
#Gensim Docs: https://radimrehurek.com/gensim/
#Pandas Docs: http://pandas.pydata.org/pandas-docs/stable/
#Description: https://en.wikipedia.org/wiki/Word2vec
#Tutorials/Code Example:  https://github.com/RaRe-Technologies/gensim/blob/develop/gensim%20Quick%20Start.ipynb
#Library Purpose: Gensim's Word2vec is a two-layer neural net that processes text.
#Its input is a text corpus and its output is a set of vectors: feature vectors for the words in that corpus.
#It can be applied just as well to genes, code, likes, playlists, social media graphs and
#other verbal or symbolic series in which patterns may be discerned.

"""Basic word2vec example."""
import numpy as np
import pandas as pd
import gensim #open-source machine learning framework
from gensim import corpora
from gensim import models

#import favorite text dataset for analysis
def file_read_csv(path):
    """Read a CSV file into a pandas DataFrame.

    A fixed banner line is printed to stdout before the file is read.

    Args:
        path: Location of the CSV file; handed unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv(path)``.
    """
    print("Pandas File I/O Example - CSV Read")
    # Parse with pandas' default CSV settings and hand the frame straight back.
    return pd.read_csv(path)

# Establish the corpus: point corpus_path at your own dataset.
# NOTE(review): the literal is a raw string that also doubles its backslashes, so
# the path contains two backslashes per separator — confirm this is intended.
corpus_path = r"C:\\Python\\Data\\text8"
# NOTE(review): file_read_csv returns a pandas DataFrame (the result of
# pd.read_csv), so raw_corpus is a DataFrame, not a list of document strings.
raw_corpus = file_read_csv(corpus_path)
# Next: preprocess the dataset (stoplist, word frequencies and filters).




Pandas File I/O Example - CSV Read


In [26]:
# Common function words that are dropped from every document.
stoplist = set('for a of the and to in'.split())

# Tokenise each document: lowercase it, split on whitespace, and keep only the
# tokens that are not in the stoplist. The result is one token list per document.
# NOTE(review): raw_corpus is the DataFrame returned by file_read_csv; iterating a
# DataFrame yields its column labels rather than its rows — confirm the
# "documents" seen by this loop are the ones intended.
texts = []
for document in raw_corpus:
    kept_words = []
    for word in document.lower().split():
        if word in stoplist:
            continue
        kept_words.append(word)
    texts.append(kept_words)
# Each entry of texts is the full (non-deduplicated) token list of one document.
#print(texts[0])

In [27]:
# Count how many times each token occurs across all documents.
# Counter replaces the hand-written defaultdict(int) tally loop; like that
# defaultdict, it reports 0 for a token it has never seen.
# defaultdict stays imported in case another cell still uses the name.
from collections import Counter, defaultdict

# Single pass over every token of every document in texts.
frequency = Counter(token for text in texts for token in text)

In [28]:
# Filter every document down to the tokens whose corpus-wide count is at least two.
processed_corpus = []
for doc_tokens in texts:
    processed_corpus.append([tok for tok in doc_tokens if frequency[tok] >= 2])
# Tokens may still repeat within a document; only the single-occurrence ones are gone.
#print(processed_corpus)

In [29]:
# Build the vocabulary: gensim.corpora.Dictionary associates each word in the
# processed corpus with a unique integer ID. This dictionary is the set of all
# words the later processing steps know about.
dictionary = corpora.Dictionary(documents=processed_corpus)
#print(dictionary)

Dictionary(135328 unique tokens: ['aa', 'aaa', 'aaaa', 'aaaargh', 'aaai']...)


In [9]:
# To infer the latent structure of the corpus, documents need a representation
# that can be manipulated mathematically; one approach is a vector per document.
# Here each processed document is converted to its bag-of-words form.
bow_corpus = list(map(dictionary.doc2bow, processed_corpus))
#print(bow_corpus[0])

In [31]:
''' Now that we have vectorized our corpus we can begin to transform it using models. 
We use model as an abstract term referring to a transformation from one document representation to another. 
In gensim, documents are represented as vectors so a model can be thought of as a transformation between two vector spaces. 
The details of this transformation are learned from the training corpus. '''

# tf-idf is one simple example of a model: it maps bag-of-words vectors into a
# vector space where the frequency counts are weighted by how rare each word is
# in the corpus.
# Train the model on the bag-of-words corpus.
tfidf = models.TfidfModel(bow_corpus)

# Transform the "system minors" string step by step:
# tokenise it, convert the tokens to bag-of-words, then apply the trained model.
query_tokens = "system minors".lower().split()
query_bow = dictionary.doc2bow(query_tokens)
tfidf2 = tfidf[query_bow]
# NOTE(review): the recorded output of this cell is []. Presumably every query
# term received zero weight or is missing from the dictionary — confirm that the
# training corpus contains more than one document.
print(tfidf2)

[]
