## Word2Vec Example in Python

[http://districtdatalabs.silvrback.com/modern-methods-for-sentiment-analysis]

In [1]:
import gensim
from gensim.models.word2vec import Word2Vec

In [2]:
# Load Google's pre-trained Word2Vec model.
model = gensim.models.KeyedVectors.load_word2vec_format('../../GoogleNews-vectors-negative300.bin', binary=True) 

In [4]:
model.most_similar('ok',topn=5)

[('okay', 0.8567795753479004),
 ('alright', 0.807797372341156),
 ('OK', 0.6864467859268188),
 ('lol', 0.6789620518684387),
 ('anyways', 0.6699042320251465)]

In [5]:
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('queen', 0.7118192315101624),
 ('monarch', 0.6189674735069275),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133)]

In [6]:
## word to vector
model['me'].shape

(300,)

#### Sentiment analysis

In [33]:
import gensim
from gensim.models.word2vec import Word2Vec
import pandas as pd
import numpy as np

In [34]:
file_path = '../../twit_data/training.csv'
data = pd.read_csv(file_path,encoding='latin1',header=None)
columns = ['sentiment','text']
data=data[[0,5]]
data.columns=columns

In [35]:
## keep only positive and negeative values 
data = data[((data['sentiment']==4) | (data['sentiment']==0))]
data['sentiment'][data['sentiment']==4]  = 1 
#data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [36]:
## clean up text and make them into list of tokens 
def clean_text(text):
    return text.lower().replace('\n','').split()

data['text'] = data['text'].map(clean_text)

In [37]:
## assign training and testing data 
x_train = data['text']
y_train = data['sentiment']
del data

In [38]:
### initialize model and build vocabulary 
n_dim = 300
window = 7 
downsampling = 0.0001
seed = 1 
num_workers = 4
min_count = 10 
twit_w2v = Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=n_dim,
    min_count=min_count,
    window= window,
    sample=downsampling
)
## build the vocabulary
twit_w2v.build_vocab(x_train)

In [39]:
## train w2v model 
corpus_count = twit_w2v.corpus_count
iteration = 10
if gensim.__version__[0] =='1':
    twit_w2v.train(x_train)
else:
    twit_w2v.train(x_train,total_examples=corpus_count,epochs = iteration)

In [40]:
## save trained model 
import os 

if not os.path.exists("trained"):
    os.makedirs("trained")
    twit_w2v.save(os.path.join('trained','twit.w2v'))
else:
    twit_w2v = Word2Vec.load(os.path.join('trained','twit.w2v'))

In [41]:
# so obviously, we need to tokenize it better 
twit_w2v.most_similar('see',topn=5)

[('seeing', 0.6153379678726196),
 ('concert..', 0.5914677381515503),
 ('c', 0.569611668586731),
 ('watch:', 0.5642238259315491),
 ('together!!!', 0.5602788925170898)]

So now, we are going to use word embeding to create a document term matrix. 
it is kind of a naive way of doing it, simply aggregate all tokens in a sentence and average it. 

In [42]:
model = twit_w2v.wv ## wv is just easier to work with
vocabs = model.vocab.keys()
del twit_w2v

In [43]:
def buildDocumentVector(text,size):
    text = [t for t in text if t in vocabs]
    if len(text)==0:
        return None
    else:
        vec = [model[t] for t in text]
        return np.stack(vec,axis=0).mean(axis=0)

In [45]:
## mean standard normalize out input data 
from sklearn.preprocessing import scale 

doc_term = [buildDocumentVector(z,n_dim) for z in x_train]

for inx, vec in enumerate(doc_term):
    if vec is None: y_train[inx]=None

train_vecs = [x for x in doc_term if x is not None]
train_y = [x for x in y_train if x is not None ]
print('x_train vector length: ',len(train_vecs))
print('y_tain classification: ',len(train_y))

del doc_term 

1600000
x_train vector length:  1594001
y_tain classification:  1594001


In [51]:
## split training and testing data 
from sklearn.cross_validation import train_test_split
x_train_vec,x_test_vec,y_train,y_test = train_test_split(train_vecs,train_y,test_size=0.2)
x_train_vec = scale(x_train_vec)

In [53]:
print(len(x_train_vec)," ",len(y_train))

1275200   1275200


In [54]:
## classification algorism 
from sklearn.linear_model import SGDClassifier

lr = SGDClassifier(loss='log',penalty='l1')
lr.fit(x_train_vec,y_train)

print('training accuracy:', lr.score(x_test_vec,y_test))


training accuracy: 0.642877531752


#### try use nero network 

In [61]:
import tensorflow as tf 
import tflearn
from tflearn.data_utils import to_categorical

In [62]:
y_train_onehot = to_categorical(y_train,2)

In [70]:
### build a basic network 
def build_model():
    tf.reset_default_graph()
    net = tflearn.input_data([None,n_dim])
    net = tflearn.fully_connected(net,200,activation='ReLU')
    net = tflearn.fully_connected(net,50,activation='ReLU')
    ## output layer 
    net = tflearn.fully_connected(net,2,activation='softmax')
    net = tflearn.regression(net,optimizer='sgd',
                            learning_rate=0.01,
                            loss='categorical_crossentropy')
    model = tflearn.DNN(net)
    return model 


In [None]:
model = build_model()
model.fit(x_train_vec,y_train_onehot,show_metric=True,batch_size=256,n_epoch=50)

Training Step: 132928  | total loss: [1m[32m0.38996[0m[0m | time: 18.183s
[2K| SGD | epoch: 027 | loss: 0.38996 - acc: 0.8230 -- iter: 0869376/1275200


Test neuro network 

In [None]:
predictions = np.array(model.predict(x_test_vec)[:,0]>0.5).astype(np.int_)
test_accuracy = np.mean(predictions == y_test)
print("Test accuracy: ", test_accuracy)

### To get vector from google word to vect 

In [None]:
## document to matrix, by looking up wordvectors



In [15]:
# Retrieve the entire list of "words" from the Google Word2Vec model, and write
# these out to text files so we can peruse them.
vocab = model.vocab.keys()
wordsInVocab = len(vocab)

In [29]:
model.index2word[0]

'</s>'

In [56]:
list(model.vocab)[0]

'</s>'