## Word2Vec Example in Python

[http://districtdatalabs.silvrback.com/modern-methods-for-sentiment-analysis]

In [None]:
import gensim
from gensim.models.word2vec import Word2Vec

In [None]:
# Load Google's pre-trained Word2Vec model.
model = gensim.models.KeyedVectors.load_word2vec_format('../../GoogleNews-vectors-negative300.bin', binary=True) 

In [None]:
model.most_similar('ok',topn=5)

In [None]:
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

In [None]:
## word to vector
model['me'].shape

#### Sentiment analysis

In [1]:
import gensim
from gensim.models.word2vec import Word2Vec
import pandas as pd
import numpy as np

In [2]:
file_path = '../../twit_data/training.csv'
data = pd.read_csv(file_path,encoding='latin1',header=None)
columns = ['sentiment','text']
data=data[[0,5]]
data.columns=columns

In [3]:
## keep only positive and negeative values 
data = data[((data['sentiment']==4) | (data['sentiment']==0))]
data['sentiment'][data['sentiment']==4]  = 1 
#data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [4]:
## clean up text and make them into list of tokens 
def clean_text(text):
    return text.lower().replace('\n','').split()

data['text'] = data['text'].map(clean_text)

In [5]:
## assign training and testing data 
x_train = data['text'].tolist()
y_train = data['sentiment'].tolist()
del data

In [6]:
### initialize model and build vocabulary 
n_dim = 300
window = 7 
downsampling = 0.001
seed = 1 
num_workers = 8
min_count = 3 
twit_w2v = Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=n_dim,
    min_count=min_count,
    window= window,
    sample=downsampling
)
## build the vocabulary
twit_w2v.build_vocab(x_train)

In [7]:
## train w2v model 
corpus_count = twit_w2v.corpus_count
iteration = 5
if gensim.__version__[0] =='1':
    twit_w2v.train(x_train)
else:
    twit_w2v.train(x_train,total_examples=corpus_count,epochs = iteration)

In [8]:
## save trained model 
import os 

if not os.path.exists("trained"):
    os.makedirs("trained")
    twit_w2v.save(os.path.join('trained','twit.w2v'))
else:
    twit_w2v = Word2Vec.load(os.path.join('trained','twit.w2v'))

In [9]:
# so obviously, we need to tokenize it better 
twit_w2v.most_similar('what',topn=5)

[('wat', 0.6028522253036499),
 ('...what', 0.5512292981147766),
 ('how', 0.5442442893981934),
 ("what'd", 0.5403693914413452),
 ('wot', 0.5325145721435547)]

So now, we are going to use word embeding to create a document term matrix. 
it is kind of a naive way of doing it, simply aggregate all tokens in a sentence and average it. 

In [10]:
model = twit_w2v.wv ## wv is just easier to work with
vocabs = model.vocab.keys()
del twit_w2v

In [11]:
i = 911001
print(x_train[i])
print(y_train[i])
### 0 is negative 1 is positive

['got', 'to', 'reconnect', 'with', 'some', 'dear', 'friends', 'tonight.', 'i', 'am', 'so', 'lucky', 'to', 'have', 'so', 'many', 'great', 'people', 'in', 'my', 'life.', 'i', 'am', 'blessed']
1


In [12]:
def buildDocumentVector(text,size):
    text = [t for t in text if t in vocabs]
    if len(text)==0:
        return None
    else:
        vec = [model[t] for t in text]
        return np.stack(vec,axis=0).mean(axis=0)

In [13]:
test ="i am so happy to be here"
buildDocumentVector(test,300).shape

(300,)

In [14]:
## mean standard normalize out input data 
from sklearn.preprocessing import scale 

doc_term = [buildDocumentVector(z,n_dim) for z in x_train]

for inx, vec in enumerate(doc_term):
    if vec is None: y_train[inx]=None

train_vecs = [x for x in doc_term if x is not None]
train_y = [x for x in y_train if x is not None ]
print('x_train vector length: ',len(train_vecs))
print('y_tain classification: ',len(train_y))

del doc_term 

x_train vector length:  1594001
y_tain classification:  1594001


In [127]:
## split training and testing data 
from sklearn.cross_validation import train_test_split
x_train_vec,x_test_vec,y_train,y_test = train_test_split(train_vecs,train_y,test_size=0.2)
x_train_vec = scale(x_train_vec)

In [56]:
## classification algorism 
from sklearn.linear_model import SGDClassifier

lr = SGDClassifier(loss='log',penalty='l1')
lr.fit(x_train_vec,y_train)

print('test accuracy:', lr.score(x_test_vec,y_test))

test accuracy: 0.699295171596


#### try use nero network 

In [86]:
import tensorflow as tf 
import tflearn
from tflearn.data_utils import to_categorical

In [87]:
y_train_onehot = to_categorical(y_train,2)

In [88]:
print(y_train[1])
print(y_train_onehot[1])

0
[ 1.  0.]


In [90]:
### build a basic network 
def build_model(keep_prob):
    tf.reset_default_graph()
    net = tflearn.input_data([None,n_dim])
    net = tflearn.fully_connected(net,300,activation='ReLU')
    net = tflearn.fully_connected(net,20,activation='ReLU')
    ## output layer 
    net = tflearn.fully_connected(net,2,activation='softmax')
    net = tflearn.regression(net,optimizer='sgd',
                            learning_rate=0.005,
                            loss='categorical_crossentropy')
    model = tflearn.DNN(net)
    return model 


In [91]:
keep_prob=0.5
model_net = build_model(keep_prob)
model_net.fit(x_train_vec,y_train_onehot,show_metric=True,batch_size=512,n_epoch=10)

Training Step: 24909  | total loss: [1m[32m0.45577[0m[0m | time: 10.582s
| SGD | epoch: 010 | loss: 0.45577 - acc: 0.7886 -- iter: 1274880/1275200
Training Step: 24910  | total loss: [1m[32m0.45308[0m[0m | time: 10.586s
| SGD | epoch: 010 | loss: 0.45308 - acc: 0.7902 -- iter: 1275200/1275200
--


Test neuro network 

In [108]:
predictions = np.array(model_net.predict(scale(x_test_vec))[:,1]>0.5).astype(np.int_)
test_accuracy = np.mean(predictions == np.array(y_test))
print("Test accuracy: ", test_accuracy)

Test accuracy:  0.792045821688


In [None]:
text = "i love like this place"
sample = x_test_vec.copy()
input_data = [buildDocumentVector(text,300)]
sample.append(input_data)
input_data = scale(sample)
#model_net.predict(input_data)[:,1]>0.5

### To get vector from google word to vect 

In [None]:
## document to matrix, by looking up wordvectors



In [None]:
# Retrieve the entire list of "words" from the Google Word2Vec model, and write
# these out to text files so we can peruse them.
vocab = model.vocab.keys()
wordsInVocab = len(vocab)

In [None]:
model.index2word[0]

In [None]:
list(model.vocab)[0]