In [1]:
import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup


In [2]:
def download_data(dir):
    """Read every regular file in ``dir`` into a one-column DataFrame.

    Parameters
    ----------
    dir : str
        Path of a directory whose files each contain one text document.

    Returns
    -------
    pandas.DataFrame
        Frame with a single ``sentence`` column holding the full content of
        each file, one row per file, ordered by file name. The frame is empty
        (but still has the ``sentence`` column) when ``dir`` holds no files.
    """
    texts = []
    # os.listdir() yields entries in arbitrary order; sorting makes the row
    # order reproducible across runs and machines.
    for file in sorted(os.listdir(dir)):
        path = os.path.join(dir, file)
        # Sub-directories (or other non-file entries) cannot be opened as
        # text, so skip them instead of crashing.
        if not os.path.isfile(path):
            continue
        # Decode explicitly as UTF-8 rather than relying on the platform's
        # default encoding, which differs between systems.
        with open(path, 'r', encoding='utf-8') as f:
            texts.append(f.read())
    return pd.DataFrame.from_dict({'sentence': texts})

def load_data(dir, random_state=None):
    """Load the ``pos`` and ``neg`` sub-folders of ``dir`` as one shuffled frame.

    Parameters
    ----------
    dir : str
        Directory containing a ``pos`` and a ``neg`` sub-directory of text
        files.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle can be
        reproduced. ``None`` (the default) keeps the previous unseeded
        behaviour.

    Returns
    -------
    pandas.DataFrame
        Shuffled rows with columns ``index`` (the row's position inside its
        own pos/neg frame before shuffling), ``sentence`` and ``sentiment``
        (1 for files from ``pos``, 0 for files from ``neg``).
    """
    pos_df = download_data(os.path.join(dir, 'pos'))
    neg_df = download_data(os.path.join(dir, 'neg'))
    pos_df['sentiment'] = 1
    neg_df['sentiment'] = 0
    combined = pd.concat([pos_df, neg_df])
    shuffled = combined.sample(frac=1, random_state=random_state)
    # drop=False deliberately keeps the pre-shuffle row labels as an
    # 'index' column, matching the frame layout used so far.
    return shuffled.reset_index(drop=False)

# Locations of the labeled train / test splits of the aclImdb corpus.
train_dir = '/home/jiangxinyang/git_projects/machine_learning/tensorflow/Sequences/aclImdb/train'
test_dir = '/home/jiangxinyang/git_projects/machine_learning/tensorflow/Sequences/aclImdb/test'

# Load both splits (train first, then test) with the same loader.
train_df, test_df = map(load_data, (train_dir, test_dir))

# Preview the first rows of the shuffled training frame.
train_df.head()

Unnamed: 0,index,sentence,sentiment
0,6894,Hopefully the score has changed by now due to ...,1
1,8263,(Spoilers galore) This is an absolutely awful ...,0
2,11776,Jack Black can usually make me snicker simply ...,0
3,11096,"After seeing all the Jesse James, Quantrill, j...",1
4,12423,"Make no mistake, Maureen O'Sullivan is easily ...",1


In [3]:
# Reviews from the 'unsup' folder: raw text only, so the frame has just the
# 'sentence' column and no 'sentiment' label.
unsup_dir = '/home/jiangxinyang/git_projects/machine_learning/tensorflow/Sequences/aclImdb/train/unsup'
unlabeled_df = download_data(unsup_dir)
def sentence_to_wordList(sentence):
    """Turn one raw review into a list of lowercase alphabetic tokens.

    Parameters
    ----------
    sentence : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    list of str
        The text with markup stripped, every non-letter character replaced
        by a space, lowercased and split on whitespace.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns about it), so results could
    # differ between environments. 'html.parser' ships with Python.
    sentence_text = BeautifulSoup(sentence, 'html.parser').get_text()
    # Anything that is not an ASCII letter (digits, punctuation, apostrophes)
    # becomes a token separator.
    sentence_text = re.sub('[^a-zA-Z]', " ", sentence_text)
    return sentence_text.lower().split()

# Tokenize the labeled training reviews first, then append the unlabeled
# ones, so `sentences` holds one word list per review.
sentences = [sentence_to_wordList(text) for text in train_df['sentence']]
sentences.extend(sentence_to_wordList(text) for text in unlabeled_df['sentence'])

# Show the tokens of the first review as a sanity check.
sentences[0]



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


['hopefully',
 'the',
 'score',
 'has',
 'changed',
 'by',
 'now',
 'due',
 'to',
 'my',
 'brilliant',
 'and',
 'stunning',
 'review',
 'which',
 'persuades',
 'all',
 'of',
 'you',
 'to',
 'go',
 'and',
 'watch',
 'the',
 'film',
 'thereby',
 'creating',
 'an',
 'instant',
 'chorus',
 'of',
 's',
 'this',
 'movie',
 's',
 'true',
 'score',
 'as',
 'mentioned',
 'before',
 'chris',
 'rock',
 'is',
 'the',
 'king',
 'previous',
 'to',
 'going',
 'to',
 'see',
 'this',
 'movie',
 'i',
 'wasn',
 't',
 'that',
 'over',
 'the',
 'top',
 'about',
 'him',
 'but',
 'now',
 'i',
 'm',
 'banging',
 'on',
 'the',
 'doors',
 'of',
 'chris',
 'rock',
 's',
 'website',
 'begging',
 'him',
 'to',
 'take',
 'me',
 'on',
 'as',
 'his',
 'protege',
 'this',
 'film',
 'is',
 'truly',
 'funny',
 'if',
 'you',
 'don',
 't',
 'find',
 'this',
 'movie',
 'funny',
 'you',
 'really',
 'need',
 'therapy',
 'and',
 'it',
 's',
 'humour',
 'which',
 'targets',
 'all',
 'areas',
 'of',
 'society',
 'including',
 '

In [4]:
# Drop single-character tokens (e.g. the stray 's' / 't' left over after
# apostrophes were replaced by spaces) from every review.
sentences = [list(filter(lambda token: len(token) > 1, tokens)) for tokens in sentences]
sentences[0]

['hopefully',
 'the',
 'score',
 'has',
 'changed',
 'by',
 'now',
 'due',
 'to',
 'my',
 'brilliant',
 'and',
 'stunning',
 'review',
 'which',
 'persuades',
 'all',
 'of',
 'you',
 'to',
 'go',
 'and',
 'watch',
 'the',
 'film',
 'thereby',
 'creating',
 'an',
 'instant',
 'chorus',
 'of',
 'this',
 'movie',
 'true',
 'score',
 'as',
 'mentioned',
 'before',
 'chris',
 'rock',
 'is',
 'the',
 'king',
 'previous',
 'to',
 'going',
 'to',
 'see',
 'this',
 'movie',
 'wasn',
 'that',
 'over',
 'the',
 'top',
 'about',
 'him',
 'but',
 'now',
 'banging',
 'on',
 'the',
 'doors',
 'of',
 'chris',
 'rock',
 'website',
 'begging',
 'him',
 'to',
 'take',
 'me',
 'on',
 'as',
 'his',
 'protege',
 'this',
 'film',
 'is',
 'truly',
 'funny',
 'if',
 'you',
 'don',
 'find',
 'this',
 'movie',
 'funny',
 'you',
 'really',
 'need',
 'therapy',
 'and',
 'it',
 'humour',
 'which',
 'targets',
 'all',
 'areas',
 'of',
 'society',
 'including',
 'race',
 'predictably',
 'class',
 'division',
 'love',

In [7]:
import logging
from gensim.models import word2vec
# Uncomment the next line to stream gensim's training progress into the cell output.
# logging.basicConfig(format="%(asctime)s: %(levelname)s: %(message)s", level=logging.INFO)

# Hyper-parameters handed to Word2Vec below.
num_features = 300       # forwarded as `size`
min_word_count = 40      # forwarded as `min_count`
num_workers = 4          # forwarded as `workers`
context = 10             # forwarded as `window`
downsampling = 1e-3      # forwarded as `sample`

w2v_kwargs = dict(
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# Train the embedding model on the tokenized reviews.
model = word2vec.Word2Vec(sentences, **w2v_kwargs)

model

2018-07-03 18:40:05,543: INFO: collecting all words and their counts
2018-07-03 18:40:05,545: INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-07-03 18:40:06,161: INFO: PROGRESS: at sentence #10000, processed 2202906 words, keeping 51244 word types
2018-07-03 18:40:06,796: INFO: PROGRESS: at sentence #20000, processed 4458315 words, keeping 67962 word types
2018-07-03 18:40:07,420: INFO: PROGRESS: at sentence #30000, processed 6679221 words, keeping 81761 word types
2018-07-03 18:40:08,062: INFO: PROGRESS: at sentence #40000, processed 8916893 words, keeping 93943 word types
2018-07-03 18:40:08,896: INFO: PROGRESS: at sentence #50000, processed 11123478 words, keeping 103614 word types
2018-07-03 18:40:09,569: INFO: PROGRESS: at sentence #60000, processed 13364359 words, keeping 112051 word types
2018-07-03 18:40:10,133: INFO: PROGRESS: at sentence #70000, processed 15568436 words, keeping 119823 word types
2018-07-03 18:40:10,451: INFO: collected 123477 wor

2018-07-03 18:41:07,349: INFO: EPOCH 2 - PROGRESS: at 39.48% examples, 278525 words/s, in_qsize 7, out_qsize 0
2018-07-03 18:41:08,393: INFO: EPOCH 2 - PROGRESS: at 42.55% examples, 283934 words/s, in_qsize 7, out_qsize 0
2018-07-03 18:41:09,398: INFO: EPOCH 2 - PROGRESS: at 45.75% examples, 288860 words/s, in_qsize 7, out_qsize 0
2018-07-03 18:41:10,410: INFO: EPOCH 2 - PROGRESS: at 48.90% examples, 293317 words/s, in_qsize 7, out_qsize 0
2018-07-03 18:41:11,422: INFO: EPOCH 2 - PROGRESS: at 51.81% examples, 296362 words/s, in_qsize 5, out_qsize 2
2018-07-03 18:41:12,453: INFO: EPOCH 2 - PROGRESS: at 54.40% examples, 296867 words/s, in_qsize 7, out_qsize 0
2018-07-03 18:41:13,495: INFO: EPOCH 2 - PROGRESS: at 56.83% examples, 296533 words/s, in_qsize 6, out_qsize 1
2018-07-03 18:41:14,495: INFO: EPOCH 2 - PROGRESS: at 59.57% examples, 297937 words/s, in_qsize 7, out_qsize 0
2018-07-03 18:41:15,511: INFO: EPOCH 2 - PROGRESS: at 62.46% examples, 299651 words/s, in_qsize 7, out_qsize 0
2

2018-07-03 18:42:15,228: INFO: EPOCH 4 - PROGRESS: at 9.42% examples, 284763 words/s, in_qsize 8, out_qsize 1
2018-07-03 18:42:16,268: INFO: EPOCH 4 - PROGRESS: at 12.08% examples, 290416 words/s, in_qsize 7, out_qsize 0
2018-07-03 18:42:17,270: INFO: EPOCH 4 - PROGRESS: at 14.78% examples, 296128 words/s, in_qsize 7, out_qsize 0
2018-07-03 18:42:18,283: INFO: EPOCH 4 - PROGRESS: at 17.36% examples, 298590 words/s, in_qsize 7, out_qsize 0
2018-07-03 18:42:19,292: INFO: EPOCH 4 - PROGRESS: at 19.97% examples, 301524 words/s, in_qsize 7, out_qsize 0
2018-07-03 18:42:20,316: INFO: EPOCH 4 - PROGRESS: at 22.36% examples, 300062 words/s, in_qsize 7, out_qsize 0
2018-07-03 18:42:21,334: INFO: EPOCH 4 - PROGRESS: at 24.75% examples, 299064 words/s, in_qsize 7, out_qsize 0
2018-07-03 18:42:22,382: INFO: EPOCH 4 - PROGRESS: at 27.11% examples, 297398 words/s, in_qsize 7, out_qsize 0
2018-07-03 18:42:23,397: INFO: EPOCH 4 - PROGRESS: at 29.64% examples, 297043 words/s, in_qsize 7, out_qsize 0
20

<gensim.models.word2vec.Word2Vec at 0x7f2c10d3c4e0>