In [1]:
from __future__ import division
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from Word2VecUtility import Word2VecUtility
import pickle
import pandas as pd
import numpy as np
from gensim.models import word2vec
import logging
# Configure the root logger: timestamped records, INFO level and above.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [2]:
# Load the tab-separated review dump. The first row is the header, and
# quoting=3 (csv.QUOTE_NONE) makes the parser treat quote characters as
# ordinary text.
data = pd.read_csv('review.tsv', sep="\t", header=0, quoting=3)

# Sanity checks: the first review, then the frame's dimensions and columns.
print('\nThe first review is:\n')
print('%s \n' % data["text"][0])
print(data.shape)
print(data.columns)


The first review is:

Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road. 

(5563019, 2)
Index([u'stars', u'text'], dtype='object')


In [30]:
# Peek at the first three labels and the first three review texts.
# Positional .iloc is used for both columns: .ix is deprecated (and removed in
# pandas 1.0), and .iloc also works on a frame whose row labels do not start
# at 0. On a default 0..n-1 index it selects the same rows as .ix[:2] did.
print(data['stars'].iloc[:3])
print('')
print(data['text'].iloc[:3])

0    4
1    5
2    5
Name: stars, dtype: object

0    Mr Hoagie is an institution. Walking in, it do...
1    Excellent food. Superb customer service. I mis...
2    Yes this place is a little out dated and not o...
Name: text, dtype: object


In [3]:

# Draw a reproducible random sample of rows (fixed random_state), discard the
# rows that have no review text, and persist what is left as a TSV file.
size = 1000000  # number of rows to draw (the original note mentions 80000)
subdata = data.sample(n=size, random_state=520).dropna(subset=['text'])
print(subdata.index)
subdata.to_csv(
    'review_sub_399850.tsv',
    sep='\t',
    index=False,
    quoting=3,
    encoding='utf-8',
)

Int64Index([4794955, 1892608, 3999307, 2154843,  761508, 3494804, 1473565,
             863959, 2619903, 4163736,
            ...
            2500699,   49075, 1876586, 2436708, 3048511, 2531670, 2027150,
            1261754, 2247496, 2453123],
           dtype='int64', length=399850)


In [4]:
# Continue with the sample only. Rebinding `data` releases this notebook's
# reference to the full frame, exactly as the explicit `del` did. Binding
# first means that an accidental re-run of this cell fails on the missing
# `subdata` without having destroyed `data` beforehand.
data = subdata
del subdata

In [6]:
# Reload the persisted sample with the same tab separator and QUOTE_NONE
# setting that were used to write it, decoding the file as UTF-8.
data = pd.read_csv(
    'review_sub_399850.tsv',
    sep="\t",
    header=0,
    quoting=3,
    encoding='utf-8',
)

In [5]:
# Inspect the working frame: dimensions, column names and row labels.
print(data.shape)
print(data.columns)
print(data.index)
# A frame that was sampled in memory keeps the row labels of the full data
# set, so take the first rows by position with .iloc. Label-based access such
# as data.ix[:5]['text'] only lines up with positions once the frame has been
# written with to_csv(index=False) and read back.
print(data.iloc[:5])

(399850, 2)
Index([u'stars', u'text'], dtype='object')
Int64Index([4794955, 1892608, 3999307, 2154843,  761508, 3494804, 1473565,
             863959, 2619903, 4163736,
            ...
            2500699,   49075, 1876586, 2436708, 3048511, 2531670, 2027150,
            1261754, 2247496, 2453123],
           dtype='int64', length=399850)
        stars                                               text
4794955     5  I've been going there for about a year, mostly...
1892608     3  This place is fine, not great, just fine.  I l...
3999307     4  Had my best Phoenix massage since we moved to ...
2154843     1  "Don't use them for an airport shuttle, PERIOD...
761508      5  If you life Thai Food, you definitely love thi...


In [6]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
import nltk.data
# Load the English Punkt tokenizer resource; it is handed to
# Word2VecUtility.review_to_sentences when the reviews are parsed.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [7]:
# Show the text of the first ten reviews, selected by position because the
# row labels of a sampled frame do not start at 0.
print(data['text'].iloc[:10])

4794955    I've been going there for about a year, mostly...
1892608    This place is fine, not great, just fine.  I l...
3999307    Had my best Phoenix massage since we moved to ...
2154843    "Don't use them for an airport shuttle, PERIOD...
761508     If you life Thai Food, you definitely love thi...
3494804    "Not a huge fan of the flavors here ... or the...
1473565    "I had one of my best dining experiences in La...
863959     "About five years ago I took my vehicle to Aut...
2619903    "Still an amazing place to come for Vegetarian...
4163736    "This past January I ran the P.F. Chang's Half...
Name: text, dtype: object


In [8]:
# Split every review into tokenized sentences and collect them in one flat
# list (one entry per sentence), the input format used for word2vec training.
review_sents = []
print("Cleaning and parsing the reviews...\n")
# Iterate over the text column itself. This visits the reviews in the same
# order as positional row access, but avoids building a row Series for every
# review via data.iloc[i] and does not depend on the frame's row labels.
for review_text in data["text"]:
    review_sents.extend(Word2VecUtility.review_to_sentences(review_text, tokenizer))
    

  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that doc

Cleaning and parsing the reviews...



In [53]:
# Cache the parsed sentences on disk. The with-block guarantees the file is
# closed even if pickling fails part-way through.
with open('review_sents_1859888.pkl', 'wb') as out:
    pickle.dump(review_sents, out)

In [11]:
# To reuse the cached sentences instead of re-parsing, uncomment:
# review_sents = pickle.load(open('review_sents_1859888.pkl', 'rb'))

# Report how many sentences there are and show the first five as a spot check.
print(len(review_sents))
print(review_sents[:5])

1859859
[[u'i', u've', u'been', u'going', u'there', u'for', u'about', u'a', u'year', u'mostly', u'for', u'b', u'shots'], [u'the', u'people', u'are', u'great', u'and', u'very', u'accommodating'], [u'never', u'have', u'to', u'wait'], [u'the', u'new', u'location', u'is', u'not', u'as', u'convenient', u'but', u'i', u'plan', u'on', u'continuing', u'to', u'go', u'there'], [u'this', u'place', u'is', u'fine', u'not', u'great', u'just', u'fine']]


In [57]:
# Hyper-parameters for the word2vec model.
num_features = 300     # word vector dimensionality (passed as size)
min_word_count = 40    # minimum word count (passed as min_count)
num_workers = 4        # number of threads to run in parallel
context = 10           # context window size
downsampling = 1e-3    # downsample setting for frequent words

# Build and train the model in one call; expect this to take some time.
print("Training model...")
model = word2vec.Word2Vec(
    review_sents,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)


Training model...


In [58]:
# A descriptive file name for the trained model; it can be restored later
# with Word2Vec.load().
model_name = "300features_40minwords_10context"

# No further training is planned, so init_sims(replace=True) is called to
# make the model much more memory-efficient before it is written to disk.
model.init_sims(replace=True)
model.save(model_name)

In [None]:
# Restore the trained model from disk (same file name as used when saving),
# so the cells below can run without retraining.
model = word2vec.Word2Vec.load("300features_40minwords_10context")

In [71]:
# Ask the model which of these words fits least with the others.
model.doesnt_match(["man", "woman", "child", "kitchen"])

'kitchen'

In [70]:
# Same check on a group of drinks with one non-drink word mixed in.
model.doesnt_match(["coffee", "tea", "juice", "restaurant"])

'restaurant'

In [4]:
# List the words the model ranks as most similar to "delicious", each paired
# with its similarity score.
model.most_similar("delicious")

[(u'delish', 0.7754588723182678),
 (u'tasty', 0.7434055805206299),
 (u'yummy', 0.7149284482002258),
 (u'scrumptious', 0.6782612800598145),
 (u'flavorful', 0.6213445663452148),
 (u'divine', 0.6012349724769592),
 (u'tastey', 0.5901238918304443),
 (u'delicous', 0.587270200252533),
 (u'mouthwatering', 0.5798896551132202),
 (u'devine', 0.5537630319595337)]

In [5]:
# List the words the model ranks as most similar to "chinese", each paired
# with its similarity score.
model.most_similar("chinese")

[(u'mexican', 0.7112151384353638),
 (u'cantonese', 0.6801303029060364),
 (u'asian', 0.6524285078048706),
 (u'japanese', 0.6484655141830444),
 (u'americanized', 0.6477989554405212),
 (u'indian', 0.6398634910583496),
 (u'filipino', 0.6290761232376099),
 (u'vietnamese', 0.6171532869338989),
 (u'korean', 0.6015596389770508),
 (u'lebanese', 0.5581501722335815)]

In [6]:
# Show the learned vector for a single word, then the shape of the model's
# syn0 weight array.
print(model["chinese"])
print(model.syn0.shape)

[-0.09085934 -0.04050202 -0.07604051 -0.02878256 -0.03832901  0.04651292
 -0.08233552 -0.06436986 -0.006646    0.01952864 -0.10288478 -0.03497215
 -0.04002167 -0.0277764  -0.03175622  0.0141541   0.06412307  0.0514068
 -0.04425988 -0.01241343 -0.00785599 -0.0206115   0.03097875 -0.01636746
  0.12936752 -0.04187576 -0.04594978  0.0632828  -0.0185187  -0.03435634
  0.02050968 -0.00153008  0.04422459  0.08578489  0.0569248  -0.13749051
  0.07906641 -0.08986761 -0.06780145  0.03066873 -0.07235949  0.00491482
 -0.05130845 -0.03616726  0.02364809  0.00438806  0.03820136 -0.02138964
  0.01468734  0.0239164   0.06650317 -0.01117458  0.08711758  0.02350685
  0.00737275 -0.03050523  0.01972778 -0.00599776  0.00697179  0.03140137
  0.01172278 -0.00411805 -0.09804209 -0.06642748 -0.01673794  0.04739327
 -0.00381328 -0.10510307 -0.06244999 -0.03497938 -0.02515736 -0.05637315
 -0.03300777 -0.02991769 -0.00337767  0.01365327  0.03197937 -0.01513318
 -0.00577635 -0.00223164 -0.04746583  0.02690253 -0.

In [15]:
review_words = []

# Inspect the model's vocabulary list: its type, its size and the first 100
# entries.
print(type(model.index2word))
print(len(model.index2word))
print(model.index2word[:100])

# Keep the vocabulary as a set for membership tests; printing its size next
# to the list length shows whether the list contained duplicates.
index2word_set = set(model.index2word)
print(len(index2word_set))

<type 'list'>
12597
[u'the', u'and', u'i', u'a', u'to', u'was', u'it', u'of', u'is', u'for', u'in', u'my', u'this', u'we', u'that', u'they', u'you', u'with', u'but', u'on', u'have', u't', u'had', u's', u'place', u'not', u'so', u'food', u'at', u'are', u'great', u'good', u'were', u'here', u'very', u'be', u'there', u'as', u'me', u'service', u'all', u'out', u'time', u'like', u'just', u'if', u'our', u'one', u'get', u'from', u'when', u'go', u'their', u'back', u'can', u'been', u'would', u'up', u'about', u'really', u'an', u'or', u'will', u'what', u'he', u've', u'best', u'love', u'she', u'only', u'no', u'some', u'which', u'by', u'your', u'nice', u'always', u'because', u'got', u'more', u'us', u'them', u'has', u'also', u'even', u'm', u'after', u'staff', u'went', u'don', u'other', u'first', u'do', u'friendly', u'well', u'too', u'restaurant', u'never', u'vegas', u'did']
12597


In [16]:
# Tokenize the first review (selected by position) and report, word by word,
# whether the word is part of the model vocabulary.
words = Word2VecUtility.review_to_wordlist(data['text'].iloc[0])
print(words)
for word in words:
    print(word in index2word_set)

[u'i', u've', u'been', u'going', u'there', u'for', u'about', u'a', u'year', u'mostly', u'for', u'b', u'shots', u'the', u'people', u'are', u'great', u'and', u'very', u'accommodating', u'never', u'have', u'to', u'wait', u'the', u'new', u'location', u'is', u'not', u'as', u'convenient', u'but', u'i', u'plan', u'on', u'continuing', u'to', u'go', u'there']
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [8]:
# Turn the star ratings into binary labels: ratings of 3 or lower become 0,
# ratings above 3 become 1. np.array() makes a copy, so `data` is not modified.
clean_labels = np.array(data["stars"])
print('%s %s' % (clean_labels[:10], clean_labels.shape))

# Both masks are computed from the raw ratings before anything is overwritten.
low_rating = clean_labels <= 3
high_rating = clean_labels > 3
clean_labels[low_rating] = 0
clean_labels[high_rating] = 1
print(clean_labels[:10])

# Number of positive reviews.
print((clean_labels == 1).sum())

[5 3 4 1 5 2 5 4 4 4] (399850,)
[1 0 1 0 1 0 1 1 1 1]
267983
