In [30]:
import pandas as pd
from gensim.parsing import preprocessing
from gensim.summarization import textcleaner
from gensim.models import word2vec
from sklearn.preprocessing import normalize
import numpy as np

### Applying CBOW word2vec to text reviews 

#### Reading in the reviews

In [31]:
# Load the review table from a CSV in the current working directory.
# The cells below read its 'business_id' and 'text' columns.
yelp = pd.read_csv('yelp_reviews_business_pre_vec_3_4_2016.csv')

In [33]:
# Drop the first column by position.
# NOTE(review): presumably this is the index column written by an earlier
# to_csv call -- confirm against the CSV header.
# Not idempotent: re-running this cell removes one more column each time.
yelp = yelp.iloc[:, 1:]

#### Making sure all reviews are strings

In [35]:
# Coerce every review to str so the later concatenation and tokenizing steps
# never meet a non-string value. Missing values (NaN/None) become the empty
# string instead of the literal text 'nan'/'None', which would otherwise enter
# the corpus as if a reviewer had written it.
yelp_txt_str = yelp['text'].fillna('').astype(str).tolist()

In [37]:
# Overwrite the 'text' column with the string-coerced reviews (same row order).
yelp['text'] = yelp_txt_str

In [34]:
# Empty placeholder only: yelp_join is reassigned from `yelp` in the
# "Creating a minimal DataFrame" cell, so nothing reads this empty frame.
yelp_join = pd.DataFrame()

#### Creating a minimal DataFrame

In [36]:
# Keep only the grouping key and the review text.
yelp_join = yelp[['business_id', 'text']]

In [38]:
# Move business_id into the index, leaving 'text' as the only data column.
yelp_join = yelp_join.set_index('business_id')

In [39]:
# Collapse to one row per business by concatenating all of its reviews.
# The reviews are joined with a single space: summing the strings directly
# would glue the last word of one review to the first word of the next
# ("...great place" + "Terrible..." -> "placeTerrible"), corrupting both the
# sentence splitting and the tokens fed to word2vec.
# The result is a DataFrame indexed by business_id with a single 'text' column.
yelp_join = (
    yelp_join
    .groupby(level='business_id')['text']
    .agg(' '.join)
    .to_frame()
)

In [40]:
def make_reviews_list(df):
    """Return the values of ``df['text']`` as a plain Python list.

    Parameters
    ----------
    df : mapping-like
        Anything that supports ``df['text']`` and whose result can be
        iterated (for example a pandas DataFrame with a 'text' column).

    Returns
    -------
    list
        The items of ``df['text']`` in iteration order.
    """
    return [review for review in df['text']]

In [41]:
# One list entry per row of yelp_join, in row order; the vectors built later
# are assigned back to yelp_join by position, so this order matters.
reviews_list = make_reviews_list(yelp_join)

#### Generating tokens, collected into a list of documents and a list of sentences

In [None]:
# Two parallel corpora are built in a single pass over reviews_list:
#   docs          -- one flat token list per entry of reviews_list
#   all_sentences -- one token list per sentence, across all entries
docs = []
all_sentences = []
for review_idx, review_text in enumerate(reviews_list):
    review_sentences = textcleaner.split_sentences(review_text)
    # Print a progress marker every 1000 entries.
    if review_idx % 1000 == 0:
        print(review_idx)
    tokenized = [
        preprocessing.preprocess_string(sentence)
        for sentence in review_sentences
    ]
    all_sentences.extend(tokenized)
    # Flatten the per-sentence token lists into one document-level list.
    docs.append([
        token
        for sentence_tokens in tokenized
        for token in sentence_tokens
    ])

#### Training the CBOW word2vec model to generate word vectors

In [43]:
# Train word2vec on the sentence-level token lists.
# NOTE(review): the `size` keyword matches the gensim 3.x API that this
# notebook's imports target -- confirm before upgrading gensim.
cbow_model = word2vec.Word2Vec(
    all_sentences,     # Training corpus: one token list per sentence.
    min_count=15,   # Minimum word count threshold.
    window=5,      # Number of words around target word to consider.
    sg=0,          # cbow
    sample=1e-3 ,  # Downsampling threshold for frequent words (per gensim docs).
    size=500,      # Length of each word vector.
    hs=1           # Enable hierarchical softmax (per gensim docs).
)

#### Combining word vectors into one normalized vector per document

In [44]:
def create_vector_list(array_shape, X, model=None):
    """Build one normalized vector per review by summing its word vectors.

    Parameters
    ----------
    array_shape : numpy.ndarray
        Template array. Despite the name this is an array, not a shape tuple:
        each review's accumulator is ``np.zeros_like(array_shape)``, so the
        template fixes the length and dtype of the returned vectors.
    X : iterable of iterables
        Reviews, each an iterable of tokens. A token may be a plain string
        or an object exposing a ``lemma_`` attribute, in which case the
        lemma is used as the lookup key.
    model : optional
        Object whose ``wv`` attribute supports ``token in wv`` and
        ``wv[token]``. Defaults to the notebook-level ``cbow_model``.

    Returns
    -------
    list of numpy.ndarray
        One flattened, ``normalize``-d vector per review, in input order.
        Tokens missing from ``model.wv`` contribute nothing to the sum.
    """
    if model is None:
        model = cbow_model
    word_vectors = model.wv

    vector_list = []
    for review in X:
        review_vector = np.zeros_like(array_shape)
        for word in review:
            # Plain-string tokens have no ``lemma_``; fall back to the token
            # itself instead of raising AttributeError.
            word = getattr(word, 'lemma_', word)
            if word in word_vectors:
                review_vector += word_vectors[word]
        # normalize() expects a 2-D array, so use a single-row matrix and
        # flatten the result back to 1-D.
        norm = normalize(review_vector.reshape(1, -1))
        vector_list.append(norm.flatten())
    return vector_list

In [45]:
# Look up one known vector to use as the shape/dtype template and report
# its length.
x = cbow_model.wv['burger']
print(len(x))

# X_w2v and vector_list are two names for the same list object, so every
# vector appended below is visible through either name.
X_w2v = vector_list = []
for review in docs:
    review_vector = np.zeros_like(x)
    for word in review:
        # Tokens the model does not know contribute nothing to the sum.
        if word not in cbow_model.wv:
            continue
        review_vector += cbow_model.wv[word]
    # normalize() expects a 2-D array: use a single row, then flatten to 1-D.
    vector_list.append(normalize(review_vector.reshape(1, -1)).flatten())

500


#### Generating files to merge with the business data

In [46]:
# Attach each document vector to its row. The assignment is positional, so
# it relies on X_w2v being in the same order as the rows of yelp_join.
yelp_join['w2v'] = X_w2v

In [47]:
# Save the per-business text together with the 'w2v' column.
# NOTE(review): each 'w2v' cell holds a NumPy array, so it is presumably
# written as the array's string representation -- confirm before relying on
# reading the vectors back from this file.
yelp_join.to_csv('w2v_after_merge_3_5.csv')

In [48]:
# Expand the vectors into a frame: one row per entry of X_w2v and one
# column per vector component.
df = pd.DataFrame(X_w2v)

In [50]:
# Drop the array-valued 'w2v' column. The axis is passed by keyword because
# the positional form drop(labels, 1) raises TypeError on pandas >= 2.0.
yelp_join = yelp_join.drop('w2v', axis=1)

In [51]:
# Turn business_id back into an ordinary column; the index becomes the
# default 0..n-1 integer range.
yelp_join = yelp_join.reset_index()

In [53]:
# Join the vector components onto the business rows by index. Both frames
# carry a default integer index at this point, so rows pair up by position.
yelp_join = pd.merge(yelp_join, df, right_index=True, left_index=True, how='inner')

In [54]:
# Drop the raw review text so only business_id and the vector components
# remain. The axis is passed by keyword because the positional form
# drop(labels, 1) raises TypeError on pandas >= 2.0.
yelp_join = yelp_join.drop('text', axis=1)

In [None]:
# Write the current yelp_join frame to CSV (to_csv also writes the index by default).
yelp_join.to_csv('w2v_after_merge_3_5_2.csv')