In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#######################
# Notebook metadata: version string, date stamp and last editor.
__version__ = "1.0"
__date__ = "2016-04-19"
__modified_by__ = "Aditya Tanikanti"
####################################

In [2]:
import os

import numpy as np
import pandas as pd
# NOTE: keep the star import *before* the scipy import below so that any
# vstack/hstack names exported by utils cannot shadow the scipy.sparse ones.
from utils import *
from scipy.sparse import csr_matrix, vstack, hstack

In [3]:
# --- Notebook configuration -------------------------------------------------
SEED_VAL = 200
WORK_DIR = os.getcwd()

# Suffix selecting which data subset's files are read and written.
data_subset = "_0_1Percent"

# Input / output locations, all resolved under <WORK_DIR>/data.
YELP_DATA_CSV_DIR = os.path.join(WORK_DIR, "data", "csv")
YELP_DATA_SPARSE_MATRIX_DIR = os.path.join(WORK_DIR, "data", "sparse_matrix")
YELP_DATA_FEAT_ADD = os.path.join(WORK_DIR, "data", "pos_neg")
YELP_DATA_SPARSE_MATRIX_DIR_FEAT_ADD = os.path.join(
    WORK_DIR, "data", "sparse_matrix_feat_add"
)

In [4]:
# Prepare the output directories up front. The augmented bag-of-words matrix
# is saved under YELP_DATA_SPARSE_MATRIX_DIR_FEAT_ADD later in this notebook,
# so that directory is ensured here as well as YELP_DATA_SPARSE_MATRIX_DIR.
# (make_sure_path_exists comes from utils; presumably it creates the directory
# when it is missing -- confirm against utils.)
for output_dir in (YELP_DATA_SPARSE_MATRIX_DIR, YELP_DATA_SPARSE_MATRIX_DIR_FEAT_ADD):
    make_sure_path_exists(output_dir)

In [5]:
# Load the business/review/user CSV for the configured data subset.
read_filename = os.path.join(
    YELP_DATA_CSV_DIR,
    'business_review_user%s.csv' % data_subset,
)
df_data = pd.read_csv(read_filename, encoding='utf-8', engine='c')

In [6]:
# Work on a copy so that df_data keeps the raw review text.
df_data_preprocessed_review = df_data.copy();
# Replace every review with the result of the preprocessing helper (imported
# via utils; going by its name it lowercases, strips punctuation/numbers,
# removes stopwords, stems and re-joins the tokens -- confirm in utils).
# %time is an IPython line magic that reports how long the statement took.
%time df_data_preprocessed_review['review_text'] = df_data_preprocessed_review['review_text'].apply(lowercase_remove_punctuation_and_numbers_and_tokenize_and_filter_more_stopwords_and_stem_and_restring)

Wall time: 3.13 s


In [7]:
# Raw text of the review with index label 1, for comparison with the
# preprocessed version.
df_data["review_text"][1]

u"I've been incredibly weary when it comes to mechanics as I've had some bad experiences, but I'm convinced that Tony and his team at Oasis are one of the best in town.\r\n\r\nI've been taking my Jeep Liberty into them since I bought it in 2010 after being referred by a friend, and they have consistently gone above and beyond when it comes to customer service, fair pricing, and clear communication about the vehicle's problem and options for getting it fixed. There have been one or two occasions where I've even called them, explained the problem, and they were able to diagnose the problem over the phone and - because the issue was so minor - walk me through how to take care of it on my own.\r\n\r\nGreat company that I completely trust with my vehicle. I would most definitely recommend Oasis to anyone who needs a good mechanic."

In [8]:
# The same review (index label 1) after preprocessing.
df_data_preprocessed_review["review_text"][1]

u'ive incred weari come mechan ive bad experi im convinc toni team oasi one best town ive take jeep liberti sinc bought refer friend consist gone beyond come custom servic fair price clear commun vehicl problem option get fix one two occas ive even call explain problem abl diagnos problem phone issu minor walk take care great compani complet trust vehicl would definit recommend oasi anyon need good mechan'

In [9]:
# Add placeholder (empty-string) columns for the sentiment features.
for column_name in ('pos_words', 'neg_words', 'net_senti', 'review_senti'):
    df_data[column_name] = ''

In [10]:
# Locations of the positive / negative word lists inside YELP_DATA_FEAT_ADD.
POS_WORDS_FILE, NEG_WORDS_FILE = (
    os.path.join(YELP_DATA_FEAT_ADD, lexicon_name)
    for lexicon_name in ('positive-words.txt', 'negative-words.txt')
)

In [11]:
# Read both word lists. Each file is split on whitespace and the first 213
# tokens are discarded (presumably the files' header text -- confirm against
# the lexicon files before changing this offset).
with open(POS_WORDS_FILE) as pos_file:
    pos_words1 = pos_file.read().split()[213:]

with open(NEG_WORDS_FILE) as neg_file:
    neg_words1 = neg_file.read().split()[213:]

# Combined vocabulary: positive words followed by negative words.
all_words_with_sentiment = pos_words1 + neg_words1

# Single-argument parenthesised prints behave the same as the print statement.
print("First 5 positive words %s " % pos_words1[:5])
print("First 5 negative words %s" % neg_words1[:5])
print("Number of positive words %d" % len(pos_words1))
print("Number of negative words %d" % len(neg_words1))
print("Total number of words %d" % len(all_words_with_sentiment))

First 5 positive words ['a+', 'abound', 'abounds', 'abundance', 'abundant'] 
First 5 negative words ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable']
Number of positive words 2006
Number of negative words 4783
Total number of words 6789


In [12]:
def lowercase_remove_punctuation_and_numbers_and_tokenize_and_filter_more_stopwords(s):
    """Clean one review string and split it into word tokens.

    The text is passed, in order, through ``remove_numbers_in_string``,
    ``lowercase_remove_punctuation`` and ``remove_stopwords`` and is then
    tokenized with ``nltk.word_tokenize``. Despite the function name, no
    additional stopword filtering and no stemming is applied.

    Returns:
        A one-element list whose single item is the list of tokens, so a
        caller can ``+=`` the result onto a list of per-review token lists.
    """
    cleaned_text = remove_stopwords(
        lowercase_remove_punctuation(remove_numbers_in_string(s))
    )
    return [nltk.word_tokenize(cleaned_text)]

In [13]:
print("Parsing sentences from training set")

# One token list per review: the tokenizer returns a list wrapping a single
# token list, so flattening its result keeps exactly one entry per review.
all_reviews = [
    tokens
    for review in df_data["review_text"]
    for tokens in lowercase_remove_punctuation_and_numbers_and_tokenize_and_filter_more_stopwords(review)
]

print(len(all_reviews))
print(len(df_data))
print(all_reviews[0])

Parsing sentences from training set
2225
2225
[u'place', u'great', u'allows', u'eat', u'like', u'much', u'little', u'would', u'like', u'love', u'place', u'frequent', u'often', u'great', u'large', u'parties']


In [14]:
# Lexicon-based sentiment features, one value per review:
#   pos_words    - number of tokens found in the positive word list
#   neg_words    - number of tokens found in the negative word list
#   net_senti    - pos_words minus neg_words
#   review_senti - sign of net_senti: 1 positive, -1 negative, 0 neutral

# Sets give constant-time membership tests; testing against the word lists
# directly scans the whole lexicon for every token of every review.
pos_lookup = frozenset(pos_words1)
neg_lookup = frozenset(neg_words1)

# all_reviews must line up one-to-one with the rows of df_data.
if len(all_reviews) != len(df_data):
    raise ValueError("all_reviews has %d entries but df_data has %d rows"
                     % (len(all_reviews), len(df_data)))

pos_counts = [sum(token in pos_lookup for token in sentence) for sentence in all_reviews]
neg_counts = [sum(token in neg_lookup for token in sentence) for sentence in all_reviews]
net_scores = [n_pos - n_neg for n_pos, n_neg in zip(pos_counts, neg_counts)]

# Whole-column assignment from a list is positional, so it does not depend on
# df_data having a 0..n-1 index, and it avoids the deprecated
# DataFrame.set_value. The columns end up with an integer dtype.
df_data['pos_words'] = pos_counts
df_data['neg_words'] = neg_counts
df_data['net_senti'] = net_scores
# (score > 0) - (score < 0) evaluates to 1, -1 or 0, i.e. the sign of score.
df_data['review_senti'] = [(score > 0) - (score < 0) for score in net_scores]



In [19]:
# Preview the first ten rows only; displaying the whole frame is unnecessary
# for a sanity check of the new sentiment columns against review_stars.
df_data[['review_text','pos_words','neg_words','net_senti','review_senti','review_stars']].head(10)

Unnamed: 0,review_text,pos_words,neg_words,net_senti,review_senti,review_stars
0,This place is great and allows you to eat what...,5,0,5,1,5
1,I've been incredibly weary when it comes to me...,12,6,6,1,5
2,My new favorite place... the food was so good ...,4,0,4,1,5
3,Pros:\r\n-extensive wine list\r\n-great presen...,6,6,0,0,3
4,If you're looking for yoga that makes you swea...,5,0,5,1,5
5,wow!!!,1,0,1,1,5
6,"I'm becoming a bit of a regular here, the serv...",1,0,1,1,4
7,The Divine Eatery is delicious! All the food i...,7,0,7,1,4
8,The employees at this location are the losers ...,1,1,0,0,1
9,Excellent service is routine - easy to reach t...,3,0,3,1,5


In [16]:
# Bag-of-words vectorizer: word-level analysis, unigrams only
# (ngram_range (1, 1)), no custom tokenizer/preprocessor, and
# strip_accents set to 'unicode'.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    ngram_range=(1, 1),
    strip_accents='unicode',
)

In [17]:
# Fit the vocabulary on the preprocessed review text and build the
# document-term count matrix in one step.
feature_matrix = vectorizer.fit_transform(df_data_preprocessed_review['review_text'])

In [18]:
# Show the sparse matrix summary (shape, dtype, stored elements) for the
# bag-of-words features alone.
feature_matrix

<2225x10692 sparse matrix of type '<type 'numpy.int64'>'
	with 107982 stored elements in Compressed Sparse Row format>

In [19]:
# Convert each sentiment column of df_data into a sparse integer matrix and
# transpose it, so that it can be appended to feature_matrix column-wise.
# a: pos_words, b: neg_words, c: net_senti, d: review_senti.
a, b, c, d = [
    csr_matrix(df_data[feature_name], dtype=int).transpose()
    for feature_name in ('pos_words', 'neg_words', 'net_senti', 'review_senti')
]


In [20]:
# Append the four sentiment columns (a, b, c, d -- in that order) to the right
# of the bag-of-words matrix and store the result as CSR.
# NOTE: this rebinds feature_matrix, so re-running the cell appends the
# columns again; re-run the fit_transform cell first.
feature_matrix = csr_matrix(hstack((feature_matrix, a, b, c, d)))

In [21]:
# Show the sparse matrix summary again, now including the appended
# sentiment columns.
feature_matrix

<2225x10696 sparse matrix of type '<type 'numpy.int64'>'
	with 115834 stored elements in Compressed Sparse Row format>

In [22]:
# Persist the augmented feature matrix; the file name carries the data-subset
# suffix. The saved file is later read back with a ".npz" extension appended
# to this path.
spare_matrix_file = os.path.join(
    YELP_DATA_SPARSE_MATRIX_DIR_FEAT_ADD,
    "bagWords_feat_add%s" % data_subset,
)
save_sparse_csr(spare_matrix_file, feature_matrix)

In [23]:
# Read the matrix back from disk so the round trip can be verified.
test = load_sparse_csr("%s.npz" % spare_matrix_file)

In [24]:
# Round-trip check: compare each CSR component (and the shape) of the
# in-memory matrix with the copy loaded from disk; prints one True/False per
# component.
for csr_field in ("data", "indices", "indptr", "shape"):
    print(np.array_equal(getattr(feature_matrix, csr_field), getattr(test, csr_field)))

True
True
True
True
