In [1]:
import turicreate as tc

# Data Preparation

In [2]:
# Load the product-review table from the on-disk SFrame.
products = tc.SFrame(data='data/amazon_baby.sframe')

## Create sentiment column

In [3]:
# Exclude 3-star reviews from the data set.
is_not_three_star = products['rating'] != 3
products = products[is_not_three_star]

# Positive sentiment = rating of 4 stars or more.
products['sentiment'] = products['rating'] >= 4

In [6]:
# Build a per-review word -> count dictionary from the review text.
review_text = products['review']
products['word_count'] = tc.text_analytics.count_words(review_text)

# Baseline model `sentiment_model`

In [9]:
# train_data / test_data are otherwise only created further down the notebook
# (in the "Train-test split" section), so on a fresh kernel this cell would raise
# a NameError when the notebook is run top to bottom. Create the split here using
# the same fraction and seed as that later cell.
# NOTE(review): with an identical seed both splits are expected to pick the same
# rows -- confirm against the SFrame.random_split documentation.
train_data, test_data = products.random_split(0.8, seed=0)

# Baseline classifier: predicts sentiment from the full word-count dictionary.
sentiment_model = tc.logistic_classifier.create(train_data,
                                                target='sentiment',
                                                features=['word_count'],
                                                validation_set=test_data)

In [10]:
# Score every review with the baseline model and keep the probability output.
predicted_probabilities = sentiment_model.predict(products, output_type='probability')
products['predicted_sentiment'] = predicted_probabilities

# `selected_words_model`

In [11]:
# Hand-picked words used as the only features of selected_words_model.
selected_words = [
    'awesome', 'great', 'fantastic', 'amazing', 'love',
    'horrible', 'bad', 'terrible', 'awful',
    'wow', 'hate',
]

In [12]:
def count_word(word_count_dict, word):
    """Return the count stored for `word` in `word_count_dict`.

    Args:
        word_count_dict: Mapping from a word to its count. May be None or
            empty, in which case every word is reported as absent.
        word: The word to look up.

    Returns:
        The value stored under `word`, or 0 when the word is not present.
    """
    if not word_count_dict:
        # Missing or empty dictionary: nothing was counted, and `in` on None
        # would raise a TypeError.
        return 0
    return word_count_dict.get(word, 0)

In [13]:
# Add one column per selected word holding that word's count in each review.
for selected_word in selected_words:
    # - `word=selected_word` binds the current word at lambda creation time, so the
    #   function does not depend on the loop variable's later value.
    # - `dtype=float` pins the column type. count_word falls back to the int 0,
    #   so without an explicit dtype the inferred type differs between words
    #   (the head() output shows a mix of `0` and `0.0` columns).
    products[selected_word] = products['word_count'].apply(
        lambda d, word=selected_word: count_word(d, word),
        dtype=float,
    )

In [15]:
# Preview the first two rows, including the new per-word count columns.
products.head(n=2)

name,review,rating,sentiment,word_count
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0,1,"{'recommend': 1.0, 'disappointed': 1.0, ..."
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5.0,1,"{'quilt': 1.0, 'the': 1.0, 'than': 1.0, 'fu ..."

predicted_sentiment,awesome,great,fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate
0.9997307390047108,0.0,0.0,0.0,0.0,1.0,0,0,0.0,0,0,0
0.9985083368316828,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0,0


In [21]:
# Total of each selected word's count column, in the same order as selected_words.
selected_word_counts = []
for selected_word in selected_words:
    column_total = products[selected_word].sum()
    selected_word_counts.append(column_total)

In [24]:
# Compute the extremes once; they do not change while looping.
highest_count = max(selected_word_counts)
lowest_count = min(selected_word_counts)

# selected_word_counts is aligned with selected_words, so pair them directly.
# Every word tied for the maximum / minimum is reported.
for word, total in zip(selected_words, selected_word_counts):
    if total == highest_count:
        print("Most used: " + word)
    if total == lowest_count:
        print("Least used: " + word)

Most used: great
Least used: wow


## Train-test split

In [18]:
# Random 80/20 split with a fixed seed so the partition is reproducible.
train_data, test_data = products.random_split(fraction=0.8, seed=0)

## Training `selected_words_model`

In [20]:
# Logistic classifier restricted to the per-word count columns as features.
selected_words_model = tc.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=selected_words,
    validation_set=test_data,
)

In [27]:
# List the learned coefficients ordered from the most negative to the most positive.
coefficients_by_value = selected_words_model.coefficients.sort('value', ascending=True)
coefficients_by_value.print_rows(num_rows=12)

+-------------+-------+-------+-----------------------+----------------------+
|     name    | index | class |         value         |        stderr        |
+-------------+-------+-------+-----------------------+----------------------+
|   horrible  |  None |   1   |   -2.251335236759093  | 0.08020249388788442  |
|   terrible  |  None |   1   |   -2.223661436085127  | 0.07731736203785755  |
|    awful    |  None |   1   |  -2.0529082040313513  | 0.10099735435259259  |
|     hate    |  None |   1   |  -1.3484407222463124  | 0.07715698604297333  |
|     bad     |  None |   1   |  -0.9914778800650565  | 0.03848428664699063  |
|     wow     |  None |   1   | -0.009538236067678897 |  0.1604641122471166  |
|    great    |  None |   1   |   0.8630655001196618  | 0.018955052444377323 |
|  fantastic  |  None |   1   |   0.8858047568814295  | 0.11167591293399656  |
|   amazing   |  None |   1   |   1.1000933113660283  | 0.09954776260465983  |
|   awesome   |  None |   1   |   1.133534666034145 

In [29]:
# Accuracy of selected_words_model on the test split.
selected_words_metrics = selected_words_model.evaluate(test_data)
selected_words_metrics['accuracy']

0.8463848186404036

In [30]:
# Accuracy of the baseline sentiment_model on the test split, for comparison.
sentiment_metrics = sentiment_model.evaluate(test_data)
sentiment_metrics['accuracy']

0.9176975738650012

In [33]:
# Fraction of test reviews labelled positive (sentiment == 1).
positive_test_reviews = test_data[test_data['sentiment'] == 1]
len(positive_test_reviews) / len(test_data)

0.8400192169108815

# Evaluate performance on `diaper_champ_reviews`

In [40]:
# Reviews of the 'Baby Trend Diaper Champ' product, highest predicted_sentiment first.
is_diaper_champ = products['name'] == 'Baby Trend Diaper Champ'
diaper_champ_reviews = products[is_diaper_champ].sort('predicted_sentiment', ascending=False)

In [42]:
# Show the review with the highest predicted_sentiment.
diaper_champ_reviews.head(n=1)

name,review,rating,sentiment,word_count
Baby Trend Diaper Champ,I read a review below that can explain exactly ...,4.0,1,"{'key': 1.0, 'have': 1.0, 'pieces': 1.0, 'betwe ..."

predicted_sentiment,awesome,great,fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate
0.999999999989594,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0,0


In [45]:
# Probability that selected_words_model assigns to the top-ranked review.
top_review = diaper_champ_reviews[0]
selected_words_model.predict(top_review, output_type='probability')

dtype: float
Rows: 1
[0.7919288370624453]