# !preprocessing

In [99]:
import pandas as pd
import numpy as np
import os
import sklearn.pipeline
import sklearn.tree
import sklearn.ensemble

from sklearn.feature_extraction.text import CountVectorizer

In [43]:
def tokenize_text(raw_text):
    ''' Transform a plain-text string into a list of tokens
    
    We assume that *whitespace* divides tokens.
    
    Args
    ----
    raw_text : string
    
    Returns
    -------
    list_of_tokens : list of strings
        Each element is one token in the provided text
    '''
    list_of_tokens = raw_text.split() # split method divides on whitespace by default
    for pp in range(len(list_of_tokens)):
        cur_token = list_of_tokens[pp]
        # Remove punctuation
        for punc in ['?', '!', '_', '.', ',', '"', '/']:
            cur_token = cur_token.replace(punc, "")
        # Turn to lower case
        clean_token = cur_token.lower()
        # Replace the cleaned token into the original list
        list_of_tokens[pp] = clean_token
    return list_of_tokens

In [44]:
def transform_text_into_feature_vector(text, vocab_dict):
    ''' Produce count feature vector for provided text
    
    Args
    ----
    text : string
        A string of raw text, representing a single 'review'
    vocab_dict : dict with string keys
        If token is in vocabulary, will exist as key in the dict
        If token is not in vocabulary, will not be in the dict

    Returns
    -------
    count_V : 1D numpy array, shape (V,) = (n_vocab,)
        Count vector, indicating how often each vocab word
        appears in the provided text string
    '''
    V = len(vocab_dict.keys())
    count_V = np.zeros(V)
    for tok in tokenize_text(text):
        if tok in vocab_dict:
            vv = vocab_dict[tok]
            count_V[vv] += 1
    return count_V

In [45]:
#----------------
#READ IN THE DATA
#----------------
data_dir = 'data_reviews'
x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))
x_test_df = pd.read_csv(os.path.join(data_dir, 'x_test.csv'))

#x df has column for review and column for source website

N, n_cols = x_train_df.shape
print("Shape of x_train_df: (%d, %d)" % (N,n_cols))
print("Shape of y_train_df: %s" % str(y_train_df.shape))
print(y_train_df)

Shape of x_train_df: (2400, 2)
Shape of y_train_df: (2400, 1)
      is_positive_sentiment
0                         0
1                         0
2                         0
3                         0
4                         0
...                     ...
2395                      1
2396                      1
2397                      1
2398                      1
2399                      1

[2400 rows x 1 columns]


In [46]:
# Print out the first five rows and last five rows
tr_text_list = x_train_df['text'].values.tolist()
rows = np.arange(0, 5)
for row_id in rows:
    text = tr_text_list[row_id]
    print("row %5d | y = %d | %s" % (row_id, y_train_df.values[row_id], text))

print("...")
rows = np.arange(N - 5, N)
for row_id in rows:
    text = tr_text_list[row_id]
    print("row %5d | y = %d | %s" % (row_id, y_train_df.values[row_id], text))

row     0 | y = 0 | Oh and I forgot to also mention the weird color effect it has on your phone.
row     1 | y = 0 | THAT one didn't work either.
row     2 | y = 0 | Waste of 13 bucks.
row     3 | y = 0 | Product is useless, since it does not have enough charging current to charge the 2 cellphones I was planning to use it with.
row     4 | y = 0 | None of the three sizes they sent with the headset would stay in my ears.
...
row  2395 | y = 1 | The sweet potato fries were very good and seasoned well.
row  2396 | y = 1 | I could eat their bruschetta all day it is devine.
row  2397 | y = 1 | Ambience is perfect.
row  2398 | y = 1 | We ordered the duck rare and it was pink and tender on the inside with a nice char on the outside.
row  2399 | y = 1 | Service was good and the company was better!


In [47]:
#----------------
#DATA CLEANUP
#----------------

tok_count_dict = dict()

for line in tr_text_list:
    tok_list = tokenize_text(line)
    for tok in tok_list:
        if tok in tok_count_dict:
            tok_count_dict[tok] += 1
        else:
            tok_count_dict[tok] = 1
#tok_count_dict holds words paired with # occurences

In [48]:
#print top 10 as sanity check
sorted_tokens = list(sorted(tok_count_dict, key=tok_count_dict.get, reverse=True))
for w in sorted_tokens[:30]:
    print("%5d %s" % (tok_count_dict[w], w))

 1560 the
  916 and
  707 a
  700 i
  609 is
  542 to
  534 it
  493 of
  493 this
  447 was
  328 in
  257 for
  244 not
  231 that
  212 with
  202 very
  201 my
  183 good
  176 on
  163 you
  162 great
  158 but
  147 have
  143 are
  141 so
  140 movie
  137 phone
  136 as
  119 film
  115 be


In [82]:
#select words with at least 4 occurrences and less than 300 occurrences
vocab_list = [w for w in sorted_tokens if (tok_count_dict[w] >= 10) and (tok_count_dict[w] <= 300)]

In [83]:
#pair tokens with unique id for vectorizing
vocab_dict = dict()
for vocab_id, tok in enumerate(vocab_list):
    vocab_dict[tok] = vocab_id

x_tr_NV = np.zeros((len(tr_text_list), len(vocab_list)))
for nn, raw_text_line in enumerate(tr_text_list):
    x_tr_NV[nn] = transform_text_into_feature_vector(raw_text_line, vocab_dict)
print(x_tr_NV.shape)

(2400, 381)


In [84]:
#model setup
tree = sklearn.tree.DecisionTreeClassifier(
    criterion='gini', min_samples_split=2, min_samples_leaf=1)
hyperparameter_grid_by_name = dict(
    max_depth=[32, 128, 256],
    min_samples_leaf=[1, 3, 9],
    )

In [85]:
#model definition
grid = sklearn.model_selection.GridSearchCV(
    tree,
    hyperparameter_grid_by_name,
    scoring='balanced_accuracy',
    cv=7,
    return_train_score=True)

In [86]:
grid.fit(x_tr_NV, y_train_df['is_positive_sentiment'])

GridSearchCV(cv=7, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [32, 128, 256],
                         'min_samples_leaf': [1, 3, 9]},
             return_train_score=True, scoring='balanced_accuracy')

In [87]:
tree_search_results_df = pd.DataFrame(grid.cv_results_).copy()
print(grid.best_params_)
print(grid.best_score_)

{'max_depth': 256, 'min_samples_leaf': 3}
0.7404412193273882


In [55]:
#BIGRAMS
two_gram_preprocessor = CountVectorizer(binary=False, ngram_range=(2,2), min_df=3, max_df=0.75)

In [56]:
two_gram_preprocessor.fit(tr_text_list)

CountVectorizer(max_df=0.75, min_df=3, ngram_range=(2, 2))

In [57]:
len(two_gram_preprocessor.vocabulary_)

1156

In [58]:
for term, count in two_gram_preprocessor.vocabulary_.items():
    print("%4d %s" % (count, term))

 486 it has
 654 on your
1153 your phone
 782 that one
 215 didn work
1054 waste of
 705 product is
 750 since it
 479 it does
 220 does not
 605 not have
 968 to use
 506 it with
 597 none of
 635 of the
 880 the three
1105 with the
 831 the headset
 402 in my
 578 my ears
 206 customer service
 455 is still
 667 out and
 847 the only
1003 very disappointing
 906 thing was
 896 there was
1039 was no
1002 very disappointed
 869 the service
 743 service was
1053 was very
1000 very bad
 658 only thing
 905 thing that
 460 is the
 351 had to
 481 it feels
 216 difficult to
  75 and the
 171 buttons are
 105 are so
 781 that it
 489 it is
 548 make the
 863 the same
 723 same mistake
 759 sound and
 218 do not
 598 not be
 639 of time
 217 disappointed with
1101 with my
 527 like the
 265 felt like
 526 like it
 509 it would
 157 bluetooth headset
 970 to wear
1062 we have
  78 and they
 706 put on
 922 this product
 227 down the
1127 would not
 612 not recommend
 720 recommend this
 917 t

In [88]:
pipeline = sklearn.pipeline.Pipeline([
    ('my_bow_feature_extractor', CountVectorizer(binary=False, ngram_range=(2,2), min_df=20, max_df=0.75)),
    ('my_classifier', tree),
])

In [89]:
tree = sklearn.tree.DecisionTreeClassifier(
    criterion='gini', min_samples_split=2, min_samples_leaf=1)
hyperparameter_grid_by_name = dict(
    my_classifier__max_depth=[32, 128, 256],
    my_classifier__min_samples_leaf=[1, 3, 9],
    )

In [90]:
#model definition
grid2 = sklearn.model_selection.GridSearchCV(
    pipeline,
    hyperparameter_grid_by_name,
    scoring='balanced_accuracy',
    cv=7,
    return_train_score=True)

In [91]:
grid2.fit(tr_text_list, y_train_df['is_positive_sentiment'])

GridSearchCV(cv=7,
             estimator=Pipeline(steps=[('my_bow_feature_extractor',
                                        CountVectorizer(max_df=0.75, min_df=20,
                                                        ngram_range=(2, 2))),
                                       ('my_classifier',
                                        DecisionTreeClassifier())]),
             param_grid={'my_classifier__max_depth': [32, 128, 256],
                         'my_classifier__min_samples_leaf': [1, 3, 9]},
             return_train_score=True, scoring='balanced_accuracy')

In [92]:
tree_search_results_df = pd.DataFrame(grid2.cv_results_).copy()
print(grid2.best_params_)
print(grid2.best_score_)

{'my_classifier__max_depth': 256, 'my_classifier__min_samples_leaf': 3}
0.5508320219152533


In [93]:
lasso = sklearn.linear_model.LogisticRegression(
    penalty='l1', solver='saga', random_state=101)

In [94]:
lasso_hyperparameter_grid_by_name = dict(
    C=np.logspace(-4, 4, 9),
    max_iter=[20, 40], # sneaky way to do "early stopping" 
                       # we'll take either iter 20 or iter 40 in training process, by best valid performance
    )

In [95]:
lasso_searcher = sklearn.model_selection.GridSearchCV(
    lasso,
    lasso_hyperparameter_grid_by_name,
    scoring='balanced_accuracy',
    cv=7,
    return_train_score=True,
    refit=False)

In [96]:
lasso_searcher.fit(x_tr_NV, y_train_df['is_positive_sentiment'])







GridSearchCV(cv=7,
             estimator=LogisticRegression(penalty='l1', random_state=101,
                                          solver='saga'),
             param_grid={'C': array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03,
       1.e+04]),
                         'max_iter': [20, 40]},
             refit=False, return_train_score=True, scoring='balanced_accuracy')

In [97]:
lasso_search_results_df = pd.DataFrame(lasso_searcher.cv_results_).copy()
print(lasso_searcher.best_params_)
print(lasso_searcher.best_score_)

{'C': 1.0, 'max_iter': 40}
0.7858211420022927


# FOREST

In [108]:
forest = sklearn.ensemble.RandomForestClassifier(
    n_estimators=125,
    criterion='gini',
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=1)

In [116]:
pipeline = sklearn.pipeline.Pipeline([
    ('my_bow_feature_extractor', CountVectorizer(ngram_range=(2,2), min_df=3, max_df=0.75)),
    ('my_classifier', forest),
])

In [129]:
forest_hyperparameter_grid_by_name = dict(
    max_features=[3, 10, 33, 100, 333],
    max_depth=[16, 32],
    min_samples_leaf=[1],
    n_estimators=[125],
    random_state=[101],
    )

In [130]:
# TODO construct a GridSearchCV object like you did above.

forest_searcher = sklearn.model_selection.GridSearchCV(
    forest,
    forest_hyperparameter_grid_by_name,
    scoring='balanced_accuracy',
    cv=7,
    return_train_score=True,
    refit=False)

In [131]:
forest_searcher.fit(x_tr_NV, y_train_df['is_positive_sentiment'])

GridSearchCV(cv=7,
             estimator=RandomForestClassifier(max_depth=15, n_estimators=125),
             param_grid={'max_depth': [16, 32],
                         'max_features': [3, 10, 33, 100, 333],
                         'min_samples_leaf': [1], 'n_estimators': [125],
                         'random_state': [101]},
             refit=False, return_train_score=True, scoring='balanced_accuracy')

In [132]:
forest_search_results = pd.DataFrame(lasso_searcher.cv_results_).copy()
print(forest_searcher.best_params_)
print(forest_searcher.best_score_)

{'max_depth': 32, 'max_features': 10, 'min_samples_leaf': 1, 'n_estimators': 125, 'random_state': 101}
0.7992437489071517
