### Amazon Product Review Sentiment Analysis

In [26]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

def result_correction(data, low=1, high=5):
    """Clamp predicted ratings into the valid star range.

    Parameters
    ----------
    data : array-like of numbers
        Predicted ratings (NumPy array, list, ...).
    low, high : number, optional
        Inclusive bounds (default 1 and 5). Values below ``low`` become
        ``low`` and values above ``high`` become ``high``.

    Returns
    -------
    A clipped copy of ``data``. The input object is left unmodified.
    """
    # np.clip allocates its result, so the caller's array is never altered,
    # and it also accepts plain Python lists.
    return np.clip(data, low, high)


def pred_format_process(clf, data):
    """Predict ratings with ``clf`` and coerce them to valid integer stars.

    Parameters
    ----------
    clf : fitted estimator
        Any object exposing ``predict``.
    data : feature matrix
        Either a sparse matrix (anything with ``toarray``) or an already
        dense array.

    Returns
    -------
    Integer predictions after rounding and ``result_correction``.
    """
    # Sparse input is densified as before; dense input is passed through
    # unchanged instead of raising AttributeError on the missing .toarray().
    dense = data.toarray() if hasattr(data, "toarray") else data
    # round() is a no-op for integer class labels; presumably it is here so
    # that estimators returning fractional stars also work.
    pred_data = clf.predict(dense).round().astype(int)
    return result_correction(pred_data)


def acc(Y_true, Y_pred, tol=1):
    """Fraction of predictions that land within ``tol`` stars of the truth.

    With the default ``tol=1`` a prediction counts as correct when it is
    exact or off by one star.

    The distance is computed on the rating values themselves. Summing the
    main and +/-1 diagonals of a confusion matrix gives the same number only
    when every rating level occurs in the data; if a level is missing, the
    matrix rows collapse and e.g. 1-star vs 3-star would be treated as
    adjacent.

    Parameters
    ----------
    Y_true, Y_pred : array-like of numbers, same length
        True and predicted ratings.
    tol : number, optional
        Largest absolute error that still counts as correct (default 1).

    Returns
    -------
    float in [0, 1].

    Raises
    ------
    ValueError
        If the inputs are empty or differ in shape.
    """
    y_true = np.asarray(Y_true)
    y_pred = np.asarray(Y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "Y_true and Y_pred must have the same shape, got %s and %s"
            % (y_true.shape, y_pred.shape)
        )
    if y_true.size == 0:
        raise ValueError("Y_true and Y_pred must not be empty")
    within_tol = np.abs(y_true - y_pred) <= tol
    return float(np.mean(within_tol))


# Load both review exports. reset_index() keeps each file's own row position
# in an 'index' column.
luna = pd.read_csv('amazon_luna2_review.csv').reset_index()
lunaMini = pd.read_csv('amazon_luna2mini_review.csv').reset_index()
# ignore_index=True gives the combined frame unique row labels; without it
# the labels of the second file repeat those of the first.
luna = pd.concat([luna, lunaMini], axis=0, ignore_index=True)
# Vectorised strip of leading/trailing newlines; a missing review stays NaN
# here instead of raising AttributeError inside a per-row lambda.
luna['one_review_text'] = luna['one_review_text'].str.strip('\n')
# .copy() detaches dta from luna so that adding columns to dta later does not
# trigger SettingWithCopyWarning.
dta = luna[['one_review_stars', 'one_review_text']].copy()
dta


Unnamed: 0,one_review_stars,one_review_text
0,3.0,This is my first product like this. I never...
1,5.0,"I love my Foreo Luna 2! I too, am a former ..."
2,1.0,Terrible experience of using this brush. It ...
3,5.0,I already owned the original FOREO Luna for ...
4,5.0,I have been hooked on these since I got the ...
...,...,...
3225,4.0,Die Luna mini 2 ist eine auf den ersten Bli...
3226,4.0,Ich habe noch nie von einer Gesichtsreinigun...
3227,4.0,Ich habe noch nie von einer Gesichtsreinigun...
3228,5.0,"Wirklich erstaunlich, was diese Gesichtsbürs..."


In [32]:
# Fixed seed so the random train/valid/test buckets are the same on every run.
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)

# assign() builds a new frame, so this works even when dta is a slice of
# another DataFrame, and re-running the cell gives the same result.
dta = dta.assign(
    ML_group=rng.integers(100, size=dta.shape[0]),  # bucket id in [0, 100)
    one_review_stars=dta.one_review_stars.astype(int),
).sort_values(by='ML_group')

# The corpus is taken after sorting, so row i of X corresponds to row i of dta.
corpus = dta.one_review_text.to_list()
# Unigram counts; terms occurring in more than 75% or fewer than 2% of the
# reviews are dropped.
vectorizer = CountVectorizer(lowercase=True,
                             ngram_range=(1, 1),
                             max_df=0.75,
                             min_df=0.02)
# NOTE(review): the vocabulary is fitted on the whole corpus, i.e. including
# rows that later become validation/test data. Fit on the training rows only
# if a strictly leakage-free evaluation is needed.
X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; keep a fallback for
# older installations that lack get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)
print(X.toarray())

['aber', 'about', 'acne', 'after', 'al', 'all', 'also', 'am', 'amazing', 'amazon', 'an', 'and', 'anti', 'any', 'are', 'around', 'as', 'at', 'auch', 'battery', 'be', 'because', 'been', 'before', 'best', 'better', 'bien', 'bought', 'brush', 'but', 'buy', 'can', 'cara', 'charge', 'che', 'clarisonic', 'clean', 'cleaning', 'cleanser', 'cleansing', 'como', 'con', 'could', 'da', 'das', 'day', 'days', 'de', 'definitely', 'deja', 'del', 'den', 'der', 'desde', 'device', 'di', 'did', 'didn', 'die', 'difference', 'different', 'do', 'does', 'doesn', 'don', 'dopo', 'dura', 'easy', 'ein', 'eine', 'el', 'en', 'es', 'este', 'estoy', 'et', 'even', 'every', 'face', 'facial', 'feel', 'feels', 'felt', 'few', 'first', 'for', 'foreo', 'from', 'für', 'gentle', 'gerät', 'get', 'good', 'got', 'great', 'gut', 'ha', 'habe', 'had', 'hand', 'happy', 'has', 'haut', 'have', 'he', 'ho', 'how', 'ich', 'if', 'il', 'in', 'is', 'ist', 'it', 'its', 'je', 'just', 'kann', 'la', 'las', 'last', 'le', 'less', 'like', 'limpia', 

In [34]:
# Partition the rows by their ML_group bucket:
#   below 80 -> train, 80 up to (not including) 90 -> validation, 90+ -> test
inx_train = dta['ML_group'].lt(80)
inx_valid = dta['ML_group'].ge(80) & dta['ML_group'].lt(90)
inx_test = dta['ML_group'].ge(90)

# Response values (the Y in an R-style lm(Y ~ X)), one list per partition
Y_train, Y_valid, Y_test = (
    dta['one_review_stars'][mask].to_list()
    for mask in (inx_train, inx_valid, inx_test)
)

# Feature rows: turn each boolean mask into row positions of X
X_train, X_valid, X_test = (
    X[np.flatnonzero(mask.values), :]
    for mask in (inx_train, inx_valid, inx_test)
)

In [36]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer


In [37]:
# test generalization ability
# random_state pins the weight initialisation and batch shuffling, so the
# score printed below is reproducible from one run to the next.
clf = MLPClassifier(activation='logistic', max_iter=10000, random_state=42)
clf.fit(X_train.toarray(), Y_train)

pred_test = pred_format_process(clf, X_test)
# acc() also credits predictions that are one class away from the truth (see
# its definition), so this figure is not exact-match accuracy.
acc3 = acc(Y_test, pred_test)
print('--- generalization ability test ---')
print('test accuracy is %f'%acc3)

--- generalization ability test ---
test accuracy is 0.939597


In [40]:
from nltk.corpus import stopwords
# NLTK stop-word lists are stored under lowercase file ids; "English" only
# resolves on case-insensitive filesystems. The corpus must be installed
# beforehand (nltk.download('stopwords')).
stops = stopwords.words("english")
stops

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each