### 作業目的: 使用樹型模型進行文章分類

本次作業主要利用[Amazon Review data中的All Beauty](https://nijianmo.github.io/amazon/index.html)來進行review評價分類(文章分類)

資料中將review分為1,2,3,4,5分，而在這份作業，我們將評論改分為差評價、普通評價、優良評價(1,2-->1差評、3-->2普通評價、4,5-->3優良評價)

### 載入套件

In [1]:
import json
import re
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

### 資料前處理
文本資料較為龐大，這裡我們取前10000筆資料來進行作業練習

In [2]:
#load json data
# Each line of the file is parsed as one JSON review record.
all_reviews = []
# Decode explicitly as UTF-8 so loading does not depend on the platform's
# default text encoding.
with open("All_Beauty.json", "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads rejects.
        if line.strip():
            all_reviews.append(json.loads(line))

print(all_reviews[0])

{'overall': 1.0, 'verified': True, 'reviewTime': '02 19, 2015', 'reviewerID': 'A1V6B6TNIC10QE', 'asin': '0143026860', 'reviewerName': 'theodore j bigham', 'reviewText': 'great', 'summary': 'One Star', 'unixReviewTime': 1424304000}


In [3]:
#parse label(overall) and corpus(reviewText)
# Only the first 10000 reviews are considered; records whose text or rating is
# missing (or falsy) are skipped.
selected = [
    (entry["reviewText"], entry["overall"])
    for entry in all_reviews[:10000]
    if entry.get("reviewText", False) and entry.get("overall", False)
]
corpus = [text for text, _ in selected]
labels = [score for _, score in selected]

#transform labels: 1,2 --> 1 and 3 --> 2 and 4,5 --> 3
# 1-2 stars -> 1 (bad), 3 stars -> 2 (neutral), anything else -> 3 (good)
print(labels)
labels = [1 if score in (1, 2) else 2 if score == 3 else 3 for score in labels]
print(labels)


[1.0, 4.0, 4.0, 5.0, 5.0, 5.0, 4.0, 1.0, 5.0, 1.0, 2.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 2.0, 4.0, 5.0, 2.0, 4.0, 5.0, 5.0, 5.0, 2.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 4.0, 5.0, 1.0, 5.0, 5.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 1.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 4.0, 5.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 2.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 5.0, 4.0, 3.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 2.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 1.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 4.0, 2.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 3.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 2.0, 4.0, 5.0, 4.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 5.0,

In [4]:
#preprocessing data
#remove email address, punctuations, and change line symbol(\n)

# Matches, in order of preference: any token containing "@", a literal
# backslash followed by "n", or any single character that is not an ASCII
# letter, digit or space.
pattern = r"\S*@\S*|\\n|[^a-zA-Z0-9 ]"
cleaner = re.compile(pattern)

# Every match becomes a space, so only letters, digits and spaces remain;
# split() then yields exactly the non-empty tokens, which are re-joined with
# single spaces. Slice assignment keeps updating the same list object.
corpus[:] = [" ".join(cleaner.sub(" ", document).split()) for document in corpus]

In [7]:
#split corpus and label into train and test
# Hold out 20% of the samples for testing; random_state=0 pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    corpus,
    labels,
    test_size=0.2,
    random_state=0,
)
# Show the size of each split as the cell output.
tuple(len(part) for part in (x_train, x_test, y_train, y_test))

(7996, 1999, 7996, 1999)

In [8]:
#change corpus into vector
#you can use tfidf or BoW here

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with the vocabulary and weights learned from the training set.
tfidf_vec = TfidfVectorizer()

#transform training and testing corpus into vector form
# Keep the sparse matrices the vectorizer returns instead of calling
# .toarray(): scikit-learn's tree-based estimators accept sparse input, and a
# dense samples-by-vocabulary array would be mostly zeros.
x_train = tfidf_vec.fit_transform(x_train)
x_test = tfidf_vec.transform(x_test)

### 訓練與預測

In [9]:
#build classification model (decision tree, random forest, or adaboost)
#start training

# random_state is fixed (matching the train/test split) so that repeated runs
# build the same tree and report the same metrics.
tree = DecisionTreeClassifier(max_depth=6, min_samples_split=2, random_state=0)
tree.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=6)

In [10]:
#start inference
# Predict a rating class for every sample in the test set.
y_pred = tree.predict(x_test)

In [13]:
#calculate accuracy
# score() returns the mean accuracy of the classifier on the given samples.
print("Accuracy:", tree.score(x_test, y_test))

Accuracy: 0.9054527263631816


In [17]:
#calculate confusion matrix, precision, recall, and f1-score
# Print the per-class precision/recall/f1 report first, then the confusion
# matrix, both computed from the true and predicted test labels.
for summarize in (classification_report, confusion_matrix):
    print(summarize(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.71      0.22      0.33       134
           2       0.00      0.00      0.00        73
           3       0.91      0.99      0.95      1792

    accuracy                           0.91      1999
   macro avg       0.54      0.40      0.43      1999
weighted avg       0.87      0.91      0.88      1999

[[  29    4  101]
 [   3    0   70]
 [   9    2 1781]]


由上述資訊可以發現, 模型在好評的準確度高(precision, recall都高), 而在差評的部分表現較不理想(recall僅0.22), 在普通評價的部分則完全沒有預測正確, 且大部分被誤判為好評(73筆中有70筆),
同學可以試著運用學習到的各種方法來提升模型的表現