<a href="https://colab.research.google.com/github/jumbokh/ML-Class/blob/main/notebooks/CH17_AmazonRating.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

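### Data link: [kindle_rating.csv](https://drive.google.com/file/d/1R90m7CltSb4cxTF12I0mQhNAeng590Tu/view?usp=sharing) — download it into the notebook's working directory (the first code cell reads `kindle_rating.csv` by relative path)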

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings('ignore')

df= pd.read_csv('kindle_rating.csv', parse_dates=['date'])
df.head()

Unnamed: 0,id,rating,title,date,content
0,Professor Nishanth,5,An outstanding refresh of the base Kindle at a...,2019-04-15,"Original review: April 15, 2019, and two updat..."
1,Beverly K,3,Base Kindle gets an upgrade\n,2019-04-15,The pros: I like that you have a choice of col...
2,Gwaredd Thomas,1,Lower ppi - Not good.\n,2019-04-15,I wouldn't purchase this product for the follo...
3,Lynn,5,Greatly Improved Basic Kindle\n,2019-04-15,Don't buy into the petty negative reviews. The...
4,A.B.,4,"Pleasant updates to the ""base"" Kindle\n",2019-04-15,I had a Kindle touch years ago and had stopped...


In [2]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2780 entries, 0 to 2779
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   id       2780 non-null   object        
 1   rating   2780 non-null   int64         
 2   title    2780 non-null   object        
 3   date     2780 non-null   datetime64[ns]
 4   content  2780 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 108.7+ KB


In [3]:
# Per-rating frequency table: raw counts plus each rating's share of all rows.
# The counts are computed once and the shares derived from them; both columns
# are aligned on the rating values actually present in the data, so an absent
# rating level can no longer clash with a hard-coded 1-5 index.
size = df['rating'].value_counts().sort_index()
pct = (size / size.sum()).round(2)
# rename_axis(None) drops any index label so the table header stays unlabeled.
pd.DataFrame({'次數': size, '百分比': pct}).rename_axis(None)

Unnamed: 0,次數,百分比
1,219,0.08
2,134,0.05
3,235,0.08
4,414,0.15
5,1778,0.64


In [4]:
# Binarise the target: ratings above 3 become 1, everything else becomes 0.
# The original star values are stashed once in `rating_raw` and the label is
# always derived from that copy, which makes the cell idempotent: re-running
# it no longer re-thresholds the already-binary column (which would turn
# every label into 0).
if 'rating_raw' not in df.columns:
    df['rating_raw'] = df['rating']
df['rating'] = (df['rating_raw'] > 3).map({True: 1, False: 0})
df['rating'].value_counts()

1    2192
0     588
Name: rating, dtype: int64

In [5]:
# ch17-5
from sklearn.model_selection import train_test_split

# Features are the review bodies; the target is the `rating` column.
X, y = df['content'], df['rating']

# Hold out 20% of the rows for testing, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.head()

315     arrived with no instructions.  did not respond...
2769    Used it for about a month and so far all is gr...
2635                         Love the light! Everything\n
2066         Hard to figure but when done great product\n
2195            Compact easy to use & good battery life\n
Name: content, dtype: object

In [6]:
# ch17-6
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Baseline: bag-of-words counts (English stop words removed) fed into a
# multinomial Naive Bayes classifier.
model_pl = make_pipeline(CountVectorizer(stop_words='english'), MultinomialNB())
model_pl.fit(X_train, y_train)
y_pred = model_pl.predict(X_test)
score = model_pl.score(X_test, y_test)
# Built-in round() works for both NumPy scalars and plain Python floats,
# whereas the .round() method exists only on the former.
print('測試集的結果', round(score, 3))
print(confusion_matrix(y_test, y_pred))
print('綜合報告')
print(classification_report(y_test, y_pred))

測試集的結果 0.853
[[ 54  63]
 [ 19 420]]
綜合報告
              precision    recall  f1-score   support

           0       0.74      0.46      0.57       117
           1       0.87      0.96      0.91       439

    accuracy                           0.85       556
   macro avg       0.80      0.71      0.74       556
weighted avg       0.84      0.85      0.84       556



In [7]:
# ch17-7
from imblearn.under_sampling import RandomUnderSampler
# NOTE: this rebinds `make_pipeline` to the imbalanced-learn version, which
# accepts sampler steps; later cells pick up this binding.
from imblearn.pipeline import make_pipeline

# Same bag-of-words + Naive Bayes model, with random under-sampling inserted
# between vectorisation and the classifier. The sampler is seeded so the
# reported metrics are reproducible from run to run.
model_pl = make_pipeline(CountVectorizer(stop_words='english'),
                         RandomUnderSampler(random_state=42),
                         MultinomialNB())
model_pl.fit(X_train, y_train)
# Predict once; the result is reused for the matrix and the report below.
y_pred = model_pl.predict(X_test)
score = model_pl.score(X_test, y_test)
# Built-in round() works for both NumPy scalars and plain Python floats.
print('測試集的結果', round(score, 3))
print(confusion_matrix(y_test, y_pred))
print('綜合報告')
print(classification_report(y_test, y_pred))

測試集的結果 0.786
[[102  15]
 [104 335]]
綜合報告
              precision    recall  f1-score   support

           0       0.50      0.87      0.63       117
           1       0.96      0.76      0.85       439

    accuracy                           0.79       556
   macro avg       0.73      0.82      0.74       556
weighted avg       0.86      0.79      0.80       556



In [8]:
# ch17-8
from imblearn.over_sampling import SMOTE

# Same bag-of-words + Naive Bayes model, with SMOTE over-sampling inserted
# between vectorisation and the classifier. SMOTE is seeded so the reported
# metrics are reproducible from run to run.
model_pl = make_pipeline(CountVectorizer(stop_words='english'),
                         SMOTE(random_state=42),
                         MultinomialNB())
model_pl.fit(X_train, y_train)
# Predict once; the result is reused for the matrix and the report below.
y_pred = model_pl.predict(X_test)
score = model_pl.score(X_test, y_test)
# Built-in round() works for both NumPy scalars and plain Python floats.
print('測試集的結果', round(score, 3))
print(confusion_matrix(y_test, y_pred))
print('綜合報告')
print(classification_report(y_test, y_pred))

測試集的結果 0.842
[[ 90  27]
 [ 61 378]]
綜合報告
              precision    recall  f1-score   support

           0       0.60      0.77      0.67       117
           1       0.93      0.86      0.90       439

    accuracy                           0.84       556
   macro avg       0.76      0.82      0.78       556
weighted avg       0.86      0.84      0.85       556



In [9]:
# Preview the first rows of the review titles.
df['title'].head()

0    An outstanding refresh of the base Kindle at a...
1                        Base Kindle gets an upgrade\n
2                              Lower ppi - Not good.\n
3                      Greatly Improved Basic Kindle\n
4              Pleasant updates to the "base" Kindle\n
Name: title, dtype: object

In [10]:
# ch17-10
from sklearn.compose import ColumnTransformer

# Use both text columns as features.
X = df[['title', 'content']]

# One independent bag-of-words vectoriser per text column; each transformer
# is named after the column it processes.
data_pl = ColumnTransformer(
    transformers=[
        (col, CountVectorizer(stop_words='english'), col)
        for col in X.columns
    ]
)

# Fit on the whole frame and show the combined count matrix as a dense array.
data_pl.fit_transform(X).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
# ch17-11
# Re-split using the two-column feature frame (same test size and seed as
# the earlier single-column split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
# Title + content count features fed into multinomial Naive Bayes.
model_pl = make_pipeline(data_pl, MultinomialNB())
model_pl.fit(X_train, y_train)
# Predict once; the result is reused for the matrix and the report below.
y_pred = model_pl.predict(X_test)
score = model_pl.score(X_test, y_test)
# Built-in round() works for both NumPy scalars and plain Python floats.
print('測試集的結果', round(score, 3))
print(confusion_matrix(y_test, y_pred))
print('綜合報告')
print(classification_report(y_test, y_pred))

測試集的結果 0.871
[[ 66  51]
 [ 21 418]]
綜合報告
              precision    recall  f1-score   support

           0       0.76      0.56      0.65       117
           1       0.89      0.95      0.92       439

    accuracy                           0.87       556
   macro avg       0.82      0.76      0.78       556
weighted avg       0.86      0.87      0.86       556



In [12]:
# ch17-12
# The global NumPy seed is kept for any code that relies on it; the sampler
# additionally gets its own explicit seed so its draw does not depend on the
# state of the global random generator.
np.random.seed(42)
model_pl = make_pipeline(data_pl, RandomUnderSampler(random_state=42), MultinomialNB())
model_pl.fit(X_train, y_train)
# Predict once; the result is reused for the matrix and the report below.
y_pred = model_pl.predict(X_test)
score = model_pl.score(X_test, y_test)
# Built-in round() works for both NumPy scalars and plain Python floats.
print('測試集的結果', round(score, 3))
print(confusion_matrix(y_test, y_pred))
print('綜合報告')
print(classification_report(y_test, y_pred))

測試集的結果 0.838
[[100  17]
 [ 73 366]]
綜合報告
              precision    recall  f1-score   support

           0       0.58      0.85      0.69       117
           1       0.96      0.83      0.89       439

    accuracy                           0.84       556
   macro avg       0.77      0.84      0.79       556
weighted avg       0.88      0.84      0.85       556

