In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the review data from the CSV file stored next to this notebook,
# then show the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='./product review.csv')
df.head(n=5)

## 資料前處理

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts)
# so missing values can be spotted before cleaning.
df.info()

### 資料清理 (Data Cleaning)

- isnull():檢查空值，回傳布林值
- notnull():檢查不是空值，回傳布林值
- dropna():刪除空值
- fillna():填入空值

In [None]:
# Remove every row that contains at least one missing value.
# The frame is modified in place, so `df` keeps referring to the same object.
df.dropna(axis='index', how='any', inplace=True)

### 類別資料的處理

In [None]:
# Label Encoding: replace each distinct Season value with an integer code.

from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()
label.fit(df['Season'])
df['Season_Label'] = label.transform(df['Season'])

# Show the original values beside their codes to verify the mapping.
df[['Season', 'Season_Label']]

In [None]:
# One-hot Encoding: build one indicator column per distinct Season value.

season_column = df['Season']
onehot = pd.get_dummies(season_column)

# Display the original column next to its indicator columns for comparison.
pd.concat(objs=[season_column, onehot], axis='columns')

### 數值資料的處理 (特徵縮放)

In [None]:
# Max-Min
# Render the min-max normalisation formula as LaTeX in the cell output.
from IPython.display import Math

minmax_formula = r'x^{(i)}_{norm}=\frac{x^{(i)}-x_{min}}{x_{max}-x_{min}}'
Math(minmax_formula)

In [None]:
from sklearn import preprocessing

# Create the MinMaxScaler object.
minmax = preprocessing.MinMaxScaler()

# The scaler is fed a 2-D array, so Score is reshaped into a single column
# of shape (n_rows, 1) before fitting.
score_2d = df['Score'].values.reshape(-1, 1)

# Min-max scale Score and store the result as a new column.
df['Max_Min'] = minmax.fit_transform(score_2d)

# Compare the raw and the scaled values side by side.
df[['Score', 'Max_Min']]

In [None]:
# Z-Score
# Render the z-score standardisation formula as LaTeX in the cell output.
from IPython.display import Math

zscore_formula = r'x^{(i)}_{std}=\frac{x^{(i)}-\mu_{x}}{\sigma_{x}}'
Math(zscore_formula)

In [None]:
from sklearn import preprocessing

# Create the StandardScaler object.
zscore = preprocessing.StandardScaler()

# The scaler is fed a 2-D array, so Score is reshaped into a single column
# of shape (n_rows, 1) before fitting.
score_2d = df['Score'].values.reshape(-1, 1)

# Standardise Score and store the result as a new column.
df['Z_Score'] = zscore.fit_transform(score_2d)

# Compare the raw and the standardised values side by side.
df[['Score', 'Z_Score']]

## 視覺化

In [None]:
# 導入套件
import matplotlib.pyplot as plt
import seaborn

In [None]:
# Register a Chinese-capable font so the Chinese titles and labels used in the
# charts below can be rendered.
import matplotlib as mpl
from matplotlib.font_manager import fontManager

# The .ttf file is looked up relative to the current working directory.
fontManager.addfont('TaipeiSansTCBeta-Regular.ttf')

# Make the newly registered family the default for all following figures.
mpl.rcParams['font.family'] = 'Taipei Sans TC Beta'

### 長條圖 Bar chart

In [None]:
# Count the non-null Id values per UserId, largest counts first.
# Only the Id column is aggregated; counting every column of the frame and
# then keeping a single one yields the same numbers but does needless work.
user_buy_count = df.groupby('UserId')['Id'].count().sort_values(ascending=False)

# Only the 20 users with the highest counts are drawn. head() always selects
# by position, whatever the dtype of the UserId index is.
top_buyers = user_buy_count.head(20)

# Create the figure and set its size.
plt.figure(figsize=(20,8))

# Draw the bar chart: one bar per user.
plt.bar(top_buyers.index, top_buyers.values)

# Add the chart title.
plt.title('各個使用者購買的商品總數量')

# Add the axis labels.
plt.xlabel('UserId')
plt.ylabel('購買數量')

# Display the figure.
plt.show()

In [None]:
# Bar chart of the first 20 entries of user_buy_count, with enlarged fonts
# and vertical tick labels so the user ids stay readable.
top20 = user_buy_count[:20]

# Create the figure and set its size.
plt.figure(figsize=(20, 8))

# Draw one bar per user.
plt.bar(x=top20.index, height=top20.values)

# Add the chart title.
plt.title(label='各個使用者購買的商品總數量', fontsize=25)

# Add the axis labels.
plt.xlabel(xlabel='UserId', fontsize=20)
plt.ylabel(ylabel='購買數量', fontsize=20)

# Rotate the x tick labels by 90 degrees.
plt.xticks(rotation=90)

# Display the figure.
plt.show()

In [None]:
# Build a table with one row per Score value and one column per season holding
# the number of Id values for that (Score, Season) pair.
#
# A single groupby + unstack aligns the seasons on Score by label. Combinations
# that never occur are filled with 0, so a season that lacks some score value
# can neither shift the other counts nor raise a length-mismatch error.
season_order = ['Spring', 'Summer', 'Autumn', 'Winter']

season_score = (
    df.groupby(['Score', 'Season'])['Id']
    .count()
    .unstack('Season', fill_value=0)
    # Fix the column order and guarantee that all four seasons are present.
    .reindex(columns=season_order, fill_value=0)
    # Drop the 'Season' label from the column axis, then expose Score as a
    # regular column: Score, Spring, Summer, Autumn, Winter.
    .rename_axis(columns=None)
    .reset_index()
)
season_score

In [None]:
# Grouped bar chart: for every score, four bars (one per season) side by side.

# Create the figure and set its size.
plt.figure(figsize=(20,8))

# Width of a single bar.
width_val = 0.2

# (column in season_score, legend label) in drawing order, left to right.
season_bars = [('Spring', '春'), ('Summer', '夏'), ('Autumn', '秋'), ('Winter', '冬')]

# Centre each group of bars on its score value: with k bars the offsets run
# from -(k-1)/2 to +(k-1)/2 bar widths, so the group sits symmetrically
# around the score instead of being shifted to one side.
group_centre = (len(season_bars) - 1) / 2
for position, (season_column, legend_label) in enumerate(season_bars):
    offset = (position - group_centre) * width_val
    plt.bar(x=season_score.Score + offset, height=season_score[season_column],
            width=width_val, label=legend_label)

# Add the chart title.
plt.title('比較各季節的商品評分與銷售數量', fontsize = 25)

# Add the axis labels.
plt.xlabel('商品評分', fontsize = 20)
plt.ylabel('銷售數量', fontsize = 20)

# Add the legend.
plt.legend(fontsize = 20)

# Display the figure.
plt.show()

### 折線圖 Line chart

In [None]:
# Count the non-null Id values for every 'Y-M' value, ordered by that label.
sales_per_period = df.groupby('Y-M')['Id'].count()
month_sales = sales_per_period.sort_index()
month_sales

In [None]:
# Line chart of the count per 'Y-M' period over time.

# Create the figure and set its size.
plt.figure(figsize=(20, 8))

# Convert the 'Y-M' labels to timestamps so the x axis is a real time axis,
# then draw the line chart.
period_dates = pd.to_datetime(month_sales.index)
plt.plot(period_dates, month_sales.values, linewidth=2)

# Add the chart title.
plt.title(label='1999 至 2012 商品月銷售量趨勢圖', fontsize=25)

# Add the axis labels.
plt.xlabel(xlabel='時間', fontsize=20)
plt.ylabel(ylabel='銷售量', fontsize=20)

# Enlarge the x tick labels.
plt.xticks(fontsize=15)

# Display the figure.
plt.show()

In [None]:
def _monthly_sales(year):
    """Return the count of non-null Id values per Month for rows whose Year equals `year`."""
    rows_of_year = df.loc[df['Year'] == year]
    return rows_of_year.groupby('Month')['Id'].count()


# One Series per year, indexed by Month.
sales_2009 = _monthly_sales(2009)
sales_2010 = _monthly_sales(2010)
sales_2011 = _monthly_sales(2011)

In [None]:
# Line chart comparing the monthly counts of 2009, 2010 and 2011.

# Create the figure and set its size.
plt.figure(figsize=(20, 8))

# Draw one line per year; the drawing order also fixes the legend order.
yearly_series = [
    (sales_2011, '2011年'),
    (sales_2010, '2010年'),
    (sales_2009, '2009年'),
]
for monthly_counts, legend_text in yearly_series:
    plt.plot(monthly_counts.index, monthly_counts, label=legend_text)

# Add the chart title.
plt.title(label='2009 至 2011 每月商品銷售量趨勢對比圖', fontsize=25)

# Add the axis labels.
plt.xlabel(xlabel='月份', fontsize=20)
plt.ylabel(ylabel='銷售量', fontsize=20)

# Label the ticks 1..12 with month abbreviations.
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
plt.xticks(range(1, 13), month_names, fontsize=15)

# Add the legend.
plt.legend(fontsize=20)

# Display the figure.
plt.show()

### 圓餅圖 Pie Chart

In [None]:
# Count the non-null Id values per ProductId and keep the five largest counts.
product_sales = df.groupby('ProductId')['Id'].count()
top5_sales = product_sales.sort_values(ascending=False)[:5]
top5_sales

In [None]:
# Pie chart of the shares among the five products in top5_sales.

# Create the figure and set its size.
plt.figure(figsize=(8, 8))

# Draw the pie; every wedge is annotated with its percentage (one decimal).
wedge_text_style = {"fontsize": 15}
plt.pie(x=top5_sales, autopct='%1.1f%%', textprops=wedge_text_style)

# Add the chart title.
plt.title(label='top5商品銷售數量比例圖', fontsize=25)

# Add a legend listing the product ids, anchored at (0.9, 1) in axes coordinates.
plt.legend(labels=top5_sales.index, fontsize=15, bbox_to_anchor=(0.9, 1))

# Display the figure.
plt.show()

### 散點圖 Scatter Plot

In [None]:
# Per-product aggregates, both ordered by ProductId so they line up
# element by element when plotted against each other.
reviews_by_product = df.groupby('ProductId')
average_score = reviews_by_product['Score'].mean().sort_index()
sales_count = reviews_by_product['Id'].count().sort_index()

In [None]:
# Scatter plot with one point per product: count on x, mean score on y.

# Create the figure and set its size.
plt.figure(figsize=(20, 8))

# Draw the scatter plot.
plt.scatter(x=sales_count, y=average_score)

# Add the chart title.
plt.title(label='商品平均分數與銷售數量分佈圖', fontsize=25)

# Add the axis labels.
plt.xlabel(xlabel='銷售數量', fontsize=20)
plt.ylabel(ylabel='商品平均分數', fontsize=20)

# Display the figure.
plt.show()

### 範例圖

In [None]:
# Gallery of the basic chart types, each drawn as its own figure from
# synthetic data.
import matplotlib.pyplot as plt
import numpy as np

# Seeded generator: the random samples (and therefore the histogram and the
# scatter plot) are identical on every run of the notebook.
rng = np.random.default_rng(42)

# Bar chart
categories = ['A', 'B', 'C', 'D']
values = [15, 24, 12, 8]
plt.bar(categories, values)
plt.title('長條圖')
plt.xlabel('類別')
plt.ylabel('數量')
plt.show()

# Line chart: sin(x) sampled at 100 evenly spaced points on [0, 10]
x = np.linspace(0, 10, 100)
y = np.sin(x)
plt.plot(x, y)
plt.title('折線圖')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()

# Histogram: 1000 standard-normal samples in 30 bins
data = rng.standard_normal(1000)
plt.hist(data, bins=30)
plt.title('直方圖')
plt.xlabel('數值')
plt.ylabel('頻率')
plt.show()

# Pie chart
labels = ['A', 'B', 'C', 'D']
sizes = [30, 20, 15, 35]
plt.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.title('圓餅圖')
plt.show()

# Scatter plot: 100 points with independent standard-normal coordinates
x = rng.standard_normal(100)
y = rng.standard_normal(100)
plt.scatter(x, y)
plt.title('散點圖')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()


In [None]:
# The same five example charts, arranged in one figure on a 2 x 3 grid.
import matplotlib.pyplot as plt
import numpy as np

# Seeded generator: the random samples (and therefore the histogram and the
# scatter plot) are identical on every run of the notebook.
rng = np.random.default_rng(42)

# Create the grid of subplots.
fig, axes = plt.subplots(2, 3, figsize=(20, 10))

# Bar chart
categories = ['A', 'B', 'C', 'D']
values = [15, 24, 12, 8]
axes[0, 0].bar(categories, values)
axes[0, 0].set_title('長條圖')
axes[0, 0].set_xlabel('類別')
axes[0, 0].set_ylabel('數量')

# Line chart: sin(x) sampled at 100 evenly spaced points on [0, 10]
x = np.linspace(0, 10, 100)
y = np.sin(x)
axes[0, 1].plot(x, y)
axes[0, 1].set_title('折線圖')
axes[0, 1].set_xlabel('X')
axes[0, 1].set_ylabel('Y')

# Histogram: 1000 standard-normal samples in 30 bins
data = rng.standard_normal(1000)
axes[0, 2].hist(data, bins=30)
axes[0, 2].set_title('直方圖')
axes[0, 2].set_xlabel('數值')
axes[0, 2].set_ylabel('頻率')

# Pie chart
labels = ['A', 'B', 'C', 'D']
sizes = [30, 20, 15, 35]
axes[1, 0].pie(sizes, labels=labels, autopct='%1.1f%%')
axes[1, 0].set_title('圓餅圖')

# Scatter plot: 100 points with independent standard-normal coordinates
x = rng.standard_normal(100)
y = rng.standard_normal(100)
axes[1, 1].scatter(x, y)
axes[1, 1].set_title('散點圖')
axes[1, 1].set_xlabel('X')
axes[1, 1].set_ylabel('Y')

# Only five charts are drawn, so hide the unused sixth cell of the grid.
axes[1, 2].axis('off')

# Widen the gaps between subplots so titles and axis labels do not overlap.
plt.subplots_adjust(wspace=0.4, hspace=0.4)

# Display the figure.
plt.show()
