<a href="https://colab.research.google.com/github/hantedyou/kccnlpClass/blob/main/DATA_EDA_2022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys
from google.colab import drive

In [None]:
# Mount Google Drive and switch the working directory to the shared project
# folder, so every relative path used later resolves inside it.
ROOT = '/content/drive'     # mount point handed to drive.mount
# Alternative project location that was used previously: 'My Drive/kcc/KCCNLP2022'
PROJ = 'My Drive//KCCNLP2022_shared'
PROJECT_PATH = os.path.join(ROOT, PROJ)

drive.mount(ROOT)
print(PROJECT_PATH)
os.chdir(PROJECT_PATH)
# Show up to ten directory entries as a quick check of the folder contents.
os.listdir()[:10]

In [None]:
# Old class ids grouped by the new class they collapse into. The position of
# each group in the tuple is the new class id (0-31); the ids inside a group
# are the old class ids (0-72) that map to it.
_OLD_IDS_BY_NEW_CLASS = (
    (14, 33, 62),            # 0
    (7, 18),                 # 1
    (8, 63),                 # 2
    (0, 39, 10),             # 3
    (37, 40, 49, 52),        # 4
    (16, 27, 29, 35, 64),    # 5
    (20, 47),                # 6
    (38,),                   # 7
    (68,),                   # 8
    (50,),                   # 9
    (51, 60, 61, 71),        # 10
    (57,),                   # 11
    (12,),                   # 12
    (4,),                    # 13
    (31, 66, 9),             # 14
    (56,),                   # 15
    (48,),                   # 16
    (45, 55, 65),            # 17
    (43,),                   # 18
    (3, 5, 32, 54),          # 19
    (26, 34),                # 20
    (36, 59),                # 21
    (21, 24, 53),            # 22
    (15, 23, 67),            # 23
    (11, 30, 72),            # 24
    (58,),                   # 25
    (17, 13),                # 26
    (2, 19, 22, 46),         # 27
    (41, 42),                # 28
    (6, 25, 28, 44, 70),     # 29
    (1,),                    # 30
    (69,),                   # 31
)

# Flatten the groups into the {old_id: new_id} lookup used for relabelling.
dict_map_classes = {
    old_id: new_id
    for new_id, old_ids in enumerate(_OLD_IDS_BY_NEW_CLASS)
    for old_id in old_ids
}
len(dict_map_classes)

In [None]:
# Load the source CSV (path is relative to the current working directory)
# and preview its column names and first three rows.
df_old = pd.read_csv("data/2020CisDep-10501.csv")
print(df_old.columns)
df_old.head(3)

## 取出需要用的columns，刪除沒有分類的資料

In [None]:
# Keep only the columns used by the rest of the notebook.
df_old = df_old[['編號', '類別', '分類', '內容']]
print(len(df_old))
# Drop rows whose class label is NaN. The explicit copy makes the result an
# independent frame, so the column assignment below does not trigger
# pandas' SettingWithCopyWarning.
df_old = df_old.dropna(subset=['分類']).copy()
print(len(df_old))
# With the NaN labels removed, the class column can be stored as integers.
df_old['分類'] = df_old['分類'].astype('int64')

## 將舊分類轉換為新分類

In [None]:
# Back up the original label only once. Without this guard, re-running the
# cell would overwrite the backup with labels that were already converted.
if '舊分類' not in df_old.columns:
  df_old['舊分類'] = df_old['分類']
# Always derive the new label from the preserved old label, so the mapping is
# applied exactly once no matter how many times this cell is executed.
# Values without an entry in dict_map_classes are left unchanged by replace().
df_old['分類'] = df_old['舊分類'].replace(dict_map_classes)
print(df_old.columns)
df_old.head(3)

### 檢查分類

In [None]:
# Distinct label values in the backup (old) column and in the converted column.
old_classes = set(df_old['舊分類'])
new_classes = set(df_old['分類'])

In [None]:
# Number of distinct old labels vs. number of distinct new labels.
len(old_classes),len(new_classes)

In [None]:
# Save the converted frame. index=False is not passed, so the DataFrame index
# is written out as the first column of the CSV.
df_old.to_csv("data/2020CisDep-10501-類別轉換.csv")

## 檢視資料分布

In [None]:
def getFreqPair(df, classes, class_len=32):
  """
  Count how many rows fall into each class.

  Args:
    df: DataFrame holding the data. It is only read, never modified.
    classes: Name of the column that holds the class label.
    class_len: Ignored. Kept only so existing calls keep working; the set
      of classes is always derived from the data itself.

  Returns:
    dict mapping each class label to the number of rows carrying it, with
    keys in ascending order. Rows whose label is missing (NaN) are not
    counted.
  """
  # value_counts tallies every class in one vectorised pass, so no helper
  # column has to be written into the caller's frame and no per-row loop
  # is needed.
  counts = df[classes].value_counts(sort=False)
  # Return the sorted mapping itself (dicts preserve insertion order).
  return dict(sorted(counts.items()))

In [None]:
# Load data_3000.csv (path is relative to the current working directory)
# and preview its column names and first three rows.
df_data3k = pd.read_csv("data/data_3000.csv")
print(df_data3k.columns)
df_data3k.head(3)

In [None]:
# Bar chart of the number of rows per 'reCheckedsubject' class in df_data3k.
dict_freq_pair = getFreqPair(df_data3k, 'reCheckedsubject')
names = [*dict_freq_pair]
values = [*dict_freq_pair.values()]

fig, ax = plt.subplots(figsize=(12, 4), dpi=100, linewidth=2)
ax.set_xlabel("Classes", fontsize=15, labelpad=15)
ax.set_ylabel("Counts", fontsize=15, labelpad=20)
ax.bar(names, values, tick_label=names)

In [None]:
# Fixed seed so the sampled subset, and therefore the plot below, is the same
# on every run of the notebook.
SAMPLE_SEED = 42
SAMPLE_FRACTION = 0.2

df_data3k = pd.read_csv("data/data_3000.csv")
# Per-row count of how many rows share that row's class.
df_data3k['freq'] = df_data3k.groupby('reCheckedsubject')['reCheckedsubject'].transform('count')
# NOTE(review): weighting each row by its class count makes frequent classes
# even more likely to be drawn than under uniform sampling -- confirm this is
# intended. The unweighted alternative is:
#   df_data3k.sample(int(len(df_data3k) * SAMPLE_FRACTION), ignore_index=True)
df_sample_from_3k = df_data3k.sample(
    int(len(df_data3k) * SAMPLE_FRACTION),
    weights=df_data3k.freq,
    random_state=SAMPLE_SEED,
)
dict_freq_pair = getFreqPair(df_sample_from_3k, 'reCheckedsubject')
names = list(dict_freq_pair.keys())
values = list(dict_freq_pair.values())

plt.figure(figsize=(12,4),dpi=100,linewidth = 2)
plt.xlabel("Classes", fontsize=12, labelpad = 5)
plt.ylabel("Counts", fontsize=12, labelpad = 20)
plt.bar(names, values, tick_label=names)

In [None]:
# Column names of df_data3k at this point in the notebook.
df_data3k.columns

In [None]:
# Bar chart of the number of rows per converted class ('分類') in df_old.
dict_freq_pair = getFreqPair(df_old, '分類')
names = [*dict_freq_pair]
values = [*dict_freq_pair.values()]

fig, ax = plt.subplots(figsize=(12, 4), dpi=100, linewidth=1)
ax.set_xlabel("Classes", fontsize=15, labelpad=15)
ax.set_ylabel("Counts", fontsize=15, labelpad=20)
ax.bar(names, values, tick_label=names)

## 將資料的類別分布（百分比）匯出

In [None]:
# Fraction of df_old rows that belong to each class: per-row class count
# divided by the total number of rows.
rows_per_class = df_old.groupby('分類')['分類'].transform('count')
df_old['freq'] = rows_per_class / len(df_old)
# One row per (class, fraction) pair.
df_old_distribution = df_old[['分類', 'freq']].drop_duplicates()
# NOTE(review): writing the legacy .xls format depends on an Excel writer
# engine that recent pandas releases may no longer provide -- confirm against
# the installed pandas version.
df_old_distribution.to_excel('data/2020CisDep-10501-類別轉換-百分比.xls')

In [None]:
# Fraction of df_data3k rows that belong to each class. The denominator must
# be the length of df_data3k itself; dividing by the length of a different
# frame would make the fractions not sum to 1.
df_data3k['freq'] = df_data3k.groupby('reCheckedsubject')['reCheckedsubject'].transform('count')/len(df_data3k)
# One row per (class, fraction) pair.
df_data3k_distribution = df_data3k[['reCheckedsubject','freq']].drop_duplicates()
# NOTE(review): writing the legacy .xls format depends on an Excel writer
# engine that recent pandas releases may no longer provide -- confirm against
# the installed pandas version.
df_data3k_distribution.to_excel('data/data_3000-百分比.xls')

# 2022 07 12 結果檢視

In [None]:
# Load the saved prediction results, using the first CSV column as the index.
# The commented-out line loads an alternative results file instead.
df = pd.read_csv('result/roberta-base-TrainByOld-PredOld-acc89.4-all.csv', index_col = 0)
#df = pd.read_csv('result/roberta-base-TrainByOld-PredOld-acc89.4-preWrong.csv',index_col = 0)

In [None]:
# Row count and column names of the results frame, followed by its first rows.
print(len(df),df.columns)
df.head()

In [None]:
from sklearn import metrics

In [None]:
# Rows where the predicted label differs from the true label.
df_dif = df[df['true_y']!=df['pred_y']]

In [None]:
# Total number of rows vs. number of rows where prediction and truth differ.
len(df),len(df_dif)

In [None]:
# Fraction of rows whose predicted label equals the true label.
metrics.accuracy_score(df['true_y'],df['pred_y'])

In [None]:
# Display the mismatched rows with a fresh 0..n-1 index. The result is not
# assigned, so df_dif itself keeps its original index.
df_dif.reset_index()

In [None]:
# Inspect a single mismatched row: pick it by position, print both labels,
# then show its text as the cell output.
data_index = 5
mismatch_row = df_dif.iloc[data_index]
true_y = mismatch_row.true_y
pred_y = mismatch_row.pred_y
print(f"True y:{true_y}\nPredict y:{pred_y}")
mismatch_row.content

In [None]:
!nvidia-smi