<a href="https://colab.research.google.com/github/kevin6449/ironman2024_genai/blob/main/gen_ai_day21_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 使用嵌入功能訓練文字分類器

In [20]:
import google.generativeai as genai

from google.colab import userdata

API_KEY=userdata.get('GOOGLE_API_KEY')

#genai.configure(api_key="YOUR_API_KEY")

# Configure the client library by providing your API key.
genai.configure(api_key=API_KEY)

### 設定環境

In [21]:
import re
import tqdm
import keras
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from keras import layers
from matplotlib.ticker import MaxNLocator
from sklearn.datasets import fetch_20newsgroups
import sklearn.metrics as skmetrics

In [22]:
for m in genai.list_models():
  if 'embedContent' in m.supported_generation_methods:
    print(m.name)

models/embedding-001
models/text-embedding-004


## 資料集

 - 20 Newsgroups Text Dataset 包含 20 個主題的 18,000 個新聞群組貼文，分為訓練和測試集。
 - 訓練資料集和測試資料集的分割依據，是特定日期前後發布的訊息。
 - 使用訓練和測試資料集的一部分。
 - 將預先處理資料，並將資料整理到 Pandas DataFrame。

In [23]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# View list of class names for dataset
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

### 訓練集的資料點範例

In [24]:
idx = newsgroups_train.data[0].index('Lines')
print(newsgroups_train.data[0][idx:])

Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







### 開始預先處理這個教學課程的資料。移除名稱、電子郵件地址或文字多餘部分 (例如 "From: "、"\nSubject: ") 的所有機密資訊。將資訊整理成 Pandas DataFrame，以便更清晰易讀。

In [25]:
def preprocess_newsgroup_data(newsgroup_dataset):
  # Apply functions to remove names, emails, and extraneous words from data points in newsgroups.data
  newsgroup_dataset.data = [re.sub(r'[\w\.-]+@[\w\.-]+', '', d) for d in newsgroup_dataset.data] # Remove email
  newsgroup_dataset.data = [re.sub(r"\([^()]*\)", "", d) for d in newsgroup_dataset.data] # Remove names
  newsgroup_dataset.data = [d.replace("From: ", "") for d in newsgroup_dataset.data] # Remove "From: "
  newsgroup_dataset.data = [d.replace("\nSubject: ", "") for d in newsgroup_dataset.data] # Remove "\nSubject: "

  # Cut off each text entry after 5,000 characters
  newsgroup_dataset.data = [d[0:5000] if len(d) > 5000 else d for d in newsgroup_dataset.data]

  # Put data points into dataframe
  df_processed = pd.DataFrame(newsgroup_dataset.data, columns=['Text'])
  df_processed['Label'] = newsgroup_dataset.target
  # Match label to target name index
  df_processed['Class Name'] = ''
  for idx, row in df_processed.iterrows():
    df_processed.at[idx, 'Class Name'] = newsgroup_dataset.target_names[row['Label']]

  return df_processed

In [26]:
# Apply preprocessing function to training and test datasets
df_train = preprocess_newsgroup_data(newsgroups_train)
df_test = preprocess_newsgroup_data(newsgroups_test)

df_train.head()

Unnamed: 0,Text,Label,Class Name
0,WHAT car is this!?\nNntp-Posting-Host: rac3.w...,7,rec.autos
1,SI Clock Poll - Final Call\nSummary: Final ca...,4,comp.sys.mac.hardware
2,PB questions...\nOrganization: Purdue Univers...,4,comp.sys.mac.hardware
3,Re: Weitek P9000 ?\nOrganization: Harris Comp...,1,comp.graphics
4,Re: Shuttle Launch Question\nOrganization: Sm...,14,sci.space


### 在訓練資料集中收集 100 個資料點，並捨棄幾個類別以執行，以對部分資料進行取樣。選擇要比較的科學類別。

In [27]:
def sample_data(df, num_samples, classes_to_keep):
  df = df.groupby('Label', as_index = False).apply(lambda x: x.sample(num_samples)).reset_index(drop=True)

  df = df[df['Class Name'].str.contains(classes_to_keep)]

  # Reset the encoding of the labels after sampling and dropping certain categories
  df['Class Name'] = df['Class Name'].astype('category')
  df['Encoded Label'] = df['Class Name'].cat.codes

  return df

In [28]:
TRAIN_NUM_SAMPLES = 100
TEST_NUM_SAMPLES = 25
CLASSES_TO_KEEP = 'sci' # Class name should contain 'sci' in it to keep science categories
df_train = sample_data(df_train, TRAIN_NUM_SAMPLES, CLASSES_TO_KEEP)
df_test = sample_data(df_test, TEST_NUM_SAMPLES, CLASSES_TO_KEEP)

In [29]:
df_train.value_counts('Class Name')

Unnamed: 0_level_0,count
Class Name,Unnamed: 1_level_1
sci.crypt,100
sci.electronics,100
sci.med,100
sci.space,100


In [30]:
df_test.value_counts('Class Name')

Unnamed: 0_level_0,count
Class Name,Unnamed: 1_level_1
sci.crypt,25
sci.electronics,25
sci.med,25
sci.space,25


## 建立嵌入

API 變更為 Embeddingsembedding-001
新的嵌入模型有一個新的工作類型參數和選用標題 (僅適用於 task_type=RETRIEVAL_DOCUMENT)。
工作類型如下：

 - RETRIEVAL_QUERY : 指定指定文字是搜尋/擷取設定中的查詢。
 - RETRIEVAL_DOCUMENT : 指定文字是搜尋/擷取設定中的文件。
 - SEMANTIC_SIMILARITY : 指定指定文字將用於語意文字相似度 (STS)。
 - 分類 : 指定要將嵌入用於分類。
 - 叢集 : 指定嵌入將用於分群。

In [31]:
from tqdm.auto import tqdm
tqdm.pandas()

from google.api_core import retry

def make_embed_text_fn(model):

  @retry.Retry(timeout=300.0)
  def embed_fn(text: str) -> list[float]:
    # Set the task_type to CLASSIFICATION.
    embedding = genai.embed_content(model=model,
                                    content=text,
                                    task_type="classification")
    return embedding['embedding']

  return embed_fn

def create_embeddings(model, df):
  df['Embeddings'] = df['Text'].progress_apply(make_embed_text_fn(model))
  return df

In [32]:
model = 'models/text-embedding-004'
df_train = create_embeddings(model, df_train)
df_test = create_embeddings(model, df_test)

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [33]:
df_train.head()

Unnamed: 0,Text,Label,Class Name,Encoded Label,Embeddings
1100,Re: text of White House announcement and Q&As...,11,sci.crypt,0,"[-0.00684668, 0.022764914, -0.043934416, 0.044..."
1101,Subject: Re: Organized Lobbying for Cryptograp...,11,sci.crypt,0,"[-0.028270109, 0.03839823, -0.031843692, 0.024..."
1102,"Re: Screw the people, crypto is for hard-core...",11,sci.crypt,0,"[-0.013093504, 0.010548056, -0.057340056, 0.01..."
1103,Re: Licensing of public key implementations\n...,11,sci.crypt,0,"[-0.0008055951, 0.027333362, -0.014918889, 0.0..."
1104,Stray thought (was Re: More technical details...,11,sci.crypt,0,"[-0.01303231, 0.026498845, -0.036540926, 0.055..."


In [34]:
unique_values = df_train['Class Name'].unique()
print(unique_values)

['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space']
Categories (4, object): ['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space']


In [35]:
df_test.size

500

##  建立簡易分類模型
 - 定義一個隱藏層和單一類別機率輸出結果的簡易模型。
 - 預測結果會對應一段文字屬於特定新聞類別的機率。
 - 建構模型時，Keras 會自動重組資料點。

In [36]:
def build_classification_model(input_size: int, num_classes: int) -> keras.Model:
  inputs = x = keras.Input(input_size)
  x = layers.Dense(input_size, activation='relu')(input_shape= (x,))
  x = layers.Dense(num_classes, activation='sigmoid')(x)
  return keras.Model(inputs=[inputs], outputs=x)

In [37]:
embedding_size = len(df_train['Embeddings'].iloc[0])
my_tuple = (embedding_size, len(df_train['Class Name'].unique()))
embedding_size

768

In [38]:
# Derive the embedding size from the first training element.
embedding_size = len(df_train['Embeddings'].iloc[0])

# Give your model a different name, as you have already used the variable name 'model'
classifier = build_classification_model(embedding_size, len(df_train['Class Name'].unique()))
classifier.summary()

classifier.compile(loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                   optimizer = keras.optimizers.Adam(learning_rate=0.001),
                   metrics=['accuracy'])

ValueError: Cannot convert '768' to a shape.

In [None]:
embedding_size

## 訓練模型將新聞群組分類
訓練簡單的模型。使用少量訓練週期，避免過度配適。第一個週期需要比其餘週期更長，因為嵌入只需要計算一次。

In [None]:
NUM_EPOCHS = 20
BATCH_SIZE = 32

# Split the x and y components of the train and validation subsets.
y_train = df_train['Encoded Label']
x_train = np.stack(df_train['Embeddings'])
y_val = df_test['Encoded Label']
x_val = np.stack(df_test['Embeddings'])

# Train the model for the desired number of epochs.
callback = keras.callbacks.EarlyStopping(monitor='accuracy', patience=3)

history = classifier.fit(x=x_train,
                         y=y_train,
                         validation_data=(x_val, y_val),
                         callbacks=[callback],
                         batch_size=BATCH_SIZE,
                         epochs=NUM_EPOCHS,)

## 評估模型效能
使用 Keras Model.evaluate敬上 取得測試資料集的損失和準確率。

In [None]:
classifier.evaluate(x=x_val, y=y_val, return_dict=True)

### 評估模型成效的其中一種方法是以圖表呈現分類器成效。使用 plot_history 查看不同週期的損失和準確率趨勢。

In [None]:
def plot_history(history):
  """
    Plotting training and validation learning curves.

    Args:
      history: model history with all the metric measures
  """
  fig, (ax1, ax2) = plt.subplots(1,2)
  fig.set_size_inches(20, 8)

  # Plot loss
  ax1.set_title('Loss')
  ax1.plot(history.history['loss'], label = 'train')
  ax1.plot(history.history['val_loss'], label = 'test')
  ax1.set_ylabel('Loss')

  ax1.set_xlabel('Epoch')
  ax1.legend(['Train', 'Validation'])

  # Plot accuracy
  ax2.set_title('Accuracy')
  ax2.plot(history.history['accuracy'],  label = 'train')
  ax2.plot(history.history['val_accuracy'], label = 'test')
  ax2.set_ylabel('Accuracy')
  ax2.set_xlabel('Epoch')
  ax2.legend(['Train', 'Validation'])

  plt.show()

plot_history(history)

### 使用混淆矩陣。混淆矩陣可讓您評估分類模型在準確率以外的表現。您可以查看系統分類錯誤的點數。為了建立這個多類別分類問題的混淆矩陣，請取得測試集和預測值中的實際值。

首先，使用 Model.predict() 為驗證集中的每個範例產生預測類別。

In [None]:
y_hat = classifier.predict(x=x_val)
y_hat = np.argmax(y_hat, axis=1)

In [None]:
labels_dict = dict(zip(df_test['Class Name'], df_test['Encoded Label']))
labels_dict

In [None]:
cm = skmetrics.confusion_matrix(y_val, y_hat)
disp = skmetrics.ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=labels_dict.keys())
disp.plot(xticks_rotation='vertical')
plt.title('Confusion matrix for newsgroup test dataset');
plt.grid(False)