In [224]:
import pandas as pd
from typing import List
from pprint import pprint
import numpy as np
import re
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### **資料讀取與清洗**

In [225]:
# # 新資料
# train_df_new = pd.read_csv('../data/train_df.csv')
# test_df_new = pd.read_csv('../data/test_df.csv')

# train_df_new = train_df_new[['index', 'score']]
# test_df_new = test_df_new[['index']]

# # 舊資料
# train_df = pd.read_excel('../train_df_ws.xlsx')
# test_df = pd.read_excel('../test_df_ws.xlsx')

# train_df['ckip_ws'] = train_df['ckip_ws'].apply(lambda x: eval(x))
# test_df['ckip_ws'] = test_df['ckip_ws'].apply(lambda x: eval(x))

# train_df['text'] = train_df['text'].astype(str)
# test_df['text'] = test_df['text'].astype(str)


# train_df = train_df[['index', 'text', 'ckip_ws', 'score']]

# # 合併
# all_data = pd.concat([train_df, test_df])
# all_data = all_data[['index', 'text', 'ckip_ws']]

# train_df = pd.merge(train_df_new, all_data, how='left', on='index')
# test_df = pd.merge(test_df_new, all_data, how='left', on='index')

# train_df = train_df[['index', 'text', 'ckip_ws', 'score']]

In [226]:
train_df = pd.read_excel('../data/train_df.xlsx')
test_df = pd.read_excel('../data/test_df.xlsx')

train_df['ckip_ws'] = train_df['ckip_ws'].apply(lambda x: eval(x))
test_df['ckip_ws'] = test_df['ckip_ws'].apply(lambda x: eval(x))

train_df['text'] = train_df['text'].apply(lambda x: str(x))
test_df['text'] = test_df['text'].apply(lambda x: str(x))

In [227]:
train_df, dev_df = train_test_split(train_df, 
                                    test_size=0.2, 
                                    stratify=train_df['score'], 
                                    random_state=42)

### Try & Error

In [228]:
from opencc import OpenCC

cc = OpenCC('s2t')
train_df['text'] = train_df['text'].apply(lambda x: cc.convert(x))
test_df['text'] = test_df['text'].apply(lambda x: cc.convert(x))
dev_df['text'] = dev_df['text'].apply(lambda x: cc.convert(x))

In [229]:
import jieba
jieba.initialize()
jieba.load_userdict('../data/dict.txt')

train_df['ckip_ws'] = train_df['text'].apply(lambda x: jieba.lcut(x))
test_df['ckip_ws'] = test_df['text'].apply(lambda x: jieba.lcut(x))
dev_df['ckip_ws'] = dev_df['text'].apply(lambda x: jieba.lcut(x))

In [230]:
# ref: https://segmentfault.com/a/1190000007594620
emoji_pattern = r'[\U0001F300-\U0001F5FF]|[\U0001F600-\U0001F64F\U0001F680-\U0001F6FF]|[\u2600-\u2B55]'
number_pattern = r'\d+'
english_pattern = r'[a-zA-Z]+'
invite_word_pattern = r'(輸入|輸入:|輸入：|我的)?(邀請碼|推薦碼)' # 註：train 中，此 pattern 與 r'邀請碼|推薦碼' 結果相同
invite_code_pattern = r'(?!GOOD|NICE|BEST|GREAT|COOL|HAPPY)[A-Z0-9]{4,8}'
product_pattern = r'android\d{0,2}|google\d{0,2}|pixel\d{0,2}'

emoji_matches = train_df[train_df['text'].str.contains(emoji_pattern, na=False)]
num_matches = train_df[train_df['text'].str.contains(number_pattern, na=False)]
en_matches = train_df[train_df['text'].str.contains(english_pattern, na=False)]
invite_word_matches = train_df[train_df['text'].str.contains(invite_word_pattern, na=False)]
invite_code_matches = train_df[train_df['text'].str.contains(invite_code_pattern, na=False)]

  invite_word_matches = train_df[train_df['text'].str.contains(invite_word_pattern, na=False)]


In [231]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
emoji_matches.reset_index(drop=True, inplace=True)
emojis = emoji_matches['text'].apply(lambda x: re.findall(emoji_pattern, x))
emoji_matches['emoji'] = emojis


emoji_matches = emoji_matches[['emoji', 'score']]
one_hot_emoji = mlb.fit_transform(emoji_matches['emoji'])
one_hot_emoji = pd.DataFrame(one_hot_emoji, columns=mlb.classes_)
emojis = one_hot_emoji.columns.tolist()

one_hot_emoji['score'] = emoji_matches['score']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emoji_matches['emoji'] = emojis


In [232]:
one_hot_emoji

Unnamed: 0,★,☕,☹,☺,♀,♡,♤,♥,♧,♾,...,😽,🙁,🙂,🙃,🙄,🙇,🙋,🙌,🙏,score
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5 顆星
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5 顆星
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1 顆星
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1 顆星
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,5 顆星
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5 顆星
441,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5 顆星
442,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1 顆星
443,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3 顆星


In [233]:
emoji_cnt = {emoji: {'5 顆星': 0, '2 顆星': 0, '1 顆星': 0, '4 顆星': 0, '3 顆星': 0} 
             for emoji in emojis}
for emoji in emojis:
    df = one_hot_emoji[one_hot_emoji[emoji] == 1]
    emoji_cnt[emoji].update(df['score'].value_counts().to_dict())
pprint(emoji_cnt)

{'★': {'1 顆星': 0, '2 顆星': 0, '3 顆星': 0, '4 顆星': 1, '5 顆星': 17},
 '☕': {'1 顆星': 0, '2 顆星': 0, '3 顆星': 0, '4 顆星': 0, '5 顆星': 1},
 '☹': {'1 顆星': 1, '2 顆星': 0, '3 顆星': 0, '4 顆星': 0, '5 顆星': 0},
 '☺': {'1 顆星': 1, '2 顆星': 0, '3 顆星': 0, '4 顆星': 0, '5 顆星': 2},
 '♀': {'1 顆星': 1, '2 顆星': 1, '3 顆星': 1, '4 顆星': 1, '5 顆星': 0},
 '♡': {'1 顆星': 1, '2 顆星': 0, '3 顆星': 0, '4 顆星': 0, '5 顆星': 0},
 '♤': {'1 顆星': 1, '2 顆星': 0, '3 顆星': 0, '4 顆星': 0, '5 顆星': 0},
 '♥': {'1 顆星': 0, '2 顆星': 0, '3 顆星': 0, '4 顆星': 0, '5 顆星': 4},
 '♧': {'1 顆星': 1, '2 顆星': 0, '3 顆星': 0, '4 顆星': 0, '5 顆星': 0},
 '♾': {'1 顆星': 0, '2 顆星': 0, '3 顆星': 0, '4 顆星': 0, '5 顆星': 2},
 '⚠': {'1 顆星': 0, '2 顆星': 0, '3 顆星': 0, '4 顆星': 0, '5 顆星': 1},
 '⛔': {'1 顆星': 0, '2 顆星': 0, '3 顆星': 0, '4 顆星': 0, '5 顆星': 1},
 '✅': {'1 顆星': 0, '2 顆星': 0, '3 顆星': 0, '4 顆星': 0, '5 顆星': 2},
 '✌': {'1 顆星': 0, '2 顆星': 0, '3 顆星': 0, '4 顆星': 0, '5 顆星': 2},
 '✨': {'1 顆星': 0, '2 顆星': 0, '3 顆星': 0, '4 顆星': 0, '5 顆星': 4},
 '❣': {'1 顆星': 0, '2 顆星': 0, '3 顆星': 0, '4 顆星': 0, '5 

In [234]:
remove_emoji = [emoji for emoji, cnt in emoji_cnt.items() if sum(cnt.values()) < 2]
print(remove_emoji)

['☕', '☹', '♡', '♤', '♧', '⚠', '⛔', '⭐', '🌚', '🌫', '🌷', '🎀', '🏅', '👀', '👋', '💀', '💕', '💗', '💩', '💫', '💬', '💰', '💸', '😆', '😇', '😉', '😏', '😕', '😖', '😘', '😟', '😬', '😯', '😳', '😶', '😽', '🙌']


##### 特徵工程 func

In [235]:
emoji_pattern = r'[\U0001F300-\U0001F5FF]|[\U0001F600-\U0001F64F\U0001F680-\U0001F6FF]|[\u2600-\u2B55]'
number_pattern = r'\d+'
english_pattern = r'[a-zA-Z]+'
punctuation_pattern = r'(?!！|!|\?|？|_|__)[^\w\s]'
url_pattern = r'https?://\S+|www\.\S+'
invite_word_pattern = r'(輸入|輸入:|輸入：|我的)?(邀請碼|推薦碼)' # 註：train 中，此 pattern 與 r'邀請碼|推薦碼' 結果相同
invite_code_pattern = r'(?!GOOD|NICE|BEST|GREAT|COOL|HAPPY|LOVE)[A-Z0-9]{4,6}$'

In [236]:
# 找到每個 emoji 的最大評分星級
emoji_cluster_mapping = {}
for emoji, rating_dict in emoji_cnt.items():
    # 尋找字典中最大值對應的星級
    max_star = max(rating_dict, key=rating_dict.get)
    emoji_cluster_mapping[emoji] = max_star

def cluster_emoji(text,
                  emoji_cluster_mapping=emoji_cluster_mapping,
                  emoji_pattern=r'[\U0001F300-\U0001F5FF]|[\U0001F600-\U0001F64F\U0001F680-\U0001F6FF]|[\u2600-\u2B55]') -> float:
    '''
    分配 emoji 給不同類別
    初步作法：先以 value counts 最多的評分類別，做為 cluster 分配給該 emoji
    '''
    found_emojis = re.findall(emoji_pattern, text)
    if not found_emojis:
        return '無資訊'
    ratings = [int(emoji_cluster_mapping[emoji][0]) for emoji in found_emojis if emoji in emoji_cluster_mapping]

    if not ratings:
        return '無資訊'  # 所有找到的 emoji 都不在映射表中

    # 如果只有一個分類，直接返回
    if len(ratings) == 1:
        return str(int(ratings[0])) + ' 顆星'
    # 如果有多個，計算平均值
    else:
        average_rating = sum(ratings) / len(ratings)
        return str(int(np.ceil(average_rating))) + ' 顆星'  # 回傳平均值，格式化為一位小數

In [237]:
def only_punc(text, punctuation_pattern=r'^[^\w\s]+$'):
    # 檢查文本是否只包含標點符號
    if re.match(punctuation_pattern, text):
        return 1
    else:
        return 0

In [238]:
def is_invite(text,
              invite_word_pattern = invite_word_pattern,
              invite_code_pattern = invite_code_pattern):
    word_match = re.search(invite_word_pattern, text)
    code_match = re.search(invite_code_pattern, text)
    
    if word_match and code_match:
        return 'is_invite'
    # 如果只符合一個
    elif word_match or code_match:
        return 'not_sure'
    # 如果都不符合
    else:
        return 'not_invite'

In [239]:
with open('../data/stopwords.txt', 'r') as f:
    stopwords = f.read().splitlines()

def clean_func(word_seg_result: List[str],
               stop_words_list: List[str] = stopwords,
               remove_emoji = remove_emoji,
               emoji_pattern = emoji_pattern,
               url_pattern = url_pattern,
               number_pattern = r'\d+',
               punctuation_pattern = punctuation_pattern,
               invite_code_pattern = invite_code_pattern) -> str:
    '''
    1. remove duplicate emoji within each text
    2. lowercase the text
    3. remove punctuation
    4. remove stopwords
    5. remove url
    6. remove numbers
    7. remove emojie less than 2 times
    8. remove invite code
    
    Return[str] the word segmented text joined by space
    '''
    
    new_word_seg_result = []
    emojis_set = set()
    # text_length_cum_sum = 0
    for text in word_seg_result:
        # emojis_set = emojis_set.union(set(re.findall(emoji_pattern, text)))
        # text = text if len(text) == 1 else re.sub(emoji_pattern, '', text)
        # text = text if len(text) == 1 else re.sub(punctuation_pattern, '', text)
        # text = text if len(text) == 1 else re.sub(url_pattern, '', text)
        # text = text if len(text) == 1 else re.sub(invite_code_pattern, '', text)
        # text = text if len(text) == 1 else re.sub(number_pattern, '', text)
        # text = text if len(text) == 1 else text.lower()
        
        text = re.sub(emoji_pattern, '', text)
        text = re.sub(punctuation_pattern, '', text)
        text = re.sub(url_pattern, '', text)
        text = text if not re.match(invite_code_pattern, text) else '' 
        text = text if not re.match(number_pattern, text) else ''
        text = text.lower()
        
        new_word_seg_result.append(text)
        
    emojis_set = emojis_set - set(remove_emoji)
    if emojis_set:
        for emoji in emojis_set:
            new_word_seg_result.append(emoji)
            
    result = " ".join([text for text in new_word_seg_result if text not in stop_words_list])
    
    if not result:
        result = " ".join([text for text in new_word_seg_result])
    
    return result

In [240]:
text = ''

print(clean_func([text]))




In [241]:
def cal_comment_length(text: str) -> int:
    '''
    text: str, the word segemented result joined by space
    '''
    return len(text.split())

In [242]:
def feature_extraction(data: pd.DataFrame, 
                       is_test: bool =True) -> pd.DataFrame:
    
    result_df = data.copy()
    result_df['is_invite'] = result_df['text'].apply(lambda x: is_invite(x))
    result_df['only_punc'] = result_df['text'].apply(lambda x: only_punc(x))
    result_df['original_comment_len'] = result_df['text'].apply(lambda x: len(x))
    result_df['comment_len'] = result_df['ckip_ws'].apply(lambda x: len(x))
    result_df['emoji_cluster'] = result_df['text'].apply(lambda x: cluster_emoji(x))
    result_df['cleaned_text'] = result_df['ckip_ws'].apply(lambda x: clean_func(x))
    
    
    column_set = ['index', 
                'text', 
                'ckip_ws', 
                'cleaned_text', 
                'is_invite', 
                'only_punc',
                # 'original_comment_len',
                'comment_len',
                'emoji_cluster'
                ]
    if not is_test:
        column_set.append('score')
    return result_df[column_set]

In [243]:
train_df = feature_extraction(train_df, is_test=False)
dev_df = feature_extraction(dev_df, is_test=False)
test_df = feature_extraction(test_df, is_test=True)

In [244]:
feature_set = train_df.columns.tolist()
feature_set = [col for col in feature_set if col not in ['index', 'text', 'ckip_ws', 'cleaned_text', 'score']]

In [245]:
object_feature = [col for col in feature_set if train_df[col].dtype == 'object']
object_feature

['is_invite', 'emoji_cluster']

清洗後要檢查是否有留言直接變空，再調整清洗 func

In [246]:
# train_df.to_excel('train_df.xlsx', index=False)

In [247]:
train_df.reset_index(drop=True, inplace=True)
dev_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

todo

1. emoji 分組
2. 邀請碼篩選

### **建立詞頻矩陣**

In [248]:
vectorizer = CountVectorizer(
                    token_pattern=r"(?u)\b\w+\b",
                    min_df=1,)
train_count_vector = vectorizer.fit_transform(train_df['cleaned_text']).toarray()
dev_count_vector = vectorizer.transform(dev_df['cleaned_text']).toarray()
test_count_vector = vectorizer.transform(test_df['cleaned_text']).toarray()

In [249]:
# vectorizer = TfidfVectorizer(
#                     token_pattern=r"(?u)\b\w+\b",
#                     # ngram_range=(1, 2),
#                     # max_df=0.7,
#                     min_df=1)
# train_count_vector = vectorizer.fit_transform(train_df['cleaned_text']).toarray()
# dev_count_vector = vectorizer.transform(dev_df['cleaned_text']).toarray()
# test_count_vector = vectorizer.transform(test_df['cleaned_text']).toarray()

# print(len(vectorizer.get_feature_names_out()))

In [250]:
with open('count.txt', 'w') as f:
    for text in vectorizer.get_feature_names_out():
        f.write(text + '\n')

In [251]:
train_X = pd.DataFrame(train_count_vector, columns=vectorizer.get_feature_names_out())
dev_X = pd.DataFrame(dev_count_vector, columns=vectorizer.get_feature_names_out())
test_X = pd.DataFrame(test_count_vector, columns=vectorizer.get_feature_names_out())

In [252]:
train_X = pd.concat([train_X, train_df[feature_set]], axis=1)
dev_X = pd.concat([dev_X, dev_df[feature_set]], axis=1)
test_X = pd.concat([test_X, test_df[feature_set]], axis=1)


print(f"get dummies for {object_feature}")
train_X = pd.get_dummies(train_X, columns=object_feature)
dev_X = pd.get_dummies(dev_X, columns=object_feature)
test_X = pd.get_dummies(test_X, columns=object_feature)

get dummies for ['is_invite', 'emoji_cluster']


In [253]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

train_y = label_encoder.fit_transform(train_df['score'])
dev_y = label_encoder.transform(dev_df['score'])

In [254]:
count_vec_df = train_X[vectorizer.get_feature_names_out()]
count_vec_df['score'] = train_df['score']

key_word_df = pd.DataFrame({
    '1 顆星 keyword': count_vec_df[count_vec_df['score'] == '1 顆星'].sum()[:-1].sort_values(ascending=False).index.tolist(),
    '1 顆星 values': count_vec_df[count_vec_df['score'] == '1 顆星'].sum()[:-1].sort_values(ascending=False).values,
    '2 顆星 keyword': count_vec_df[count_vec_df['score'] == '2 顆星'].sum()[:-1].sort_values(ascending=False).index.tolist(),
    '2 顆星 values': count_vec_df[count_vec_df['score'] == '2 顆星'].sum()[:-1].sort_values(ascending=False).values,
    '3 顆星 keyword': count_vec_df[count_vec_df['score'] == '3 顆星'].sum()[:-1].sort_values(ascending=False).index.tolist(),
    '3 顆星 values': count_vec_df[count_vec_df['score'] == '3 顆星'].sum()[:-1].sort_values(ascending=False).values,
    '4 顆星 keyword': count_vec_df[count_vec_df['score'] == '4 顆星'].sum()[:-1].sort_values(ascending=False).index.tolist(),
    '4 顆星 values': count_vec_df[count_vec_df['score'] == '4 顆星'].sum()[:-1].sort_values(ascending=False).values,
    '5 顆星 keyword': count_vec_df[count_vec_df['score'] == '5 顆星'].sum()[:-1].sort_values(ascending=False).index.tolist(),
    '5 顆星 values': count_vec_df[count_vec_df['score'] == '5 顆星'].sum()[:-1].sort_values(ascending=False).values,
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  count_vec_df['score'] = train_df['score']


In [255]:
start = 0
key_word_df.iloc[start:start+50,:]

Unnamed: 0,1 顆星 keyword,1 顆星 values,2 顆星 keyword,2 顆星 values,3 顆星 keyword,3 顆星 values,4 顆星 keyword,4 顆星 values,5 顆星 keyword,5 顆星 values
0,更新,3345,更新,419,更新,427,app,211,方便,1611
1,app,2402,app,366,app,356,更新,199,很,1158
2,一直,1766,後,214,後,231,可以,167,好用,1020
3,後,1382,一直,197,可以,194,方便,165,好,948
4,登入,1145,登入,183,登入,187,很,162,讚,683
5,用,1100,不,173,一直,180,使用,151,用,619
6,無法,1063,很,151,很,179,後,141,app,491
7,銀行,1051,可以,147,轉帳,161,好,133,使用,451
8,不,1004,銀行,139,不,159,用,124,操作,420
9,沒有,799,無法,138,使用,153,功能,119,快速,397


### **預測模型**

In [271]:
import xgboost as xgb
from sklearn.ensemble import  RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression

In [None]:
model_pred_for_ensemble = {}

In [270]:
model = lgb.LGBMClassifier(random_state=42)
model.fit(train_X, train_y)
dev_y_pred = model.predict(dev_X)

# 计算准确率
accuracy = accuracy_score(dev_y, dev_y_pred)
print("Accuracy:", accuracy)

test_y_pred = model.predict(test_X)
test_y_pred_label = label_encoder.inverse_transform(test_y_pred.astype(int))
print(classification_report(dev_y, dev_y_pred))

model_pred_for_ensemble['lgb'] = {'dev': dev_y_pred,
                                    'test': test_y_pred}

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035084 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3530
[LightGBM] [Info] Number of data points in the train set: 17624, number of used features: 1041
[LightGBM] [Info] Start training from score -0.870623
[LightGBM] [Info] Start training from score -2.815721
[LightGBM] [Info] Start training from score -2.681124
[LightGBM] [Info] Start training from score -2.671231
[LightGBM] [Info] Start training from score -0.957647
Accuracy: 0.7433628318584071
              precision    recall  f1-score   support

           0       0.72      0.92      0.81      1845
           1       0.24      0.02      0.04       264
           2       0.31      0.06      0.10       302
           3       0.32      0.06      0.10       305
           4       0.81      0.91      0.86      1691

    accuracy                           0.74      4407
   macro avg       0.48      0

In [272]:
model = LogisticRegression(multi_class="multinomial")
model.fit(train_X, train_y)
dev_y_pred = model.predict(dev_X)

# 计算准确率
accuracy = accuracy_score(dev_y, dev_y_pred)
print("Accuracy:", accuracy)

test_y_pred = model.predict(test_X)
test_y_pred_label = label_encoder.inverse_transform(test_y_pred.astype(int))
print(classification_report(dev_y, dev_y_pred))

model_pred_for_ensemble['logistic_regression'] = {'dev': dev_y_pred,
                                    'test': test_y_pred}

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.7413206262763785
              precision    recall  f1-score   support

           0       0.73      0.91      0.81      1845
           1       0.21      0.03      0.05       264
           2       0.26      0.06      0.10       302
           3       0.22      0.03      0.06       305
           4       0.80      0.92      0.85      1691

    accuracy                           0.74      4407
   macro avg       0.44      0.39      0.37      4407
weighted avg       0.66      0.74      0.68      4407



In [273]:
from catboost import CatBoostClassifier
model = CatBoostClassifier()
model.fit(train_X, train_y)
dev_y_pred = model.predict(dev_X)

# 计算准确率
accuracy = accuracy_score(dev_y, dev_y_pred)
print("Accuracy:", accuracy)

test_y_pred = model.predict(test_X)
test_y_pred_label = label_encoder.inverse_transform(test_y_pred.astype(int))
print(classification_report(dev_y, dev_y_pred))

model_pred_for_ensemble['catboost'] = {'dev': dev_y_pred,
                                    'test': test_y_pred}

Learning rate set to 0.091596
0:	learn: 1.4900095	total: 166ms	remaining: 2m 46s
1:	learn: 1.4056426	total: 219ms	remaining: 1m 49s
2:	learn: 1.3392851	total: 273ms	remaining: 1m 30s
3:	learn: 1.2855854	total: 344ms	remaining: 1m 25s
4:	learn: 1.2381489	total: 405ms	remaining: 1m 20s
5:	learn: 1.2011556	total: 490ms	remaining: 1m 21s
6:	learn: 1.1724945	total: 534ms	remaining: 1m 15s
7:	learn: 1.1463582	total: 598ms	remaining: 1m 14s
8:	learn: 1.1238361	total: 656ms	remaining: 1m 12s
9:	learn: 1.1065945	total: 700ms	remaining: 1m 9s
10:	learn: 1.0898603	total: 756ms	remaining: 1m 7s
11:	learn: 1.0747193	total: 805ms	remaining: 1m 6s
12:	learn: 1.0605728	total: 862ms	remaining: 1m 5s
13:	learn: 1.0483461	total: 912ms	remaining: 1m 4s
14:	learn: 1.0371573	total: 960ms	remaining: 1m 3s
15:	learn: 1.0272746	total: 1s	remaining: 1m 1s
16:	learn: 1.0181632	total: 1.06s	remaining: 1m 1s
17:	learn: 1.0108974	total: 1.1s	remaining: 1m
18:	learn: 1.0047360	total: 1.15s	remaining: 59.4s
19:	learn

  y = column_or_1d(y, warn=True)


In [257]:
# model = RandomForestClassifier(random_state=42)
model = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


model.fit(train_X, train_y)
dev_y_pred = model.predict(dev_X)

# 计算准确率
accuracy = accuracy_score(dev_y, dev_y_pred)
print("Accuracy:", accuracy)

test_y_pred = model.predict(test_X)
test_y_pred_label = label_encoder.inverse_transform(test_y_pred.astype(int))

Accuracy: 0.7506240072611754


In [266]:
print(classification_report(dev_y, dev_y_pred))

              precision    recall  f1-score   support

           0       0.70      0.96      0.80      1845
           1       0.50      0.00      0.01       264
           2       0.38      0.02      0.03       302
           3       0.67      0.01      0.01       305
           4       0.83      0.91      0.87      1691

    accuracy                           0.75      4407
   macro avg       0.62      0.38      0.34      4407
weighted avg       0.71      0.75      0.67      4407



In [276]:
model_pred_for_ensemble['catboost']['dev'].reshape(-1)

array([0, 0, 0, ..., 0, 0, 0])

In [278]:
model_pred_for_ensemble

{'naive_bayes': {'dev': array([0, 0, 0, ..., 0, 0, 0]),
  'test': array([0, 0, 0, ..., 0, 0, 0])},
 'lgb': {'dev': array([0, 0, 0, ..., 0, 0, 0]),
  'test': array([2, 0, 0, ..., 0, 0, 0])},
 'logistic_regression': {'dev': array([0, 0, 0, ..., 0, 0, 0]),
  'test': array([0, 0, 0, ..., 0, 0, 0])},
 'catboost': {'dev': array([0, 0, 0, ..., 0, 0, 0]),
  'test': array([0, 0, 0, ..., 0, 0, 0])}}

In [280]:
y1 = model_pred_for_ensemble['lgb']['dev']
y2 = model_pred_for_ensemble['catboost']['dev']

In [290]:
np.max(model_pred_for_ensemble['lgb']['dev'] + \
                            model_pred_for_ensemble['catboost']['dev'] + \
                            model_pred_for_ensemble['logistic_regression']['dev'] + \
                            model_pred_for_ensemble['naive_bayes']['dev'])

16

In [259]:
dev_y_pred_label = label_encoder.inverse_transform(dev_y_pred)
dev_df['預測score'] = dev_y_pred_label

In [320]:
error_index = (dev_y != dev_y_pred) & (dev_y == 0)

dev_df.iloc[error_index].sort_values('預測score', ascending=False).tail(60)

Unnamed: 0,index,text,ckip_ws,cleaned_text,is_invite,only_punc,comment_len,emoji_cluster,score,預測score
1067,21154,查看匯率都會當掉,"[查看, 匯率, 都, 會, 當掉]",查看 匯率 當掉,not_invite,0,5,無資訊,1 顆星,1 顆星
518,28343,。ㄜㄧㄧ,"[。, ㄜ, ㄧ, ㄧ]",ㄜ ㄧ ㄧ,not_invite,0,4,無資訊,1 顆星,1 顆星
514,20643,怎麼還沒修好,"[怎麼, 還沒, 修好]",還沒 修好,not_invite,0,3,無資訊,1 顆星,1 顆星
467,738,常常當機,"[常常, 當機]",常常 當機,not_invite,0,2,無資訊,1 顆星,1 顆星
424,7843,沒辦法登入,"[沒辦法, 登入]",沒辦法 登入,not_invite,0,2,無資訊,1 顆星,1 顆星
47,18064,真的不好用,"[真的, 不好, 用]",真的 不好 用,not_invite,0,3,無資訊,1 顆星,1 顆星
103,4188,超爛,[超爛],超爛,not_invite,0,1,無資訊,1 顆星,1 顆星
134,31125,連續好幾天不能等入了,"[連續, 好, 幾天, 不能, 等入, 了]",連續 好 幾天 等入,not_invite,0,6,無資訊,1 顆星,1 顆星
158,6262,無法登入,[無法登入],無法登入,not_invite,0,1,無資訊,1 顆星,1 顆星
174,3375,連線異常是怎樣,"[連線, 異常, 是, 怎樣]",連線 異常,not_invite,0,4,無資訊,1 顆星,1 顆星


ckip count 0.7420

jieba count 0.7410

jieba tfidf 

In [319]:
# dtrain = xgb.DMatrix(train_X, label=train_y)
# ddev = xgb.DMatrix(dev_X)

# # 定义参数
# params = {
#     'max_depth': 3,
#     'eta': 0.1,
#     'objective': 'multi:softmax',  # 多分类问题
#     'num_class': len(set(train_y))  # 类别数目
# }

# # 训练模型
# num_rounds = 100

# model = xgb.train(params, dtrain, num_rounds)

# # 在测试集上进行预测
# dev_y_pred = model.predict(ddev)

# # 计算准确率
# accuracy = accuracy_score(dev_y, dev_y_pred)
# print("Accuracy:", accuracy)

Accuracy: 0.7106875425459497


In [262]:
# dtest = xgb.DMatrix(test_X)
# test_y_pred = model.predict(dtest)
# test_y_pred_label = label_encoder.inverse_transform(test_y_pred.astype(int))

In [318]:
submission_file = test_df.copy()
submission_file['pred'] = test_y_pred_label
submission_file = submission_file[['index', 'pred']]
submission_file.to_csv('../submission/submission.csv', index=False)

:TODO jieba bashline

:TFIDF

:檢查 英文斷詞只有一個字，可能是 clena func 沒寫好