In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dm-2024-isa-5810-lab-2-homework/tweets_DM.json
/kaggle/input/dm-2024-isa-5810-lab-2-homework/sampleSubmission.csv
/kaggle/input/dm-2024-isa-5810-lab-2-homework/data_identification.csv
/kaggle/input/dm-2024-isa-5810-lab-2-homework/emotion.csv


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Load the split assignments, the emotion labels, and the raw tweets (JSON Lines).
data_identification = pd.read_csv('/kaggle/input/dm-2024-isa-5810-lab-2-homework/data_identification.csv')
emotion = pd.read_csv('/kaggle/input/dm-2024-isa-5810-lab-2-homework/emotion.csv')
tweets = pd.read_json('/kaggle/input/dm-2024-isa-5810-lab-2-homework/tweets_DM.json', lines=True)

# Pull the nested tweet_id and text out of each record's `_source` payload.
tweets['tweet_id'] = tweets['_source'].apply(lambda x: x['tweet']['tweet_id'])
tweets['text'] = tweets['_source'].apply(lambda x: x['tweet']['text'])

# Merge everything on tweet_id. The left join keeps every row of
# data_identification even when emotion.csv has no label for it.
data = pd.merge(data_identification, emotion, on='tweet_id', how='left')
data = pd.merge(data, tweets, on='tweet_id')

# Separate the labelled training rows from the rows to predict.
# .copy() makes each subset an independent frame, so the column assignment
# below is not an assignment into a slice of `data` (the pattern pandas
# flags with SettingWithCopyWarning).
train_data = data[data['identification'] == 'train'].copy()
test_data = data[data['identification'] == 'test'].copy()

# Minimal preprocessing: lower-case the tweet text.
train_data['text'] = train_data['text'].str.lower()
test_data['text'] = test_data['text'].str.lower()

# Features and labels.
X = train_data['text']
y = train_data['emotion']

# Hold out 20% of the training rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF features followed by a logistic-regression classifier.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000)),
    ('clf', LogisticRegression(max_iter=1000))
])

# Fit on the training split only.
pipeline.fit(X_train, y_train)

# Score the held-out split with weighted F1.
val_predictions = pipeline.predict(X_val)
val_score = f1_score(y_val, val_predictions, average='weighted')
print(f'Validation F1 Score: {val_score:.4f}')

# Predict the test rows.
test_predictions = pipeline.predict(test_data['text'])

# Build the submission table; predictions are positionally aligned with test_data.
submission = pd.DataFrame({
    'id': test_data['tweet_id'],
    'emotion': test_predictions
})

# Write the submission file.
submission.to_csv('submission.csv', index=False)
print('Submission file saved as submission.csv')


Validation F1 Score: 0.5254


Submission file saved as submission.csv


In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.utils import resample

# Load the split assignments, the emotion labels, and the raw tweets (JSON Lines).
data_identification = pd.read_csv('/kaggle/input/dm-2024-isa-5810-lab-2-homework/data_identification.csv')
emotion = pd.read_csv('/kaggle/input/dm-2024-isa-5810-lab-2-homework/emotion.csv')
tweets = pd.read_json('/kaggle/input/dm-2024-isa-5810-lab-2-homework/tweets_DM.json', lines=True)

# Pull the nested tweet_id and text out of each record's `_source` payload.
tweets['tweet_id'] = tweets['_source'].apply(lambda x: x['tweet']['tweet_id'])
tweets['text'] = tweets['_source'].apply(lambda x: x['tweet']['text'])

# Merge everything on tweet_id. The left join keeps every row of
# data_identification even when emotion.csv has no label for it.
data = pd.merge(data_identification, emotion, on='tweet_id', how='left')
data = pd.merge(data, tweets, on='tweet_id')

# Separate the labelled training rows from the rows to predict.
# .copy() makes each subset an independent frame, so the column assignment
# below is not an assignment into a slice of `data`.
train_data = data[data['identification'] == 'train'].copy()
test_data = data[data['identification'] == 'test'].copy()

# Cleaning: lower-case the text and strip URLs.
# regex=True is required: pandas 2.0+ defaults str.replace to literal matching,
# which would leave every URL in place. The dot after "www" is escaped so it
# only matches a literal '.'. The text is already lower-cased, so no
# case-insensitive flag is needed.
url_pattern = r'http\S+|www\.\S+'
train_data['text'] = train_data['text'].str.lower().str.replace(url_pattern, '', regex=True)
test_data['text'] = test_data['text'].str.lower().str.replace(url_pattern, '', regex=True)

# Features and labels.
X = train_data['text']
y = train_data['emotion']

# Hold out 20% of the ORIGINAL rows for validation. Splitting before any class
# rebalancing keeps the validation set free of duplicated training rows, so the
# reported score is not inflated by leakage.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Class imbalance is handled with class_weight='balanced' instead of resampling
# rows. Duplicating rows before the split / before cross-validation puts copies
# of the same tweet on both sides of every fold, and resampling all non-'joy'
# rows as a single group does not balance the individual emotion classes.
# Re-weighting balances every class and introduces no duplicates.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),  # smaller vocabulary to keep the search affordable
    # The recorded run hit the lbfgs iteration cap at max_iter=1000, so allow
    # more iterations. NOTE(review): 2000 may still be too low for large C —
    # raise it further if the ConvergenceWarning persists.
    ('clf', LogisticRegression(max_iter=2000, class_weight='balanced'))
])

# Hyper-parameter search over the regularisation strength.
param_grid = {
    'tfidf__max_features': [5000],
    'clf__C': [0.1, 1, 10]
}

# 3-fold CV (kept small to limit runtime), selecting on weighted F1.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1_weighted')
grid_search.fit(X_train, y_train)

# Best pipeline, refit by GridSearchCV on the whole training split.
best_model = grid_search.best_estimator_

# Score the untouched validation split with weighted F1.
val_predictions = best_model.predict(X_val)
val_score = f1_score(y_val, val_predictions, average='weighted')
print(f'Validation F1 Score: {val_score:.4f}')

# Predict the test rows.
test_predictions = best_model.predict(test_data['text'])

# Build the submission table; predictions are positionally aligned with test_data.
submission = pd.DataFrame({
    'id': test_data['tweet_id'],
    'emotion': test_predictions
})

# Write the submission file.
submission.to_csv('submission.csv', index=False)
print('Submission file saved as submission.csv')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
