In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
os.chdir('/content/drive/MyDrive/github/Dacon/chatgpt경진대회')
os.getcwd()

'/content/drive/MyDrive/github/Dacon/chatgpt경진대회'

In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score

# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_data['text'], train_data['label'], test_size=0.2, random_state=42)

# Encode the labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# Tokenize the text data
max_features = 10000
max_len = 256
tokenizer = Tokenizer(num_words=max_features, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_val_sequences = tokenizer.texts_to_sequences(X_val)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_len, padding='post', truncating='post')
X_val_padded = pad_sequences(X_val_sequences, maxlen=max_len, padding='post', truncating='post')

# Perform oversampling using SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_padded, y_train_encoded = smote.fit_resample(X_train_padded, y_train_encoded)

# Feature columns
categorical_columns = [tf.feature_column.categorical_column_with_identity('x', num_buckets=max_features)]

# Input functions
train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
    x={'x': X_train_padded},
    y=y_train_encoded,
    batch_size=128,
    num_epochs=None,
    shuffle=True
)

eval_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
    x={'x': X_val_padded},
    y=y_val_encoded,
    batch_size=128,
    num_epochs=1,
    shuffle=False
)

# Build DNNLinearCombinedClassifier
estimator = tf.estimator.DNNLinearCombinedClassifier(
    linear_feature_columns=categorical_columns,
    dnn_feature_columns=categorical_columns,
    dnn_hidden_units=[256, 128],
    n_classes=8,
    model_dir='model',
)

# Train and evaluate the model
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=1000)
eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, steps=None, start_delay_secs=0, throttle_secs=60)

tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

# Make predictions on the validation set
y_val_preds = list(estimator.predict(input_fn=eval_input_fn))
y_val_preds_classes = [p['class_ids'][0] for p in y_val_preds]

# Calculate the F1 score
f1 = f1_score(y_val_encoded, y_val_preds_classes, average='weighted')
print('Validation F1 score:', f1)

# Refit the model with the combined data
X_train_val = pd.concat([X_train, X_val])
y_train_val_encoded = label_encoder.transform(pd.concat([y_train, y_val]))

X_train_val_sequences = tokenizer.texts_to_sequences(X_train_val)
X_train_val_padded = pad_sequences(X_train_val_sequences, maxlen=max_len, padding='post', truncating='post')

# Perform oversampling using SMOTE for the combined data
X_train_val_padded, y_train_val_encoded = smote.fit_resample(X_train_val_padded, y_train_val_encoded)

# Refit the model with the combined data
train_input_fn_refit = tf.compat.v1.estimator.inputs.numpy_input_fn(
    x={'x': X_train_val_padded},
    y=y_train_val_encoded,
    batch_size=128,
    num_epochs=None,
    shuffle=True
)

train_spec_refit = tf.estimator.TrainSpec(input_fn=train_input_fn_refit, max_steps=1000)
tf.estimator.train(estimator, train_spec_refit)

# Prepare the test data
test_sequences = tokenizer.texts_to_sequences(test_data['text'])
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post', truncating='post')

# Make predictions on the test set
test_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
    x={'x': test_padded},
    num_epochs=1,
    shuffle=False
)

test_predictions = list(estimator.predict(input_fn=test_input_fn))
test_preds_classes = [p['class_ids'][0] for p in test_predictions]

# Decode the predicted labels
test_labels_pred = label_encoder.inverse_transform(test_preds_classes)

# Create a submission file
submission = pd.DataFrame({'id': test_data['id'], 'label': test_labels_pred})
submission.to_csv('submission_DNNLinear.csv', index=False)


Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.

Instructions for updating:
Use tf.keras instead.
Instructions for updating:
Use tf.keras instead.
Instructions for updating:
Use tf.keras instead.
Instructions for updating:
Use tf.keras instead.
Instructions for updating:
Use tf.keras instead.
Instructions for updating:
Use tf.keras instead.
Instructions for updating:
Use tf.keras instead.
Instructions for updating:
Use tf.keras instead.
Instructions for updating:
Use tf.keras instead.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, 

ValueError: ignored