### GPU Test

In [1]:
import tensorflow as tf
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 10769964475020082798
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 1399655220
locality {
  bus_id: 1
  links {
  }
}
incarnation: 1888547470471192608
physical_device_desc: "device: 0, name: GeForce MX250, pci bus id: 0000:3a:00.0, compute capability: 6.1"
xla_global_id: 416903419
]


In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0' # 초기화할 GPU number

with tf.Graph().as_default():
    # GPU 메모리를 전부 할당하지 않고, 아주 적은 비율로 할당되어 시작
    # 프로세스의 메모리 수요에 따라 자동적으로 증가
    # 단, GPU 메모리를 처음부터 전체 비율로 사용하지 않음
    gpu_options = tf.compat.v1.GPUOptions(allow_growth = True)

### 로지스틱 회귀 사용

In [3]:
import numpy as np
import pandas as pd

In [4]:
data = pd.read_csv('data/train.csv')
data['Target'].unique() # 타겟 클래스 이름 확인

array(['neutral', 'surprise', 'fear', 'sadness', 'joy', 'disgust',
       'anger'], dtype=object)

In [5]:
data.isnull().sum() # 결측치 확인

ID             0
Utterance      0
Speaker        0
Dialogue_ID    0
Target         0
dtype: int64

In [7]:
x_train = data['Utterance']
y_train = data['Target']

### 벡터화

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

cnt_vect = CountVectorizer()
cnt_vect.fit(x_train)
x_train_cnt_vect = cnt_vect.transform(x_train)

print(x_train_cnt_vect.shape)

(9989, 5333)


### ML

In [11]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression()
lr_clf.fit(x_train_cnt_vect, y_train)
pred = lr_clf.predict(x_test_cnt_vect)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### 제출

In [8]:
lr_submit = pd.read_csv('data/sample_submission.csv')
lr_submit['Target'] = pred
lr_submit.head()

In [38]:
lr_submit.to_csv('./lr_submit.csv', index=False)

### TF-IDF 벡터화

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(x_train)
x_train_tfidf_vect = tfidf_vect.transform(x_train)

x_test_tridf_vect = tfidf_vect.transform(x_test)

lr_clf = LogisticRegression()
lr_clf.fit(x_train_tfidf_vect, y_train)
pred = lr_clf.predict(x_test_tridf_vect)
lr_submit['Target'] = pred
lr_submit.to_csv('./lr_submit_tfidf.csv', index=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


- tfidf 벡터화 결과 일반 벡터화에 비해 F1 score 0.04점 감소

### 파라미터 조정 - gridsearch

In [None]:
from sklearn.model_selection import train_test_split

train_scaled, val_scaled, train_target, val_target = train_test_split(x_train, y_train, test_size = 0.2, random_state=42)

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore")

with tf.device("/device:GPU:0"): # GPU 사용

    pipeline = Pipeline([
    ('cnt_vect',CountVectorizer(stop_words='english')),
    ('lr_clf', LogisticRegression())
    ])

    params = {'cnt_vect__ngram_range' : [(1,1),(1,2),(1,3)],
          'cnt_vect__max_df' : [100,300,700],
          'lr_clf__C' : [0.5,1,5,10]
          }

    grid_cv_pipe = GridSearchCV(pipeline, param_grid=params, cv = 3, scoring = 'f1', verbose = 0)
    grid_cv_pipe.fit(train_scaled, train_target)

    print(grid_cv_pipe.best_params_)

    pred = grid_cv_pipe.predict(val_scaled)
    lr_f1 = f1_score(val_target, pred, average = 'macro')

{'cnt_vect__max_df': 100, 'cnt_vect__ngram_range': (1, 1), 'lr_clf__C': 0.5}


### Test Dataset

In [21]:
test = pd.read_csv('data/test.csv')
test.isnull().sum()

ID             0
Utterance      0
Speaker        0
Dialogue_ID    0
dtype: int64

In [37]:
x_test = test['Utterance']
pred = grid_cv_pipe.predict(x_test)

submit = pd.read_csv('data/sample_submission.csv')
submit['Target'] = pred

submit.to_csv('submit/lr_param.csv')