In [0]:
import numpy as np
import pandas as pd

In [0]:
from google.colab import drive
# Mount Google Drive at the relative path "gdrive" so the CSV under
# "./gdrive/My Drive/..." can be read by the following cells.
drive.mount("gdrive")

In [0]:
# Load the customer review table (the file is read with the CP949 codec).
df = pd.read_csv("./gdrive/My Drive/ai/workspace/data/customer_data.csv", encoding="cp949")
# Replace missing reviews with an empty string so every row holds text
# before it is handed to the tagger.
df["Review"] = df["Review"].fillna("")
df

In [0]:
from konlpy.tag import Okt

In [0]:
# Shared Okt tagger instance; later cells call `twitter.pos(...)` on it.
twitter = Okt()

## 분리된 품사를 기본형으로 맞춰주는 stem=True를 사용해준다

In [0]:
# POS-tag the review stored under index 0. stem=True asks the tagger to
# return each token in its base form (see the note above this cell).
tagged = twitter.pos(df["Review"][0], stem=True)
tagged

In [0]:
# Print every (word, tag) pair produced for the sample review.
for entry in tagged:
    word, pos = entry
    print(f"word : {word}, pos : {pos}")

In [0]:
# Print only the tokens tagged as nouns or adjectives.
for word, pos in tagged:
    if pos not in ("Noun", "Adjective"):
        continue
    print(f"word : {word}, pos : {pos}")

In [0]:
def preprocessingText(text):
    """Reduce a review to its stemmed nouns and adjectives.

    The text is tagged with the module-level ``twitter`` tagger using
    ``stem=True``. Tokens are kept only when they are at least two
    characters long and tagged ``"Noun"`` or ``"Adjective"``.

    Args:
        text: Raw review text.

    Returns:
        The kept tokens joined by single spaces, or ``""`` when no token
        qualifies.
    """
    kept_tags = ("Noun", "Adjective")
    kept_tokens = [
        token
        for token, tag in twitter.pos(text, stem=True)
        if len(token) >= 2 and tag in kept_tags
    ]
    # Joining an empty list already yields "", so no special case is needed.
    return " ".join(kept_tokens)

In [0]:
# Quick check of the preprocessing function on a hand-written sentence.
preprocessingText("아버지 방에 들어가신다 슬프다 배가 아파요")

In [0]:
# Same check on the review stored under index 0 of the loaded data.
preprocessingText(df["Review"][0])

In [0]:
# Store the cleaned text (stemmed nouns/adjectives only) next to each raw review.
df["Review_clear"] = df["Review"].map(preprocessingText)

In [0]:
# Inspect the cleaned review column.
df["Review_clear"]

In [0]:
def preprocessing_Score(score):
    """Collapse a raw score into one of three class labels.

    Args:
        score: Numeric score value.

    Returns:
        0 when ``score`` is below 2, 1 when it equals 2, and 2 for any
        other value.
    """
    if score == 2:
        return 1
    return 0 if score < 2 else 2

In [0]:
# Derive the three-class target (0, 1, 2) from the raw score column.
df["Score2"] = df["Score"].map(preprocessing_Score)

In [0]:
# Inspect the frame now that Review_clear and Score2 have been added.
df

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on the
# three-class label and seeded so it is repeatable.
features, labels = df["Review_clear"], df["Score2"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=156,
)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer for the cleaned reviews. min_df=2 is a document-frequency
# cutoff for vocabulary terms and norm='l2' sets the row normalisation.
vectorizer = TfidfVectorizer(min_df = 2,
                             norm='l2')

In [0]:
# Fit the vectorizer on the training text only and vectorise it in one step.
X_train_vector = vectorizer.fit_transform(X_train)
X_train_vector

In [0]:
# Dense, labelled view of the training TF-IDF matrix for inspection.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back only on old releases that lack it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
X_train_df = pd.DataFrame(X_train_vector.toarray(),
                          columns=feature_names)
X_train_df

In [0]:
# Vectorise the test text with the already-fitted vectorizer (transform only,
# no refit), so the columns match the training matrix.
X_test_vector = vectorizer.transform(X_test)
X_test_vector

In [0]:
# Dense, labelled view of the test TF-IDF matrix for inspection.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back only on old releases that lack it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
X_test_df = pd.DataFrame(X_test_vector.toarray(),
                         columns=feature_names)
X_test_df

In [0]:
from imblearn.over_sampling import SMOTE
# Seed the sampler so the synthetic samples, and therefore every downstream
# metric, are reproducible across runs. 156 is the same seed value this
# notebook uses for the train/test split.
smote = SMOTE(random_state=156)

In [0]:
# Oversample ONLY the training split. SMOTE fabricates minority-class samples,
# so resampling the test split would score the model on synthetic data with
# an artificial class balance and misreport its real performance.
X_train_imbalance, y_train_imbalance = smote.fit_resample(X_train_vector, y_train)

# Keep the names later cells rely on, but point them at the untouched test
# split so evaluation reflects the true class distribution.
X_test_imbalance, y_test_imbalance = X_test_vector, y_test

In [0]:
from xgboost import XGBClassifier

# Base estimator for the grid search: 30 boosting rounds, n_jobs=-1.
# The remaining hyper-parameters are supplied by the search grid.
xgb_clf = XGBClassifier(n_estimators = 30, n_jobs=-1)
xgb_clf

In [0]:
# Hyper-parameter grid: 3 values for each of four tuned parameters gives
# 3 * 3 * 3 * 3 = 81 combinations. num_class is pinned to 3, the number of
# labels produced by preprocessing_Score (0, 1, 2).
params = dict(
    max_depth=[3, 5, 7],
    min_child_weight=[3, 5, 7],
    colsample_bytree=[0.1, 0.5, 1.0],
    learning_rate=[0.001, 0.01, 0.1],
    num_class=[3],
)

In [0]:
from sklearn.model_selection import GridSearchCV

# Grid search over `params` around the base XGBoost estimator. No `scoring`
# or `cv` argument is passed, so the library defaults apply.
gridcv = GridSearchCV(xgb_clf, param_grid=params, n_jobs=-1)
gridcv

In [0]:
# Inspect the held-out labels.
y_test

In [0]:
# Inspect the held-out (cleaned) review text.
X_test

In [0]:
# Run the grid search on the oversampled training data. The extra `eval_set`
# keyword is passed along with the fit call.
# NOTE(review): the evaluation set here is built from the test split, so test
# data is visible during model selection. Confirm this is intended.
gridcv.fit(X_train_imbalance,
           y_train_imbalance,
           eval_set = [(X_test_imbalance, y_test_imbalance)])

In [0]:
from sklearn.metrics import confusion_matrix
# Confusion matrix (rows and columns ordered 0, 1, 2) for the predictions on
# the `*_imbalance` test arrays.
pred = gridcv.predict(X_test_imbalance)
cm_imbalance = confusion_matrix(y_test_imbalance, pred, labels=[0, 1, 2])
print(cm_imbalance)

In [0]:
# Confusion matrix (rows and columns ordered 0, 1, 2) on the original,
# non-resampled test split: vectorise the raw test text, predict, compare.
x_test_vector = vectorizer.transform(X_test.values)
pred = gridcv.predict(x_test_vector)
cm_raw_test = confusion_matrix(y_test.values, pred, labels=[0, 1, 2])
print(cm_raw_test)