In [1]:
from ml1.preamble import*
from sklearn.datasets import load_files

# Read the IMDb training split from disk; the returned bunch exposes the raw
# documents as `.data` and their targets as `.target`.
reviews_train = load_files('ml1/data/aclImdb/aclImdb/train/')
y_train = reviews_train.target
# The documents are handled as bytes here; every HTML line-break tag is
# replaced by a single space so it does not end up inside a token.
text_train = [raw.replace(b"<br />", b" ") for raw in reviews_train.data]

* 표제어 추출과 어간 추출(보통 표제어 추출이 성능이 더 좋음)

In [2]:
import nltk
import spacy

# Load the English spaCy pipeline by its explicit package name. The bare 'en'
# shortcut link was removed in spaCy 3, so `spacy.load('en')` fails on a fresh,
# current environment; 'en_core_web_sm' is the package that
# `python -m spacy download en` installed and linked as 'en' in spaCy 2, and
# it loads on both major versions.
# NOTE(review): if 'en' was manually linked to a different model in your
# environment, substitute that package name here.
en_nlp = spacy.load('en_core_web_sm')
# Porter stemmer from NLTK, used below to contrast stemming with lemmatization.
stemmer = nltk.stem.PorterStemmer()

def compare_normalization(doc):
    """Print the lemmas and the Porter stems of every token in ``doc``.

    The text is tokenized with the module-level spaCy pipeline ``en_nlp``.
    Two labelled lists are written to stdout: first each token's spaCy lemma
    (``token.lemma_``), then the result of running the module-level
    ``stemmer`` on each token's lower-cased ``norm_`` form.

    Args:
        doc: Text to analyse; passed straight to ``en_nlp``.

    Returns:
        None. The comparison is only printed.
    """
    tokens = en_nlp(doc)

    # Lemmas come directly from spaCy.
    print('표제어:')
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_)
    print(lemmas)

    # Stems come from NLTK, applied to the same spaCy tokens so that both
    # lists line up position by position.
    print('어간:')
    stems = []
    for tok in tokens:
        lowered = tok.norm_.lower()
        stems.append(stemmer.stem(lowered))
    print(stems)
    
# Demo sentence: prints the lemma list and the stem list for the same tokens
# so the two normalizations can be compared side by side.
compare_normalization(
    u"Our meeting today was worse than yesterday, "
    "I'm scared of meeting the clients tomorrow."
)

표제어:
['-PRON-', 'meeting', 'today', 'be', 'bad', 'than', 'yesterday', ',', '-PRON-', 'be', 'scared', 'of', 'meet', 'the', 'client', 'tomorrow', '.']
어간:
['our', 'meet', 'today', 'wa', 'wors', 'than', 'yesterday', ',', 'i', 'am', 'scare', 'of', 'meet', 'the', 'client', 'tomorrow', '.']


* 데이터에 적용

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Reload the English pipeline without the dependency parser and the named
# entity recognizer: only lemmas are read from the tokens below, and skipping
# those two components makes processing the whole corpus faster.
# The explicit package name is used because the bare 'en' shortcut link was
# removed in spaCy 3; 'en_core_web_sm' is the package that
# `python -m spacy download en` installed and linked as 'en' in spaCy 2.
# NOTE(review): if 'en' was manually linked to a different model in your
# environment, substitute that package name here.
en_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def custom_tokenizer(document):
    """Split ``document`` into tokens with spaCy and return their lemmas.

    Used as the ``tokenizer`` callable of ``CountVectorizer``, so the
    vocabulary is built from lemmas instead of surface forms.

    Args:
        document: Text of one document; passed straight to the module-level
            ``en_nlp`` pipeline.

    Returns:
        A list with ``token.lemma_`` for every token, in document order.
    """
    lemmas = []
    for token in en_nlp(document):
        lemmas.append(token.lemma_)
    return lemmas

# Bag-of-words over spaCy lemmas. `min_df=5` is applied to both vectorizers
# so the two vocabularies are pruned by the same rule.
lemma_vect = CountVectorizer(min_df=5, tokenizer=custom_tokenizer)
X_train_lemma = lemma_vect.fit_transform(text_train)
print('X_train_lemma.shape:', X_train_lemma.shape)

# Baseline: the default CountVectorizer tokenization with the same `min_df`,
# for a direct comparison of vocabulary size.
vect = CountVectorizer(min_df=5)
vect.fit(text_train)
X_train = vect.transform(text_train)
print('X_train.shape:', X_train.shape)

X_train_lemma.shape: (25000, 22100)
X_train.shape: (25000, 27271)


In [4]:
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression

# Regularization strengths to try for the logistic regression.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}

# Five stratified shuffle splits that each train on only 1% of the samples and
# evaluate on the remaining 99%; the fixed seed keeps the splits identical for
# both feature sets.
cv = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.99,
    train_size=0.01,
    random_state=0,
)
grid = GridSearchCV(LogisticRegression(max_iter=5000), param_grid, cv=cv)

# Run the same search on the baseline features first and on the lemma features
# second, reporting the best mean cross-validation score of each run. After
# the loop `grid` holds the search fitted on the lemma features.
runs = [
    (X_train, '최상의 교차 검증 점수'
              '(기본 CountVectorizer): {:.3f}'),
    (X_train_lemma, '최상의 교차 검증 점수'
                    '(표제어): {:.3f}'),
]
for features, message in runs:
    grid.fit(features, y_train)
    print(message.format(grid.best_score_))

최상의 교차 검증 점수(기본 CountVectorizer): 0.719
최상의 교차 검증 점수(표제어): 0.719
