In [2]:
import pandas as pd

## Load Dataset

In [3]:
train = pd.read_csv("data/movie_train.tsv", sep="\t", index_col="PhraseId")

print(train.shape)
train.head()

(156060, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [4]:
test = pd.read_csv("data/movie_test.tsv", sep="\t", index_col="PhraseId")

print(test.shape)
test.head()

(66292, 2)


Unnamed: 0_level_0,SentenceId,Phrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,8545,An intermittently pleasing but mostly routine ...
156062,8545,An intermittently pleasing but mostly routine ...
156063,8545,An
156064,8545,intermittently pleasing but mostly routine effort
156065,8545,intermittently pleasing but mostly routine


## Preprocessing

In [5]:
train["Phrase(Origin)"] = train["Phrase"]

print(train.shape)
train[["Phrase", "Phrase(Origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [6]:
test["Phrase(Origin)"] = test["Phrase"]

print(test.shape)
test[["Phrase", "Phrase(Origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Stem Text

In [7]:
from nltk.stem.snowball import SnowballStemmer
from tqdm import tqdm

stemmer = SnowballStemmer("english")

def stem_text(phrase):
    words = phrase.split(" ")
    
    stem_words = []
    
    for word in words:
        stem_word = stemmer.stem(word)
        stem_words.append(stem_word)
    
    stem_text = " ".join(stem_words)

    return stem_text

tqdm.pandas(desc="stemming(train)...")

train["Phrase"] = train["Phrase"].progress_apply(stem_text)

print(train.shape)
train[["Phrase", "Phrase(Origin)"]].head()

stemming(train)...: 100%|██████████| 156060/156060 [02:36<00:00, 995.94it/s] 


(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,a seri of escapad demonstr the adag that what ...,A series of escapades demonstrating the adage ...
2,a seri of escapad demonstr the adag that what ...,A series of escapades demonstrating the adage ...
3,a seri,A series
4,a,A
5,seri,series


In [8]:
tqdm.pandas(desc="stemming(test)...")

test["Phrase"] = test["Phrase"].progress_apply(stem_text)

print(test.shape)
test[["Phrase", "Phrase(Origin)"]].head()

stemming(test)...: 100%|██████████| 66292/66292 [00:54<00:00, 1220.87it/s]


(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,an intermitt pleas but most routin effort .,An intermittently pleasing but mostly routine ...
156062,an intermitt pleas but most routin effort,An intermittently pleasing but mostly routine ...
156063,an,An
156064,intermitt pleas but most routin effort,intermittently pleasing but mostly routine effort
156065,intermitt pleas but most routin,intermittently pleasing but mostly routine


### Clean Text

In [9]:
def clean_text(phrase):
    phrase = phrase.replace("n't", "not")

    return phrase

train["Phrase"] = train["Phrase"].apply(clean_text)

print(train.shape)
train[["Phrase", "Phrase(Origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,a seri of escapad demonstr the adag that what ...,A series of escapades demonstrating the adage ...
2,a seri of escapad demonstr the adag that what ...,A series of escapades demonstrating the adage ...
3,a seri,A series
4,a,A
5,seri,series


In [10]:
test["Phrase"] = test["Phrase"].apply(clean_text)

print(test.shape)
test[["Phrase", "Phrase(Origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,an intermitt pleas but most routin effort .,An intermittently pleasing but mostly routine ...
156062,an intermitt pleas but most routin effort,An intermittently pleasing but mostly routine ...
156063,an,An
156064,intermitt pleas but most routin effort,intermittently pleasing but mostly routine effort
156065,intermitt pleas but most routin,intermittently pleasing but mostly routine


### One hot encode Phrase

In [11]:
#토큰 하나의 횟수만 고려하지 않고 옆에 있는 두세개의 토큰을 함께 고려하는 방식
#토큰 두개를 바이그램, 세개를 트라이그램, 연속된토큰을 n-gram
from sklearn.feature_extraction.text import TfidfVectorizer
#10000
char_vectorizer = TfidfVectorizer(analyzer='char',
                                  max_features=1000,
                                  ngram_range=(1, 9))

char_vectorizer.fit(train["Phrase"])

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 9), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

word_vectorizer = TfidfVectorizer(analyzer='word',
                                  max_features=30000,
                                  ngram_range=(1, 2))

word_vectorizer.fit(train["Phrase"])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [14]:
X_train_char = char_vectorizer.transform(train["Phrase"])

print(X_train_char.shape)
X_train_char

(156059, 10000)


<156059x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 18143071 stored elements in Compressed Sparse Row format>

In [15]:
X_train_word = word_vectorizer.transform(train["Phrase"])

print(X_train_word.shape)
X_train_word

(156059, 30000)


<156059x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 1508136 stored elements in Compressed Sparse Row format>

In [86]:
from scipy.sparse import hstack

X_train = hstack([X_train_char, X_train_word])

print(X_train.shape)
X_train

(156059, 40000)


<156059x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 19651207 stored elements in COOrdinate format>

In [17]:
X_test_char = char_vectorizer.transform(test["Phrase"])

print(X_test_char.shape)
X_test_char

(66292, 10000)


<66292x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 7119629 stored elements in Compressed Sparse Row format>

In [18]:
X_test_word = word_vectorizer.transform(test["Phrase"])

print(X_test_word.shape)
X_test_word

(66292, 30000)


<66292x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 499394 stored elements in Compressed Sparse Row format>

In [19]:
from scipy.sparse import hstack

X_test = hstack([X_test_char, X_test_word])

print(X_test.shape)
X_test

(66292, 40000)


<66292x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 7619023 stored elements in COOrdinate format>

In [85]:
y_train = train["Sentiment"]

print(y_train.shape)
y_train.head()

(156059,)


PhraseId
1    1
2    2
3    2
4    2
5    2
Name: Sentiment, dtype: int64

## Score

In [87]:
import scipy

chunk = np.array([1] * X_train.shape[1])

X_train = scipy.sparse.vstack([X_train, chunk])
y_train = np.concatenate([y_train, [1]])

In [88]:
sentence_ids = train["SentenceId"].copy().append(pd.Series([8544]))

print(sentence_ids.shape)
sentence_ids.head()

(156060,)


1    1
2    1
3    1
4    1
5    1
dtype: int64

### Tune Hyperparameters

In [93]:
import xgboost as xgb
import random
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score, GroupKFold
from sklearn.metrics import accuracy_score

seed = 37

num_epoch = 100

hyperparameters_list = []

X_train = X_train.tocsr()
X_test = X_test.tocsr()

for epoch in range(num_epoch):
    np.random.seed(epoch)

    num_round = 100
    
    np.random.seed(epoch * 10 + 1)
    lambda_ = 10 ** np.random.uniform(-1, -9)

    np.random.seed(epoch * 10 + 2)
    alpha = 10 ** np.random.uniform(1, -9)

    np.random.seed(epoch * 10 + 3)
    lambda_bias = 10 ** np.random.uniform(1, -9)
    
    np.random.seed(None)

    params = {
        'booster': 'gblinear',
        'objective': 'multi:softmax',
        'eval_metric': 'merror',
        'lambda': lambda_,
        'alpha': alpha,
        'lambda_bias': lambda_bias,
        'num_class': 5,
        'nthread': 8,
        'silent': 1,
    }

    dtrain = xgb.DMatrix(X_train, label=y_train)
    
    result = xgb.cv(params, dtrain, num_round, nfold=5, metrics={'merror'})
    num_best_round = result["test-merror-mean"].argmin()

    kfold = GroupKFold(n_splits=5)
    
    scores = []

    for train_index, test_index in kfold.split(X_train, y_train, groups=sentence_ids):
        X_train_kf, X_test_kf = X_train[train_index], X_train[test_index]
        y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]
        
        dtrain = xgb.DMatrix(X_train_kf, label=y_train_kf)

        booster = xgb.train(params, dtrain, num_boost_round=num_best_round)

        dtest = xgb.DMatrix(X_test_kf.toarray(), label=y_test_kf)

        predictions = booster.predict(dtest)
        
        score = accuracy_score(y_test_kf, predictions)
        
        scores.append(score)

    score = np.array(scores).mean()

    print("{0:3} lambda = {1:.10f}, alpha = {2:.10f}, lambda_bias = {3:.10f} Round = {4:3}, Score = {5:.5f}".format(epoch, lambda_, alpha, lambda_bias, num_best_round, score))

    hyperparameters_list.append({
        'epoch': epoch,
        'lambda_': lambda_,
        'alpha': alpha,
        'lambda_bias': lambda_bias,
        'num_best_round': num_best_round,
        'alpha': alpha,
        'score': score,
    })

hyperparameters_list = pd.DataFrame.from_dict(hyperparameters_list)
hyperparameters_list = hyperparameters_list.sort_values(by="score", ascending=False)

hyperparameters_list.head()

  0 lambda = 0.0000461131, alpha = 0.0004365671, lambda_bias = 0.0000310471 Round =   0, Score = 0.04532
  1 lambda = 0.0036127881, alpha = 0.2873237832, lambda_bias = 0.0000001671 Round =   9, Score = 0.59316
  2 lambda = 0.0407568821, alpha = 0.0822990129, lambda_bias = 0.0000671462 Round =  13, Score = 0.58131
  3 lambda = 0.0005147181, alpha = 0.0000000258, lambda_bias = 0.0327264370 Round =   0, Score = 0.04532
  4 lambda = 0.0009831301, alpha = 0.0017972099, lambda_bias = 0.7070568533 Round =   0, Score = 0.04532
  5 lambda = 0.0000003928, alpha = 0.0000000587, lambda_bias = 0.0000000341 Round =   0, Score = 0.04532
  6 lambda = 0.0000000254, alpha = 4.5967712855, lambda_bias = 0.0000288795 Round =  92, Score = 0.59143
  7 lambda = 0.0032764067, alpha = 0.8562842924, lambda_bias = 0.0000037420 Round =  97, Score = 0.60400
  8 lambda = 0.0001434323, alpha = 0.0176906301, lambda_bias = 0.0268890667 Round =   0, Score = 0.04532
  9 lambda = 0.0024658760, alpha = 0.0000000138, lambda

 79 lambda = 0.0000002151, alpha = 0.0000000031, lambda_bias = 0.0000001391 Round =   0, Score = 0.04532
 80 lambda = 0.0000085831, alpha = 0.0001945728, lambda_bias = 0.1422965074 Round =   0, Score = 0.04532
 81 lambda = 0.0000017865, alpha = 0.0000891645, lambda_bias = 0.0000000016 Round =   0, Score = 0.04532
 82 lambda = 0.0000079904, alpha = 0.0993125854, lambda_bias = 3.5133508336 Round =   0, Score = 0.04532
 83 lambda = 0.0000000011, alpha = 0.0000000037, lambda_bias = 0.0000000830 Round =   0, Score = 0.04532
 84 lambda = 0.0000457808, alpha = 0.0000046546, lambda_bias = 0.0041297433 Round =   0, Score = 0.04532
 85 lambda = 0.0002500314, alpha = 0.0550516579, lambda_bias = 0.0015108700 Round =   0, Score = 0.04532
 86 lambda = 0.0000001106, alpha = 0.0000000013, lambda_bias = 0.0000001832 Round =   0, Score = 0.04532
 87 lambda = 0.0000007404, alpha = 0.0198109678, lambda_bias = 0.0115270332 Round =   0, Score = 0.04532
 88 lambda = 0.0000403355, alpha = 0.0000041366, lambda

Unnamed: 0,alpha,epoch,lambda_,lambda_bias,num_best_round,score
50,1.286904,50,0.002186753,6.191707,97,0.613719
54,1.508578,54,6.8301e-06,1.06476e-09,99,0.612854
34,2.236949,34,0.0007562489,1.775987,85,0.60735
99,2.487204,99,0.01760702,0.03364727,92,0.605101
55,2.550211,55,7.576831e-07,1.179943e-08,99,0.605088


* default: 0.59613

## Predict

In [95]:
dtrain = xgb.DMatrix(X_train, label=y_train)

params = {
    'booster': 'gblinear',
    'objective': 'multi:softmax',
    'eval_metric': 'merror',
    'lambda': 2.186753e-03,
    'alpha': 1.286904,
    'lambda_bias': 6.191707e+00,
    'num_class': 5,
    'nthread': 8,
    'silent': 1,
}

booster = xgb.train(params, dtrain, num_boost_round=98)

In [98]:
dtest = xgb.DMatrix(X_test.toarray())

predictions = booster.predict(dtest)

print(predictions.shape)
predictions[0:10]

(66292,)


array([ 3.,  3.,  2.,  3.,  3.,  3.,  3.,  2.,  3.,  2.], dtype=float32)

## Submit

In [101]:
submission = pd.read_csv("../data/sampleSubmission.csv", index_col="PhraseId")

submission["Sentiment"] = predictions.astype('int')

print(submission.shape)
submission.head()

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,3
156062,3
156063,2
156064,3
156065,3


In [102]:
submission.to_csv("../submissions/10-use-xgboost.csv")