In [1]:
import pandas as pd

### Load Dataset

In [4]:
train = pd.read_csv("train.tsv", sep="\t", index_col="PhraseId")

print(train.shape)
train.head()

(156060, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [5]:
test = pd.read_csv("test.tsv", sep="\t", index_col="PhraseId")

print(test.shape)
test.head()

(66292, 2)


Unnamed: 0_level_0,SentenceId,Phrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,8545,An intermittently pleasing but mostly routine ...
156062,8545,An intermittently pleasing but mostly routine ...
156063,8545,An
156064,8545,intermittently pleasing but mostly routine effort
156065,8545,intermittently pleasing but mostly routine


## Preprocessing

In [6]:
train["Phrase(origin)"] = train["Phrase"].copy()

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [7]:
test["Phrase(origin)"] = test["Phrase"].copy()

print(test.shape)
test[["Phrase", "Phrase(origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Clean Text

In [8]:
def clean_text(phrase):
    phrase = phrase.replace("doesn't ", "does not ")
    phrase = phrase.replace("ca n't ", "can not ")
    phrase = phrase.replace(" n't ", " not ")

    return phrase

train["Phrase"] = train["Phrase"].apply(clean_text)

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [9]:
test["Phrase"] = test["Phrase"].apply(clean_text)

print(test.shape)
test[["Phrase", "Phrase(origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Stem phrases

In [10]:
from tqdm import tqdm
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')

def stem_phrase(phrase):
    stemmed_words = [stemmer.stem(w) for w in phrase.split(" ")]
    stemmed_phrase = " ".join(stemmed_words)

    return stemmed_phrase

tqdm.pandas(desc="Stemming...")
train["Phrase"].progress_apply(stem_phrase).head()

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

Stemming...: 100%|██████████| 156060/156060 [00:17<00:00, 8994.56it/s]

(156060, 4)





Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [11]:
tqdm.pandas(desc="Stemming...")
test["Phrase"].progress_apply(stem_phrase).head()

print(test.shape)
test[["Phrase", "Phrase(origin)"]].head()

Stemming...: 100%|██████████| 66292/66292 [00:07<00:00, 9455.59it/s] 

(66292, 3)





Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Vectorize phrases

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

char_vectorizer = TfidfVectorizer(analyzer='char', max_features=10000, ngram_range=(1, 9))
char_vectorizer

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 9), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [13]:
char_vectorizer.fit(train["Phrase"])

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 9), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [14]:
X_train_char = char_vectorizer.transform(train["Phrase"])
X_train_char

<156060x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 20095574 stored elements in Compressed Sparse Row format>

In [15]:
X_test_char = char_vectorizer.transform(test["Phrase"])
X_test_char

<66292x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 7897923 stored elements in Compressed Sparse Row format>

In [26]:
columns = char_vectorizer.get_feature_names()
columns

[' ',
 " '",
 " ' ",
 " ''",
 " '' ",
 " 'r",
 " 're",
 " 're ",
 " 's",
 " 's ",
 " 's a",
 " 's a ",
 " 's b",
 " 's c",
 " 's d",
 " 's e",
 " 's f",
 " 's l",
 " 's m",
 " 's n",
 " 's no",
 " 's p",
 " 's r",
 " 's s",
 " 's t",
 " 's w",
 " 'v",
 " 've",
 " 've ",
 ' ,',
 ' , ',
 ' , a',
 ' , a ',
 ' , an',
 ' , and',
 ' , and ',
 ' , and t',
 ' , b',
 ' , bu',
 ' , but',
 ' , but ',
 ' , but i',
 ' , c',
 ' , co',
 ' , d',
 ' , e',
 ' , f',
 ' , g',
 ' , h',
 ' , i',
 ' , in',
 ' , it',
 ' , l',
 ' , m',
 ' , n',
 ' , no',
 ' , o',
 ' , p',
 ' , r',
 ' , s',
 ' , se',
 ' , so',
 ' , t',
 ' , th',
 ' , the',
 ' , the ',
 ' , to',
 ' , u',
 ' , un',
 ' , w',
 ' , wh',
 ' , whi',
 ' , wi',
 ' , wit',
 ' , with',
 ' , with ',
 ' -',
 ' - ',
 ' --',
 ' -- ',
 ' -- a',
 ' -- an',
 ' -l',
 ' -lr',
 ' -lrb',
 ' -lrb-',
 ' -lrb- ',
 ' -r',
 ' -rr',
 ' -rrb',
 ' -rrb-',
 ' -rrb- ',
 ' .',
 ' ..',
 ' ...',
 ' ... ',
 ' 1',
 ' 19',
 ' 2',
 ' :',
 ' : ',
 ' ;',
 ' `',
 ' ` ',
 ' ``',
 ' `` '

In [16]:
word_vectorizer = TfidfVectorizer(analyzer='word', max_features=30000, ngram_range=(1, 2))
word_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [17]:
word_vectorizer.fit(train["Phrase"])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [18]:
X_train_word = word_vectorizer.transform(train["Phrase"])
X_train_word

<156060x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 1443766 stored elements in Compressed Sparse Row format>

In [19]:
X_test_word = word_vectorizer.transform(test["Phrase"])
X_test_word

<66292x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 469299 stored elements in Compressed Sparse Row format>

In [20]:
from scipy.sparse import hstack

X_train = hstack([X_train_char, X_train_word])
X_train

<156060x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 21539340 stored elements in COOrdinate format>

In [21]:
X_test = hstack([X_test_char, X_test_word])
X_test

<66292x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 8367222 stored elements in COOrdinate format>

In [22]:
# columns = word_vectorizer.get_feature_names()
# pd.DataFrame(X_train.tocsr()[:100].toarray(), columns=columns).head()

In [23]:
y_train = train["Sentiment"]

print(y_train.shape)
y_train.head()

(156060,)


PhraseId
1    1
2    2
3    2
4    2
5    2
Name: Sentiment, dtype: int64

In [24]:
sentence_ids = train["SentenceId"]

print(sentence_ids.shape)
sentence_ids.head()

(156060,)


PhraseId
1    1
2    1
3    1
4    1
5    1
Name: SentenceId, dtype: int64

In [1]:
1/156060

6.407791874919903e-06

## Score

In [23]:
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(alpha=0.000006762746, random_state=37)
model

SGDClassifier(alpha=6.762746e-06, average=False, class_weight=None,
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=37, shuffle=True, verbose=0,
       warm_start=False)

In [24]:
# from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GroupKFold

kfold = GroupKFold(n_splits=5)

y_predict = cross_val_predict(model, X_train, y_train,
                              cv=kfold, groups=sentence_ids)

print(y_predict.shape)
y_predict[0:10]

(156059,)


array([1, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [25]:
from sklearn.metrics import accuracy_score

score = accuracy_score(y_train, y_predict)
print("Score = {0:.5f}".format(score))

Score = 0.60041


In [26]:
import numpy as np

result = train.copy()
result["Sentiment(predict)"] = y_predict
result["Difference(Phrase)"] = np.abs(y_train - y_predict)

print(result.shape)
result.head()

(156059, 6)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(origin),Sentiment(predict),Difference(Phrase)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,A series of escapades demonstrating the adage ...,1,A series of escapades demonstrating the adage ...,1,0
2,1,A series of escapades demonstrating the adage ...,2,A series of escapades demonstrating the adage ...,2,0
3,1,A series,2,A series,2,0
4,1,A,2,A,2,0
5,1,series,2,series,2,0


In [27]:
sentiment = result.groupby("SentenceId")["Difference(Phrase)"].mean()
print(sentiment.shape)
sentiment.head()

(8529,)


SentenceId
1    0.190476
2    0.388889
3    0.285714
4    0.375000
5    0.600000
Name: Difference(Phrase), dtype: float64

In [28]:
def find_sentiment(sentence_id):
    return sentiment.loc[sentence_id]

result["Difference(Sentence)"] = result["SentenceId"].apply(find_sentiment)
result = result.sort_values(by="Difference(Sentence)", ascending=False)

print(result.shape)
result.head()

(156059, 7)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(origin),Sentiment(predict),Difference(Phrase),Difference(Sentence)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
113468,6031,is well below expectations,1,is well below expectations,4,3,3.0
113467,6031,is well below expectations .,0,is well below expectations .,4,4,3.0
113466,6031,Below is well below expectations .,0,Below is well below expectations .,4,4,3.0
113469,6031,below expectations,3,below expectations,2,1,3.0
146850,7989,nothing short of a minor miracle in Unfaithful,3,nothing short of a minor miracle in Unfaithful,0,3,2.5


In [29]:
result[0:1000].to_csv("result.csv")

In [30]:
# vocabulary = vectorizer.get_feature_names()
# vocabulary[0:3]

In [31]:
# pd.DataFrame(vocabulary, columns=["word"]).to_csv("vocabulary.csv")

In [32]:
# result[result["Phrase"].str.contains("can not recommend")]

## Train

In [33]:
!pip install xgboost



In [34]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)

params = {
    'booster': 'gblinear',
    'objective': 'multi:softmax',
    'eval_metric': 'merror',
    'lambda': 2.186753e-03,
    'alpha': 1.286904,
    'lambda_bias': 6.191707e+00,
    'num_class': 5,
    'nthread': 8,
    'silent': 1,
}

%time booster = xgb.train(params, dtrain, num_boost_round=98)



CPU times: user 1min 29s, sys: 104 ms, total: 1min 29s
Wall time: 30.5 s


In [35]:
dtest = xgb.DMatrix(X_test.toarray())

predictions = booster.predict(dtest)

print(predictions.shape)
predictions[0:10]

(66292,)


array([ 3.,  3.,  2.,  3.,  3.,  3.,  3.,  2.,  3.,  2.], dtype=float32)

## Submit

In [36]:
submission = pd.read_csv("data/sampleSubmission.csv", index_col="PhraseId")

submission["Sentiment"] = predictions.astype('int')

print(submission.shape)
submission.head()

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,3
156062,3
156063,2
156064,3
156065,3


In [37]:
# 경로(ex: baseline-script.csv)는 사용자 설정마다 다름
submission.to_csv("baseline-script.csv")