In [1]:
import pandas as pd

## Load Dataset

In [4]:
train = pd.read_csv("data/movie_train.tsv", sep="\t", index_col="PhraseId")

print(train.shape)
train.head(10)

(156060, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2
6,1,of escapades demonstrating the adage that what...,2
7,1,of,2
8,1,escapades demonstrating the adage that what is...,2
9,1,escapades,2
10,1,demonstrating the adage that what is good for ...,2


In [5]:
test = pd.read_csv("data/movie_test.tsv", sep="\t", index_col="PhraseId")

print(test.shape)
test.head()

(66292, 2)


Unnamed: 0_level_0,SentenceId,Phrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,8545,An intermittently pleasing but mostly routine ...
156062,8545,An intermittently pleasing but mostly routine ...
156063,8545,An
156064,8545,intermittently pleasing but mostly routine effort
156065,8545,intermittently pleasing but mostly routine


## Preprocessing

In [57]:
train["Phrase(origin)"] = train["Phrase"]

print(train.shape)
train[100:120]

(156060, 4)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
101,3,would have a hard time sit through this one .,1,would have a hard time sit through this one .
102,3,would have a hard time sit through this one,0,would have a hard time sit through this one
103,3,would,2,would
104,3,have a hard time sit through this one,0,have a hard time sit through this one
105,3,have,2,have
106,3,a hard time sit through this one,1,a hard time sit through this one
107,3,a hard time,1,a hard time
108,3,hard time,1,hard time
109,3,hard,2,hard
110,3,time,2,time


In [7]:
test["Phrase(origin)"] = test["Phrase"]

print(test.shape)
test.head()

(66292, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
156061,8545,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,8545,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,8545,An,An
156064,8545,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,8545,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Clean Text

In [29]:
!pip install nltk
!pip install tqdm



In [59]:
train.columns

Index(['SentenceId', 'Phrase', 'Sentiment', 'Phrase(origin)'], dtype='object')

In [72]:
from nltk.stem.snowball import SnowballStemmer
from tqdm import tqdm

stemmer = SnowballStemmer('english')
phrase = train.iloc[0]["Phrase"]
print(phrase)

a seri of escapad demonstr the adag that what is good for the goo is also good for the gander , some of which occas amus but none of which amount to much of a stori .


In [73]:
words = phrase.split(" ")
words

['a',
 'seri',
 'of',
 'escapad',
 'demonstr',
 'the',
 'adag',
 'that',
 'what',
 'is',
 'good',
 'for',
 'the',
 'goo',
 'is',
 'also',
 'good',
 'for',
 'the',
 'gander',
 ',',
 'some',
 'of',
 'which',
 'occas',
 'amus',
 'but',
 'none',
 'of',
 'which',
 'amount',
 'to',
 'much',
 'of',
 'a',
 'stori',
 '.']

In [74]:
for word in words:
    stemmed_word = stemmer.stem(word)
    print(stemmed_word)

a
seri
of
escapad
demonstr
the
adag
that
what
is
good
for
the
goo
is
also
good
for
the
gander
,
some
of
which
occa
amus
but
none
of
which
amount
to
much
of
a
stori
.


In [84]:
from nltk.stem.snowball import SnowballStemmer
from tqdm import tqdm

stemmer = SnowballStemmer('english')

def stem_phrase(phrase):
    words = phrase.split(" ")
    stemmed_words = []

    for word in words:
        stemmed_word = stemmer.stem(word)
        stemmed_words.append(stemmed_word)

    stemmed_phrase = " ".join(stemmed_words)

    return stemmed_phrase

tqdm.pandas(desc="Stemming... (train)")

train["Phrase"] = train.iloc[0:3]["Phrase"].progress_apply(stem_phrase)

print(train.shape)
train.head




Stemming... (train):   0%|          | 0/3 [00:00<?, ?it/s][A[A[A


Stemming... (train): 100%|██████████| 3/3 [00:00<00:00, 2161.27it/s]

(156060, 4)


[A[A[A

<bound method NDFrame.head of           SentenceId                                             Phrase  \
PhraseId                                                                  
1                  1  a seri of escapad demonstr the adag that what ...   
2                  1  a seri of escapad demonstr the adag that what ...   
3                  1                                             a seri   
4                  1                                                NaN   
5                  1                                                NaN   
6                  1                                                NaN   
7                  1                                                NaN   
8                  1                                                NaN   
9                  1                                                NaN   
10                 1                                                NaN   
11                 1                                                Na

In [37]:
tqdm.pandas(desc="Stemming... (test)")

test["Phrase"] = test["Phrase"].progress_apply(stem_phrase)

print(test.shape)
test.head()

Stemming... (test): 100%|██████████| 66292/66292 [00:06<00:00, 10735.34it/s]

(66292, 3)





Unnamed: 0_level_0,SentenceId,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
156061,8545,an intermitt pleas but most routin effort .,An intermittently pleasing but mostly routine ...
156062,8545,an intermitt pleas but most routin effort,An intermittently pleasing but mostly routine ...
156063,8545,an,An
156064,8545,intermitt pleas but most routin effort,intermittently pleasing but mostly routine effort
156065,8545,intermitt pleas but most routin,intermittently pleasing but mostly routine


In [39]:
def clean_text(phrase):
    # phrase = phrase.replace("disappointments", "disappointment")
    phrase = phrase.replace("n't", "not")

    return phrase

train["Phrase"] = train["Phrase"].apply(clean_text)

### One hot encode Phrase

- analyzer('char') 글자 단위로 벡터화
- analyzer('word') 단어 단위로 벡터화

### xg boost xgb
- granient boosting
- 좋은 성능을 낼 수 있다
- svm
- random forest
-eefjreeeretteedwsdfolkj

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = CountVectorizer(max_features=30000, ngram_range=(1, 2))

vectorizer.fit(train["Phrase"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [41]:
X_train = vectorizer.transform(train["Phrase"])

print(X_train.shape)
X_train

(156060, 30000)


<156060x30000 sparse matrix of type '<class 'numpy.int64'>'
	with 1509189 stored elements in Compressed Sparse Row format>

In [42]:
X_test = vectorizer.transform(test["Phrase"])

print(X_test.shape)
X_test

(66292, 30000)


<66292x30000 sparse matrix of type '<class 'numpy.int64'>'
	with 487526 stored elements in Compressed Sparse Row format>

In [43]:
y_train = train["Sentiment"]

print(y_train.shape)
y_train.head()

(156060,)


PhraseId
1    1
2    2
3    2
4    2
5    2
Name: Sentiment, dtype: int64

In [44]:
train_vector = pd.DataFrame(X_train.toarray(), columns=vectorizer.get_feature_names())

print(train_vector.shape)
train_vector.head()

(156060, 30000)


Unnamed: 0,000,000 time,10,10 000,10 minut,10 or,10 second,10 set,10 year,100,...,zish,zish and,zombi,zombi you,zone,zone and,zone arm,zone episod,zucker,zucker brothers
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Score

In [55]:
SGDClassifier?

In [45]:
from sklearn.linear_model import SGDClassifier

seed = 37

model = SGDClassifier(n_jobs=-1,
                      random_state=seed)

model

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=-1,
       penalty='l2', power_t=0.5, random_state=37, shuffle=True, verbose=0,
       warm_start=False)

In [56]:
cross_val_score?

In [46]:
from sklearn.model_selection import cross_val_score, GroupKFold
# from sklearn.cross_validation import cross_val_score

kfold = GroupKFold(n_splits=5)

score = cross_val_score(model, X_train, y_train, cv=kfold, groups=train["SentenceId"]).mean()

print("Score = {0:.5f}".format(score))

Score = 0.58895


## Predict

In [47]:
model.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=-1,
       penalty='l2', power_t=0.5, random_state=37, shuffle=True, verbose=0,
       warm_start=False)

In [48]:
predictions = model.predict(X_test)

print(predictions.shape)
predictions[0:10]

(66292,)


array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Submit

In [51]:
submission = pd.read_csv("sampleSubmission.csv", index_col="PhraseId")

submission["Sentiment"] = predictions

print(submission.shape)
submission.head()

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,2
156062,2
156063,2
156064,2
156065,2


In [50]:
submission.to_csv("baseline-script.csv")

NameError: name 'submission' is not defined