In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip
/kaggle/input/word2vec-nlp-tutorial/sampleSubmission.csv
/kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip
/kaggle/input/word2vec-nlp-tutorial/unlabeledTrainData.tsv.zip


In [2]:
# Load the labeled training reviews directly from the zipped, tab-separated file.
# header=0 takes the column names from the first row; quoting=3 is csv.QUOTE_NONE,
# so double quotes are kept as literal characters instead of being parsed as quoting
# (which is why the ids and raw reviews still show their surrounding quotes).
df = pd.read_csv("/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip",
                header=0, delimiter='\t', quoting=3)

# Preview the first rows.
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [3]:
import re
from bs4 import BeautifulSoup as bs

# Remove HTML markup and special characters.
def preprocessing(x):
    """Return the review text with HTML stripped and non-word characters blanked.

    Args:
        x: Raw review string, possibly containing HTML markup.

    Returns:
        The text extracted by BeautifulSoup's 'html.parser', with every
        character matched by the regex class ``\\W`` replaced by a single space.
    """
    x = bs(x, 'html.parser').get_text()
    # Raw string on purpose: '\W' inside a plain string literal is an invalid
    # escape sequence and raises a DeprecationWarning/SyntaxWarning on recent Pythons.
    x = re.sub(r'\W', " ", x)

    return x

# Clean every review; the function is passed to map() directly, no lambda wrapper needed.
df['review'] = df['review'].map(preprocessing)
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,With all this stuff going down at the moment ...
1,"""2381_9""",1,The Classic War of the Worlds by Timothy ...
2,"""7759_3""",0,The film starts with a manager Nicholas Bell...
3,"""3630_4""",0,It must be assumed that those who praised thi...
4,"""9495_8""",1,Superbly trashy and wondrously unpretentious ...


In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Stop-word removal and tokenization.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, building it only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def tokenizing(words):
    """Lower-case a review, tokenize it and drop English stop words.

    Args:
        words: Review text as a single string.

    Returns:
        List of lower-cased tokens produced by ``word_tokenize`` that are not
        in the NLTK English stop-word list.
    """
    # The stop-word set is the same for every review, so it is fetched from a
    # cache instead of being rebuilt on each call (this runs once per row).
    stop_words = _english_stop_words()
    tokens = word_tokenize(words.lower())

    return [x for x in tokens if x not in stop_words]

# Tokenize every cleaned review into a new 'words' column; map() takes the function directly.
df['words'] = df['review'].map(tokenizing)
df.head()

Unnamed: 0,id,sentiment,review,words
0,"""5814_8""",1,With all this stuff going down at the moment ...,"[stuff, going, moment, mj, started, listening,..."
1,"""2381_9""",1,The Classic War of the Worlds by Timothy ...,"[classic, war, worlds, timothy, hines, enterta..."
2,"""7759_3""",0,The film starts with a manager Nicholas Bell...,"[film, starts, manager, nicholas, bell, giving..."
3,"""3630_4""",0,It must be assumed that those who praised thi...,"[must, assumed, praised, film, greatest, filme..."
4,"""9495_8""",1,Superbly trashy and wondrously unpretentious ...,"[superbly, trashy, wondrously, unpretentious, ..."


In [5]:
# Import from the public Keras namespace: tensorflow.python.* is TensorFlow's
# private implementation package and is not a supported import path.
from tensorflow.keras.preprocessing.text import Tokenizer

# Convert tokens to integer indices.
token = Tokenizer()
token.fit_on_texts(df['words'])

# One list of word indices per review.
df['vector'] = token.texts_to_sequences(df['words'])
df.head()

Unnamed: 0,id,sentiment,review,words,vector
0,"""5814_8""",1,With all this stuff going down at the moment ...,"[stuff, going, moment, mj, started, listening,...","[410, 71, 425, 8956, 511, 2484, 116, 54, 881, ..."
1,"""2381_9""",1,The Classic War of the Worlds by Timothy ...,"[classic, war, worlds, timothy, hines, enterta...","[236, 207, 3086, 3611, 7239, 321, 2, 411, 155,..."
2,"""7759_3""",0,The film starts with a manager Nicholas Bell...,"[film, starts, manager, nicholas, bell, giving...","[2, 388, 2854, 4457, 3780, 604, 2210, 18035, 5..."
3,"""3630_4""",0,It must be assumed that those who praised thi...,"[must, assumed, praised, film, greatest, filme...","[101, 4896, 5399, 2, 688, 670, 1272, 42, 215, ..."
4,"""9495_8""",1,Superbly trashy and wondrously unpretentious ...,"[superbly, trashy, wondrously, unpretentious, ...","[3409, 4193, 37747, 11135, 859, 2062, 13202, 1..."


In [6]:
# Inspect the vocabulary (word -> integer index) built by the tokenizer.

vocab = token.word_index
vocab

{'movie': 1,
 'film': 2,
 'one': 3,
 'like': 4,
 'good': 5,
 'time': 6,
 'even': 7,
 'would': 8,
 'story': 9,
 'really': 10,
 'see': 11,
 'well': 12,
 'much': 13,
 'get': 14,
 'bad': 15,
 'people': 16,
 'also': 17,
 'first': 18,
 'great': 19,
 'made': 20,
 'way': 21,
 'make': 22,
 'could': 23,
 'movies': 24,
 'think': 25,
 'characters': 26,
 'character': 27,
 'watch': 28,
 'two': 29,
 'films': 30,
 'seen': 31,
 'many': 32,
 'life': 33,
 'plot': 34,
 'acting': 35,
 'never': 36,
 'love': 37,
 'little': 38,
 'best': 39,
 'show': 40,
 'know': 41,
 'ever': 42,
 'man': 43,
 'better': 44,
 'end': 45,
 'still': 46,
 'say': 47,
 'scene': 48,
 'scenes': 49,
 'go': 50,
 'something': 51,
 'back': 52,
 'real': 53,
 'watching': 54,
 'though': 55,
 'old': 56,
 'thing': 57,
 'years': 58,
 'actors': 59,
 'director': 60,
 'work': 61,
 'another': 62,
 'new': 63,
 'nothing': 64,
 'funny': 65,
 '10': 66,
 'actually': 67,
 'makes': 68,
 'look': 69,
 'find': 70,
 'going': 71,
 'part': 72,
 'lot': 73,
 'every

In [7]:
# Register a padding token at index 0.
# NOTE(review): `vocab` was bound to token.word_index in an earlier cell, so this
# most likely mutates the tokenizer's own dictionary as well — confirm that is intended.
vocab["<PAD>"] = 0
# Vocabulary size, including the <PAD> entry just added.
len(vocab)

75789

In [8]:
# Inspect the distribution of sequence lengths; the 75th percentile is used
# as the maximum sequence length in the padding step.

# Compute the per-review lengths once instead of re-mapping the column for every statistic.
review_lengths = df['vector'].map(len)

print("min word length", review_lengths.min())
print("25% word length", review_lengths.quantile(0.25))
print("50% word length",review_lengths.quantile(0.50))
print("75% word length",review_lengths.quantile(0.75))
print("max word length",review_lengths.max())

min word length 4
25% word length 64.0
50% word length 90.0
75% word length 148.0
max word length 1429


In [9]:
# Import from the public Keras namespace: tensorflow.python.* is TensorFlow's
# private implementation package and is not a supported import path.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Maximum sequence length: the 75th percentile of tokens per review, as a plain int.
max_padding = int(df['vector'].map(len).quantile(0.75))

# Force every sequence to max_padding: shorter ones are zero-padded at the end
# (padding='post'), longer ones are cropped.
# NOTE(review): `truncating` is left at its default, so judging by the first printed
# row long reviews lose tokens from their start, not their end — confirm this is intended.
X_train = pad_sequences(df['vector'], maxlen=max_padding, padding='post')
Y_train = df['sentiment']
print(X_train)

[[26373   121     1 ... 18947   320  1372]
 [  236   207  3086 ...     0     0     0]
 [    3   719 18948 ...   707  1187  5398]
 ...
 [  118  3144    14 ...     0     0     0]
 [  831   644   521 ...     0     0     0]
 [  110     1   354 ...     0     0     0]]


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Compute TF-IDF features over word 1- to 3-grams, capped at 5000 features.
vectorizer = TfidfVectorizer(
    analyzer="word",
    stop_words='english',
    ngram_range=(1, 3),
    max_features=5000,
    min_df=0.0,
    sublinear_tf=True,
)

tfidf_train = vectorizer.fit_transform(df['review'].tolist())

# Show the container type, the matrix shape and the non-zero entries of the first review.
print(type(tfidf_train))
print(tfidf_train.shape)
print(tfidf_train[0])

# tfidf_train.toarray()

<class 'scipy.sparse.csr.csr_matrix'>
(25000, 5000)
  (0, 2502)	0.1028129880638737
  (0, 1285)	0.06783600784085628
  (0, 4212)	0.09798879691949967
  (0, 4484)	0.10513903894444403
  (0, 3287)	0.09207262080751873
  (0, 2998)	0.09979596337739731
  (0, 4479)	0.09655916967157495
  (0, 3286)	0.1049730103164135
  (0, 37)	0.09626873961802275
  (0, 2776)	0.09875448073816731
  (0, 1638)	0.17021192744889807
  (0, 2448)	0.10116830229528975
  (0, 2825)	0.171073173099871
  (0, 2174)	0.06334114094305214
  (0, 4298)	0.06275013728447221
  (0, 1559)	0.0683315613411591
  (0, 1571)	0.050147944199345866
  (0, 1305)	0.1025268801175609
  (0, 815)	0.10635206492339642
  (0, 1204)	0.056885940864640605
  (0, 1280)	0.03764404759175515
  (0, 4304)	0.07530453665252453
  (0, 1875)	0.06589300465590953
  (0, 312)	0.07042547866816848
  (0, 3348)	0.08579945561982709
  :	:
  (0, 2299)	0.08305401895465898
  (0, 2022)	0.160515236053926
  (0, 2890)	0.058087806097562915
  (0, 2758)	0.038533095865286734
  (0, 1399)	0.10653310

In [11]:
from sklearn.feature_extraction.text import CountVectorizer ## TF 

# Count vectorization (raw term counts):
# documents are compared by which words occur in them.
# NOTE(review): this rebinds `vectorizer`, which the TF-IDF cell above also assigns,
# so the fitted TfidfVectorizer is no longer reachable after this cell runs.
vectorizer = CountVectorizer(max_features=5000, analyzer="word")
count_train = vectorizer.fit_transform(df['review'].tolist())

# Matrix shape and the non-zero counts of the first review.
print(count_train.shape)
print(count_train[0])

(25000, 5000)
  (0, 4917)	4
  (0, 194)	4
  (0, 4465)	11
  (0, 4259)	1
  (0, 1939)	3
  (0, 1345)	1
  (0, 339)	2
  (0, 4439)	19
  (0, 2897)	1
  (0, 4722)	2
  (0, 4175)	1
  (0, 2623)	1
  (0, 4517)	9
  (0, 2128)	3
  (0, 2956)	2
  (0, 4822)	1
  (0, 3091)	1
  (0, 1319)	1
  (0, 2098)	1
  (0, 233)	10
  (0, 4451)	1
  (0, 4820)	2
  (0, 152)	1
  (0, 2785)	3
  (0, 2441)	3
  :	:
  (0, 2359)	1
  (0, 620)	1
  (0, 1918)	1
  (0, 4606)	1
  (0, 4370)	1
  (0, 1550)	1
  (0, 1958)	1
  (0, 3305)	1
  (0, 4850)	2
  (0, 353)	1
  (0, 1885)	1
  (0, 4269)	1
  (0, 1332)	1
  (0, 677)	1
  (0, 428)	1
  (0, 1262)	1
  (0, 457)	1
  (0, 868)	1
  (0, 1340)	1
  (0, 1631)	1
  (0, 1448)	1
  (0, 1620)	1
  (0, 4263)	1
  (0, 2164)	1
  (0, 2533)	1


In [12]:
from gensim.models import word2vec

# word2vec
# One whitespace-split token list per cleaned review.
sentences = [review.split() for review in df['review']]

num_features   = 1000   # passed as `size` (word-vector dimensionality)
min_word_count = 20     # passed as `min_count`
context        = 10     # window size
downsampling   = 1e-3   # down-sampling threshold penalising very frequent words (helps training speed)

w2v = word2vec.Word2Vec(
    sentences,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
    iter=10,
    sg=0,  # 0: CBOW, 1: skip-gram
)

# Sanity check: nearest neighbours of "man" in the learned embedding space.
w2v.wv.most_similar("man")

[('woman', 0.6372383832931519),
 ('lady', 0.6055783033370972),
 ('soldier', 0.5552583932876587),
 ('priest', 0.525982677936554),
 ('guy', 0.5193488001823425),
 ('doctor', 0.5188654661178589),
 ('person', 0.5109984874725342),
 ('farmer', 0.49898603558540344),
 ('journalist', 0.49807900190353394),
 ('boy', 0.492019385099411)]

In [13]:
# A document is represented by the mean of the word2vec vectors of its words.
def get_features(words, model, num_features):
    """Average the word vectors of all in-vocabulary words of one document.

    Args:
        words: Iterable of tokens making up the document.
        model: Trained word2vec model; its ``wv`` attribute must support
            ``token in model.wv`` and ``model.wv[token]``.
        num_features: Dimensionality of the word vectors.

    Returns:
        A float32 array of shape ``(num_features,)`` holding the mean vector.
        If none of the tokens is in the model vocabulary, the all-zero vector
        is returned instead of dividing by zero (which would yield NaNs).
    """
    # Look vectors up through model.wv: indexing the model object itself is
    # deprecated, and membership on wv avoids rebuilding a vocabulary set
    # for every single document.
    vectors = model.wv
    feature_vector = np.zeros((num_features), dtype=np.float32)

    num_words = 0
    for w in words:
        if w in vectors:
            num_words += 1
            feature_vector = np.add(feature_vector, vectors[w])

    # Guard against documents with no known word.
    if num_words:
        feature_vector = np.divide(feature_vector, num_words)

    return feature_vector

def get_dataset(reviews, model, num_features):
    """Build the feature matrix for a collection of tokenized reviews.

    Args:
        reviews: Iterable of token lists, one per review.
        model: Trained word2vec model forwarded to ``get_features``.
        num_features: Dimensionality of the word vectors.

    Returns:
        Array obtained by stacking the ``get_features`` result of every
        review along a new first axis.
    """
    per_review = [get_features(tokens, model, num_features) for tokens in reviews]
    return np.stack(per_review)

# Build the document-level word2vec features: one averaged vector per review.
word2vec_train = get_dataset(sentences, w2v, num_features)
# Display the shape of the resulting array.
word2vec_train.shape

  # This is added back by InteractiveShellApp.init_path()


(25000, 1000)

In [14]:
# Show the averaged word vector computed for the first review.
print(f"First document's averaged word vector\n{word2vec_train[0]}")

First document's averaged word vector
[ 7.92942792e-02  2.71616817e-01  1.81260392e-01  1.22539490e-01
  9.66486037e-02  6.87005222e-02 -6.72703609e-02 -4.72922325e-02
  3.01981807e-01 -1.06756156e-02  1.39669299e-01 -1.99800983e-01
  5.44578359e-02 -1.45156115e-01  4.07924414e-01 -1.14219589e-02
  6.45725951e-02 -1.19653136e-01 -5.04831791e-01  8.77734497e-02
  2.10731267e-03 -3.47134583e-02  6.17381632e-02  2.26809662e-02
 -1.53841987e-01 -9.92677435e-02 -9.16189253e-02  1.32000029e-01
  2.50606924e-01 -9.03838724e-02  1.22445226e-01  4.58620954e-03
 -7.17912540e-02  4.26879451e-02  1.73533514e-01  5.08190468e-02
 -6.92595840e-02  1.63001508e-01 -2.21768498e-01  2.85976380e-01
  1.38495192e-02  3.84116381e-01  9.68754478e-03 -3.33310187e-01
 -8.65783729e-03  2.30269924e-01 -5.69937825e-02 -1.76770121e-01
 -1.32580251e-02  5.54324463e-02 -8.18264950e-03  5.55385016e-02
 -1.59476176e-01  4.09865588e-01  4.58134664e-03 -1.85977653e-01
 -5.25313057e-03  1.69091299e-01 -3.39421742e-02  9.