<a href="https://colab.research.google.com/github/ilikemichael/ml1216/blob/main/embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
#此處先下載一個 imdb情緒分析 檔案包,包含25000pos跟 25000neg 的語意資料
import tensorflow as tf

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)

In [11]:
#下載的檔案存在dataset裡面
import glob
import os
base = os.path.dirname(dataset) #拿出前半段
glob.glob("/root/.keras/datasets/aclImdb/train/*")

['/root/.keras/datasets/aclImdb/train/pos',
 '/root/.keras/datasets/aclImdb/train/unsup',
 '/root/.keras/datasets/aclImdb/train/unsupBow.feat',
 '/root/.keras/datasets/aclImdb/train/urls_pos.txt',
 '/root/.keras/datasets/aclImdb/train/labeledBow.feat',
 '/root/.keras/datasets/aclImdb/train/urls_neg.txt',
 '/root/.keras/datasets/aclImdb/train/urls_unsup.txt',
 '/root/.keras/datasets/aclImdb/train/neg']

In [12]:
#下載的檔案存在dataset裡面, 利用glob.glob一層一層去看,裡面有pos, neg不同的語意, 此處作為說明筆記,下段才能執行程式
# import glob
# import os
# base = os.path.dirname(dataset) #拿出前半段
# glob.glob("/root/.keras/datasets/aclImdb/train/*")

In [13]:
import os
import glob
import pandas as pd

base = os.path.dirname(dataset)
def get_data(category):
    contents, targets = [], []
    dir = os.path.join(base, "aclImdb", category, "pos")
    lfn = glob.glob(os.path.join(dir, "*.txt"))
    ufn = glob.glob(os.path.join(dir, "*.TXT"))
    for fn in lfn + ufn:
        with open(fn, "r", encoding="utf-8") as f:
            contents.append(f.read())
            targets.append(1)
    dir = os.path.join(base, "aclImdb", category, "neg")
    lfn = glob.glob(os.path.join(dir, "*.txt"))
    ufn = glob.glob(os.path.join(dir, "*.TXT"))
    for fn in lfn + ufn:
        with open(fn, "r", encoding="utf-8") as f:
            contents.append(f.read())
            targets.append(0)
    df = pd.DataFrame({
        "content":contents,
        "target":targets
    })
    return df
train_df = get_data("train")
test_df = get_data("test")

In [14]:
train_df

Unnamed: 0,content,target
0,"Elvira Mistress of the Dark is just that, a ca...",1
1,In complete contrast to the opinions of the ot...,1
2,I started watching The Apprentice about 4 year...,1
3,"""Dragonlord"" sees Chan returning to his role o...",1
4,the most amazing combination of love and psych...,1
...,...,...
24995,Chan Wook Park is nothing if not inventive. I'...,0
24996,"For getting so many positive reviews, this mov...",0
24997,This movie is painfully slow and has no plot. ...,0
24998,"OK OK, it might be hard to put the entirety of...",0


In [15]:
# MLP一定要先分詞,然後序列化,tokenize, sequence
# 這裡先Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=3000)
# fit
tok.fit_on_texts(train_df["content"])

In [16]:
# transform
x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq = tok.texts_to_sequences(test_df["content"])

In [17]:
pd.DataFrame(x_train_seq)
tok.index_word[3005]
tok.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'he': 26,
 'be': 27,
 'one': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'who': 34,
 'so': 35,
 'from': 36,
 'like': 37,
 'her': 38,
 'or': 39,
 'just': 40,
 'about': 41,
 "it's": 42,
 'out': 43,
 'if': 44,
 'has': 45,
 'some': 46,
 'there': 47,
 'what': 48,
 'good': 49,
 'more': 50,
 'when': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'she': 56,
 'even': 57,
 'my': 58,
 'would': 59,
 'which': 60,
 'only': 61,
 'story': 62,
 'really': 63,
 'see': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'were': 68,
 'me': 69,
 'well': 70,
 'than': 71,
 'we': 72,
 'much': 73,
 'been': 74,
 'bad': 75,
 'get': 76,
 'will': 77,
 'do': 78,
 'also': 79,
 'into': 80,
 'people': 81,
 'other': 82,
 '

In [18]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train_pad = pad_sequences(x_train_seq, maxlen=512)
x_test_pad = pad_sequences(x_test_seq, maxlen=512)

In [19]:
pd.DataFrame(x_train_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,794,38,944,84,1693,2,38,30,1,34,25,54,240,712,15,38,8,4,1,462,7,7,14,59,132,10,27,3,2032,8,972,49,151,10,158,132,27,3,2547
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1076,46,82,1132,1023,30,726,209,11,19,1404,7,7,249,22,710,167,5,847,229,39,25,3,18,44,22,137,16,32,906,327,22,77,50,71,1326,25,3,49,55
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,150,1,372,104,2178,68,614,60,825,1,1117,1473,7,7,8,1171,22,77,116,11,120,258,1,83,238,2178,187,44,22,398,146,1,120,91,1379,77,2633,122,2,699
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,201,19,6,1,1727,19,2,16,42,1148,20,247,2,765,2246,124,303,1,168,545,136,9,1577,931,3,774,133,9,117,1,344,14,3,212,64,19,8,11,509
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,16,138,12,405,1,17,3,2095,1803,2,116,5,103,3,212,64,17,15,29,1,81,34,261,8,280,116,31,227,1,88,726,30,219,28,4,95,17,4,29,208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,314,69,35,1254,10,1056,543,699,36,55,5,55,50,14,1,19,10,516,758,3,454,155,2,31,12,11,239,1011,54,50,71,3,339,15,152,2116,634,27,467,155
24996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,855,11,17,63,683,69,9,6,547,725,2,193,30,208,1,62,6,21,786,569,8,1,1516,790,102,58,2011,329,1,271,42,3,2726,62,60,1978,42,1488,20,265
24997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,15,58,229,749,1082,454,13,287,146,7,7,2,58,319,555,124,21,57,165,49,8,11,17,237,21,58,549,35,58,124,21,381,73,7,7,10,199,9,3,297
24998,59,25,74,2378,14,3,330,33,2468,32,2651,2,33,2468,24,116,15,34,6,244,946,174,1,436,16,11,233,151,6,12,33,112,384,9,137,280,2298,5,15,1,...,66,54,1097,26,8,5,166,3,179,12,13,679,2,192,5,384,87,166,1,541,12,59,94,87,1,83,726,1099,2,36,3,530,2195,5,3,129,725,16,3,1200
