In [1]:
import numpy as np
import pandas as pd

In [2]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

True

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
data_info = pd.read_csv('data_info.csv', encoding='cp949')
sample = pd.read_csv('sample_submission.csv')

In [4]:
train

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,neutral
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,neutral
2,TRAIN_0002,That I did. That I did.,Chandler,0,neutral
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,neutral
4,TRAIN_0004,My duties? All right.,Chandler,0,surprise
...,...,...,...,...,...
9984,TRAIN_9984,You or me?,Chandler,1038,neutral
9985,TRAIN_9985,"I got it. Uh, Joey, women don't have Adam's ap...",Ross,1038,neutral
9986,TRAIN_9986,"You guys are messing with me, right?",Joey,1038,surprise
9987,TRAIN_9987,Yeah.,All,1038,neutral


In [5]:
test

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID
0,TEST_0000,Why do all the coffee cups have figures below?,Mark,0
1,TEST_0001,"Oh. It's so Monica can follow. Of this way, if...",Rachell,0
2,TEST_0002,You know what?,Rachell,0
3,TEST_0003,"Come on, Lydia, you can do it.",Joeyy,1
4,TEST_0004,To push!,Joeyy,1
...,...,...,...,...
2605,TEST_2605,"Yeah, I mean, go Ross, no one will even notice...",Rachell,279
2606,TEST_2606,They don't listen to me?,Rossi,279
2607,TEST_2607,"Of course, they listen to you! Everyone listen...",Rachell,279
2608,TEST_2608,"Monica, do you really think I should try this ...",Rossi,279


## Missing values

In [6]:
train.isnull().sum()

ID             0
Utterance      0
Speaker        0
Dialogue_ID    0
Target         0
dtype: int64

In [7]:
test.isnull().sum()

ID             0
Utterance      0
Speaker        0
Dialogue_ID    0
dtype: int64

## Duplicates

In [8]:
train.nunique()

ID             9989
Utterance      8931
Speaker         260
Dialogue_ID    1038
Target            7
dtype: int64

In [6]:
train.drop_duplicates(subset=['Utterance'], inplace=True)
train.reset_index(inplace=True)
train

Unnamed: 0,index,ID,Utterance,Speaker,Dialogue_ID,Target
0,0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,neutral
1,1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,neutral
2,2,TRAIN_0002,That I did. That I did.,Chandler,0,neutral
3,3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,neutral
4,4,TRAIN_0004,My duties? All right.,Chandler,0,surprise
...,...,...,...,...,...,...
8926,9983,TRAIN_9983,It made me nuts.,Joey,1038,disgust
8927,9984,TRAIN_9984,You or me?,Chandler,1038,neutral
8928,9985,TRAIN_9985,"I got it. Uh, Joey, women don't have Adam's ap...",Ross,1038,neutral
8929,9986,TRAIN_9986,"You guys are messing with me, right?",Joey,1038,surprise


## Lemmatization

In [7]:
# using pos='v'
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_v(text):
  words = text.split()
  words = [lemmatizer.lemmatize(word, pos='v') for word in words]
  return ' '.join(words)

X_train_l = train['Utterance'].apply(lemmatize_v)
X_train_l

0       also I be the point person on my company’s tra...
1                        You must’ve have your hand full.
2                                 That I did. That I did.
3          So let’s talk a little bite about your duties.
4                                   My duties? All right.
                              ...                        
8926                                     It make me nuts.
8927                                           You or me?
8928    I get it. Uh, Joey, women don't have Adam's ap...
8929                      You guy be mess with me, right?
8930    That be a good one. For a second there, I be l...
Name: Utterance, Length: 8931, dtype: object

In [8]:
X_test_l = test['Utterance'].apply(lemmatize_v)
X_test_l

0            Why do all the coffee cup have figure below?
1       Oh. It's so Monica can follow. Of this way, if...
2                                          You know what?
3                          Come on, Lydia, you can do it.
4                                                To push!
                              ...                        
2605    Yeah, I mean, go Ross, no one will even notice...
2606                             They don't listen to me?
2607    Of course, they listen to you! Everyone listen...
2608    Monica, do you really think I should try this ...
2609                               I think you look good.
Name: Utterance, Length: 2610, dtype: object

## Encoding

In [9]:
y_train = train[['Target']]
y_train

Unnamed: 0,Target
0,neutral
1,neutral
2,neutral
3,neutral
4,surprise
...,...
8926,disgust
8927,neutral
8928,neutral
8929,surprise


In [13]:
y_train['Target'].unique()

array(['neutral', 'surprise', 'fear', 'sadness', 'joy', 'disgust',
       'anger'], dtype=object)

In [10]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

encoder.fit(y_train['Target'])
y_train['Label']=encoder.transform(y_train['Target'])
y_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train['Label']=encoder.transform(y_train['Target'])


Unnamed: 0,Target,Label
0,neutral,4
1,neutral,4
2,neutral,4
3,neutral,4
4,surprise,6
...,...,...
8926,disgust,1
8927,neutral,4
8928,neutral,4
8929,surprise,6


In [11]:
y_train_l=y_train['Label']
y_train_l

0       4
1       4
2       4
3       4
4       6
       ..
8926    1
8927    4
8928    4
8929    6
8930    3
Name: Label, Length: 8931, dtype: int64

## Modelling

In [16]:
# train_test_split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_train_l, y_train_l, test_size= 0.2, random_state=1234)

In [17]:
X_train

1212         When they’re hungry enough, they’ll come in.
386                                        It do in mine!
2225                                  You're not... gone?
4881                                Are you look at her?!
341                                           Is that so?
                              ...                        
664                                            That's it?
7540                                We need a porn break.
7221                                         Wow, Rhonda.
1318                     I’m just take it to be re-wired.
8915    Oh, you're busy, that's ok, I'll get it. Anybo...
Name: Utterance, Length: 7144, dtype: object

In [18]:
X_test

3842               You didn’t break up with that fireman?
5328                                            Hey guys!
3982                    Hey! Hey! No rough hold in my ER!
4790                                Yep! And lot’s of it!
1278    Well, let me ask you something, be Kip a bette...
                              ...                        
174                         I'll get it! I will get that!
7518                                     Yeah, she’s gay.
6298                                       Oh. Thank you.
5930    Oh my God! It sure didn’t look this way when I...
5690                                  Her answer machine?
Name: Utterance, Length: 1787, dtype: object

In [19]:
y_train

1212    4
386     0
2225    6
4881    0
341     4
       ..
664     6
7540    4
7221    6
1318    4
8915    4
Name: Label, Length: 7144, dtype: int64

In [20]:
y_test

3842    6
5328    3
3982    0
4790    3
1278    5
       ..
174     3
7518    4
6298    4
5930    6
5690    4
Name: Label, Length: 1787, dtype: int64

### Integer encoding

In [21]:
# vocab dictionary based on X_train
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

vocab = {}
X_train_prep = []
stop_words = set(stopwords.words('english'))

for sentence in X_train:
    # word_tokenize를 활용하여 토큰화
    X_train_t = word_tokenize(sentence)
    result=[]
    for word in X_train_t:
        # 단어 모두 소문자로 적용
        word = word.lower() 
        # 불용어 제거
        if word not in stop_words: 
            # 단어의 길이가 1인 짧은 단어 제거
            if len(word) > 1:
                result.append(word)
                if word not in vocab:
                    vocab[word] = 0 
                vocab[word] += 1
    X_train_prep.append(result) 
print(X_train_prep)



In [22]:
vocab_sorted = sorted(vocab.items(), key = lambda x:x[1], reverse = True)
vocab_sorted

[('oh', 786),
 ('know', 545),
 ('get', 530),
 ("'s", 528),
 ('okay', 469),
 ('go', 467),
 ('yeah', 442),
 ('well', 422),
 ('na', 388),
 ('right', 362),
 ("n't", 322),
 ('think', 317),
 ('like', 316),
 ('gon', 302),
 ('hey', 295),
 ('look', 262),
 ("'m", 243),
 ('want', 238),
 ('really', 235),
 ('...', 227),
 ('uh', 224),
 ('come', 223),
 ('tell', 219),
 ('see', 217),
 ('say', 212),
 ('one', 209),
 ('mean', 194),
 ('guy', 184),
 ('ross', 176),
 ("'re", 157),
 ('sorry', 155),
 ('joey', 154),
 ('let', 152),
 ('good', 152),
 ('make', 152),
 ('god', 148),
 ('would', 136),
 ('great', 133),
 ('could', 129),
 ('time', 129),
 ('umm', 128),
 ("y'know", 123),
 ('take', 119),
 ('love', 118),
 ('something', 116),
 ('yes', 116),
 ('back', 115),
 ('monica', 114),
 ('chandler', 108),
 ('talk', 102),
 ('little', 101),
 ('rachel', 100),
 ('phoebe', 95),
 ('give', 91),
 ('hi', 90),
 ('wait', 90),
 ('call', 89),
 ('wan', 86),
 ('maybe', 82),
 ('feel', 80),
 ('need', 78),
 ('us', 78),
 ('ok', 77),
 ('thing

In [23]:
word_to_index = {}
i = 0
for (word, frequency) in vocab_sorted :
    if frequency > 1 :
        i = i + 1
        word_to_index[word] = i

print(word_to_index)

{'oh': 1, 'know': 2, 'get': 3, "'s": 4, 'okay': 5, 'go': 6, 'yeah': 7, 'well': 8, 'na': 9, 'right': 10, "n't": 11, 'think': 12, 'like': 13, 'gon': 14, 'hey': 15, 'look': 16, "'m": 17, 'want': 18, 'really': 19, '...': 20, 'uh': 21, 'come': 22, 'tell': 23, 'see': 24, 'say': 25, 'one': 26, 'mean': 27, 'guy': 28, 'ross': 29, "'re": 30, 'sorry': 31, 'joey': 32, 'let': 33, 'good': 34, 'make': 35, 'god': 36, 'would': 37, 'great': 38, 'could': 39, 'time': 40, 'umm': 41, "y'know": 42, 'take': 43, 'love': 44, 'something': 45, 'yes': 46, 'back': 47, 'monica': 48, 'chandler': 49, 'talk': 50, 'little': 51, 'rachel': 52, 'phoebe': 53, 'give': 54, 'hi': 55, 'wait': 56, 'call': 57, 'wan': 58, 'maybe': 59, 'feel': 60, 'need': 61, 'us': 62, 'ok': 63, 'thing': 64, 'listen': 65, 'people': 66, '``': 67, 'thank': 68, 'never': 69, 'please': 70, "''": 71, 'work': 72, 'i-i': 73, 'first': 74, 'still': 75, 'sure': 76, 'man': 77, 'actually': 78, 'wow': 79, 'much': 80, 'um': 81, 'got': 82, 'way': 83, 'believe': 84

In [25]:
# vocab size
vocab_size = 1000
words_frequency = [word for word, index in word_to_index.items() if index >= vocab_size + 1]

for w in words_frequency:
    del word_to_index[w]

print(word_to_index)

{'oh': 1, 'know': 2, 'get': 3, "'s": 4, 'okay': 5, 'go': 6, 'yeah': 7, 'well': 8, 'na': 9, 'right': 10, "n't": 11, 'think': 12, 'like': 13, 'gon': 14, 'hey': 15, 'look': 16, "'m": 17, 'want': 18, 'really': 19, '...': 20, 'uh': 21, 'come': 22, 'tell': 23, 'see': 24, 'say': 25, 'one': 26, 'mean': 27, 'guy': 28, 'ross': 29, "'re": 30, 'sorry': 31, 'joey': 32, 'let': 33, 'good': 34, 'make': 35, 'god': 36, 'would': 37, 'great': 38, 'could': 39, 'time': 40, 'umm': 41, "y'know": 42, 'take': 43, 'love': 44, 'something': 45, 'yes': 46, 'back': 47, 'monica': 48, 'chandler': 49, 'talk': 50, 'little': 51, 'rachel': 52, 'phoebe': 53, 'give': 54, 'hi': 55, 'wait': 56, 'call': 57, 'wan': 58, 'maybe': 59, 'feel': 60, 'need': 61, 'us': 62, 'ok': 63, 'thing': 64, 'listen': 65, 'people': 66, '``': 67, 'thank': 68, 'never': 69, 'please': 70, "''": 71, 'work': 72, 'i-i': 73, 'first': 74, 'still': 75, 'sure': 76, 'man': 77, 'actually': 78, 'wow': 79, 'much': 80, 'um': 81, 'got': 82, 'way': 83, 'believe': 84

In [26]:
# sort words without index as OOV
word_to_index['OOV'] = len(word_to_index) + 1
print(word_to_index)

{'oh': 1, 'know': 2, 'get': 3, "'s": 4, 'okay': 5, 'go': 6, 'yeah': 7, 'well': 8, 'na': 9, 'right': 10, "n't": 11, 'think': 12, 'like': 13, 'gon': 14, 'hey': 15, 'look': 16, "'m": 17, 'want': 18, 'really': 19, '...': 20, 'uh': 21, 'come': 22, 'tell': 23, 'see': 24, 'say': 25, 'one': 26, 'mean': 27, 'guy': 28, 'ross': 29, "'re": 30, 'sorry': 31, 'joey': 32, 'let': 33, 'good': 34, 'make': 35, 'god': 36, 'would': 37, 'great': 38, 'could': 39, 'time': 40, 'umm': 41, "y'know": 42, 'take': 43, 'love': 44, 'something': 45, 'yes': 46, 'back': 47, 'monica': 48, 'chandler': 49, 'talk': 50, 'little': 51, 'rachel': 52, 'phoebe': 53, 'give': 54, 'hi': 55, 'wait': 56, 'call': 57, 'wan': 58, 'maybe': 59, 'feel': 60, 'need': 61, 'us': 62, 'ok': 63, 'thing': 64, 'listen': 65, 'people': 66, '``': 67, 'thank': 68, 'never': 69, 'please': 70, "''": 71, 'work': 72, 'i-i': 73, 'first': 74, 'still': 75, 'sure': 76, 'man': 77, 'actually': 78, 'wow': 79, 'much': 80, 'um': 81, 'got': 82, 'way': 83, 'believe': 84

In [27]:
X_train_pre = []

for sentence in X_train:
    X_train_t = word_tokenize(sentence)
    result=[]
    for word in X_train_t: 
        word = word.lower() 
        if word not in stop_words: 
            if len(word) > 1:
                result.append(word)
    X_train_pre.append(result) 
print(X_train_pre)



In [28]:
X_train_enc = []

for sentence in X_train_pre:
    X_train_enc_tr = []
    for word in sentence:
        try:
            X_train_enc_tr.append(word_to_index[word])
        except KeyError:
            X_train_enc_tr.append(word_to_index['OOV'])
    X_train_enc.append(X_train_enc_tr)
print(X_train_enc)

[[1001, 236, 22], [248], [30, 20, 431], [16], [], [3, 751], [7, 335, 11, 336, 43], [25], [1001], [125, 6, 572, 478, 178], [354, 890, 162], [16, 43, 647], [1001, 355, 355], [1, 36, 28, 294, 891, 752], [1, 36, 17, 31, 63], [237, 75, 19, 61, 1001, 183, 2, 184, 432, 1001, 120, 1001], [38, 10, 57, 225], [13, 28, 433, 753, 1001, 1001], [479, 1001, 479, 30, 1001, 77, 33, 4, 3, 72], [573, 210, 574, 10, 100], [3, 754, 170, 1001, 1001], [163, 1001, 1001, 18, 61, 19, 184], [4, 38, 21, 17, 19, 648, 101, 1001], [10, 6], [1, 171, 755], [5, 1001, 1001, 1001, 1001], [2, 51, 756, 51, 1001, 167, 13, 45, 13, 27], [261, 25, 757], [23, 62, 286, 1001, 1001, 29], [307, 373], [5, 16, 12, 23, 52, 402, 1001], [325, 86, 1001, 295, 61], [2, 1001, 249, 649, 249, 26], [17, 520], [27, 650], [34, 5], [4, 21, 4, 403], [521, 1001, 1001], [374, 250, 26, 108, 238], [575, 1001, 1001, 1001, 1001, 1001, 48, 576, 1001, 1001, 1001], [], [25], [226, 61, 199, 1001, 651], [1, 38, 79, 7, 652], [434], [16, 1001], [1, 308, 309], [1

In [29]:
X_test_pre = []

for sentence in X_test:
    X_test_t = word_tokenize(sentence)
    result=[]
    for word in X_test_t: 
        word = word.lower() 
        if word not in stop_words: 
            if len(word) > 1:
                result.append(word)
    X_test_pre.append(result) 
print(X_test_pre)

[['break', 'fireman'], ['hey', 'guys'], ['hey', 'hey', 'rough', 'hold', 'er'], ['yep', 'lot'], ['well', 'let', 'ask', 'something', 'kip', 'better', 'roommate'], ["'re", 'constantly', 'like', 'reassure', "'re", 'good', 'time'], ['yeah', 'need', 'bring', 'photos', 'ross'], ['wonderful', 'amaze'], ['know'], ['ohh', "'s", 'nice', 'great', 'well', "'s", 'trip'], ['take', 'suggestions'], ["'s", 'brilliant'], ['uh', 'phoebe'], ['ahh', 'think', 'moved', 'really', 'poke'], ['huh'], ['cos', 'gon', 'na', 'say', 'way', 'could', 'end', 'way', 'guy', 'back'], ['believe', 'guy', 'destine', 'someone', 'else', 'still', 'gon', 'na', 'date'], ['oh-oh', 'risky', 'little', 'game'], ['try', 'little', 'girl'], ['way'], [], ['yes', "y'know"], ['``', 'oh', 'god', 'find', 'boyfriend'], ['case', 'make', 'sure', "'s", 'real', 'good'], ['bra'], ['day', 'gon', 'na', 'die', 'see—darnit', 'get', 'shuffleboard', 'day'], ['friendly'], ['come', 'peeking'], ['enough'], ['say', 'break', 'window', 'crawl', 'and-and', 'know

In [30]:
X_test_enc = []

for sentence in X_test_pre:
    X_test_enc_tr = []
    for word in sentence:
        try:
            X_test_enc_tr.append(word_to_index[word])
        except KeyError:
            X_test_enc_tr.append(word_to_index['OOV'])
    X_test_enc.append(X_test_enc_tr)
print(X_test_enc)

[[164, 1001], [15, 105], [15, 15, 1001, 247, 1001], [346, 101], [8, 33, 97, 45, 1001, 147, 476], [30, 1001, 13, 1001, 30, 34, 40], [7, 61, 163, 1001, 29], [810, 665], [2], [145, 4, 104, 38, 8, 4, 718], [43, 1001], [4, 1001], [21, 53], [326, 12, 1001, 19, 1001], [114], [1001, 14, 9, 25, 83, 39, 375, 83, 28, 47], [84, 28, 1001, 183, 197, 75, 14, 9, 126], [585, 1001, 51, 168], [94, 51, 130], [83], [], [46, 42], [67, 1, 36, 117, 696], [1001, 35, 76, 4, 254, 34], [1001], [157, 14, 9, 374, 1001, 3, 1001, 157], [1001], [22, 1001], [236], [25, 164, 747, 1001, 320, 2, 868, 225], [406, 47, 107, 49, 266, 1001], [1, 148, 298, 896, 1001, 2, 1001, 1001, 1001, 1001, 1001], [73, 2, 365, 146, 92, 13, 325, 174, 487, 2, 1001, 3, 1001], [73, 84, 861, 431, 124, 203, 1001], [53], [16, 16, 794, 232, 111, 154, 1001, 1001, 301, 23, 14, 9, 1001, 366], [19], [1001, 114], [1, 414, 1001, 379, 143, 33, 3, 257, 40], [8, 92], [33, 22, 47, 225], [1, 11, 65, 103, 12, 4, 1001], [3], [16, 65, 1001, 114], [38, 79, 77, 32,

In [31]:
# y_train/test to array
y_train=y_train.to_numpy()
y_train

array([4, 0, 6, ..., 6, 4, 4])

In [32]:
y_test=y_test.to_numpy()
y_test

array([6, 3, 0, ..., 4, 6, 4])

In [33]:
# Padding
max_train = max(len(item) for item in X_train_enc)
max_train

37

In [34]:
max_test = max(len(item) for item in X_test_enc)
max_test

18

In [35]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len=50
X_train_f = pad_sequences(X_train_enc, maxlen=max_len)
X_test_f = pad_sequences(X_test_enc, maxlen=max_len)

In [36]:
X_train_f

array([[   0,    0,    0, ..., 1001,  236,   22],
       [   0,    0,    0, ...,    0,    0,  248],
       [   0,    0,    0, ...,   30,   20,  431],
       ...,
       [   0,    0,    0, ...,    0,   79, 1001],
       [   0,    0,    0, ...,    0,   43, 1001],
       [   0,    0,    0, ...,  197,   18,   26]], dtype=int32)

In [37]:
X_test_f

array([[   0,    0,    0, ...,    0,  164, 1001],
       [   0,    0,    0, ...,    0,   15,  105],
       [   0,    0,    0, ..., 1001,  247, 1001],
       ...,
       [   0,    0,    0, ...,    0,    1,   68],
       [   0,    0,    0, ...,   16,   83,  158],
       [   0,    0,    0, ...,    0,  466,  370]], dtype=int32)

In [38]:
# to_categorical
from tensorflow.keras.utils import to_categorical

y_train_f = to_categorical(y_train, 7)
y_test_f = to_categorical(y_test, 7)

In [39]:
y_train_f

array([[0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]], dtype=float32)

In [40]:
y_test_f

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.]], dtype=float32)

In [71]:
# 1. LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

embedding_dim = 128
hidden_units = 128
num_classes = 7
vocab_size = 2000

model1 = Sequential()
model1.add(Embedding(vocab_size, embedding_dim))
model1.add(LSTM(hidden_units))
model1.add(Dense(num_classes, activation='softmax'))

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
history = model1.fit(X_train_f, y_train_f, batch_size=64, epochs=30, callbacks=[es, mc], validation_data=(X_test_f, y_test_f))

Epoch 1/30
Epoch 1: val_acc improved from -inf to 0.46391, saving model to best_model.h5
Epoch 2/30
Epoch 2: val_acc improved from 0.46391 to 0.49356, saving model to best_model.h5
Epoch 3/30
Epoch 3: val_acc did not improve from 0.49356
Epoch 4/30
Epoch 4: val_acc did not improve from 0.49356
Epoch 5/30
Epoch 5: val_acc did not improve from 0.49356
Epoch 6/30
Epoch 6: val_acc did not improve from 0.49356
Epoch 7/30
Epoch 7: val_acc did not improve from 0.49356
Epoch 7: early stopping


In [72]:
loaded_model1 = load_model('best_model.h5')
print("\n test accuracy: %.4f" % (loaded_model1.evaluate(X_test_f, y_test_f)[1]))


 테스트 정확도: 0.4936


- model1 accuracy 0.4936

In [67]:
# 2. GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

embedding_dim = 128
hidden_units = 128
num_classes = 7
vocab_size = 2000

model2 = Sequential()
model2.add(Embedding(vocab_size, embedding_dim))
model2.add(GRU(hidden_units))
model2.add(Dense(num_classes, activation='softmax'))

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
history = model2.fit(X_train_f, y_train_f, batch_size=64, epochs=30, callbacks=[es, mc], validation_data=(X_test_f, y_test_f))

Epoch 1/30
Epoch 1: val_acc improved from -inf to 0.46838, saving model to best_model.h5
Epoch 2/30
Epoch 2: val_acc improved from 0.46838 to 0.49189, saving model to best_model.h5
Epoch 3/30
Epoch 3: val_acc did not improve from 0.49189
Epoch 4/30
Epoch 4: val_acc did not improve from 0.49189
Epoch 5/30
Epoch 5: val_acc did not improve from 0.49189
Epoch 6/30
Epoch 6: val_acc did not improve from 0.49189
Epoch 6: early stopping


In [68]:
loaded_model2 = load_model('best_model.h5')
print("\n test accuracy: %.4f" % (loaded_model2.evaluate(X_test_f, y_test_f)[1]))


 테스트 정확도: 0.4919


- model2 accuracy 0.4919

In [69]:
# 3. SimpleRNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

embedding_dim = 128
hidden_units = 128
num_classes = 7
vocab_size = 2000

model3 = Sequential()
model3.add(Embedding(vocab_size, embedding_dim))
model3.add(GRU(hidden_units))
model3.add(Dense(num_classes, activation='softmax'))

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
history = model3.fit(X_train_f, y_train_f, batch_size=64, epochs=30, callbacks=[es, mc], validation_data=(X_test_f, y_test_f))

Epoch 1/30
Epoch 1: val_acc improved from -inf to 0.47454, saving model to best_model.h5
Epoch 2/30
Epoch 2: val_acc improved from 0.47454 to 0.48069, saving model to best_model.h5
Epoch 3/30
Epoch 3: val_acc improved from 0.48069 to 0.49468, saving model to best_model.h5
Epoch 4/30
Epoch 4: val_acc did not improve from 0.49468
Epoch 5/30
Epoch 5: val_acc did not improve from 0.49468
Epoch 6/30
Epoch 6: val_acc did not improve from 0.49468
Epoch 6: early stopping


In [70]:
loaded_model3 = load_model('best_model.h5')
print("\n test accuracy: %.4f" % (loaded_model3.evaluate(X_test_f, y_test_f)[1]))


 테스트 정확도: 0.4947


- model3 accuracy 0.4947

In [87]:
# 4. CNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

embedding_dim = 128
dropout_ratio = 0.3
num_filters = 128
kernel_size = 3
hidden_units = 128
vocab_size = 1500

model4 = Sequential()
model4.add(Embedding(vocab_size, embedding_dim))
model4.add(Dropout(dropout_ratio))
model4.add(Conv1D(num_filters, kernel_size, padding='valid', activation='relu'))
model4.add(GlobalMaxPooling1D())
model4.add(Dense(hidden_units, activation='relu'))
model4.add(Dropout(dropout_ratio))
model4.add(Dense(7, activation='softmax'))

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model4.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
history = model4.fit(X_train_f, y_train_f, epochs=20, validation_data=(X_test_f, y_test_f), callbacks=[es, mc])

Epoch 1/20
Epoch 1: val_acc improved from -inf to 0.46391, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_acc improved from 0.46391 to 0.49860, saving model to best_model.h5
Epoch 3/20
Epoch 3: val_acc improved from 0.49860 to 0.50420, saving model to best_model.h5
Epoch 4/20
Epoch 4: val_acc did not improve from 0.50420
Epoch 5/20
Epoch 5: val_acc did not improve from 0.50420
Epoch 6/20
Epoch 6: val_acc did not improve from 0.50420
Epoch 6: early stopping


In [88]:
loaded_model4 = load_model('best_model.h5')
print("\n test accuracy: %.4f" % (loaded_model4.evaluate(X_test_f, y_test_f)[1]))


 테스트 정확도: 0.5042


- model4 accuracy 0.5042

In [90]:
X_train

1212         When they’re hungry enough, they’ll come in.
386                                        It do in mine!
2225                                  You're not... gone?
4881                                Are you look at her?!
341                                           Is that so?
                              ...                        
664                                            That's it?
7540                                We need a porn break.
7221                                         Wow, Rhonda.
1318                     I’m just take it to be re-wired.
8915    Oh, you're busy, that's ok, I'll get it. Anybo...
Name: Utterance, Length: 7144, dtype: object

In [91]:
X_test

3842               You didn’t break up with that fireman?
5328                                            Hey guys!
3982                    Hey! Hey! No rough hold in my ER!
4790                                Yep! And lot’s of it!
1278    Well, let me ask you something, be Kip a bette...
                              ...                        
174                         I'll get it! I will get that!
7518                                     Yeah, she’s gay.
6298                                       Oh. Thank you.
5930    Oh my God! It sure didn’t look this way when I...
5690                                  Her answer machine?
Name: Utterance, Length: 1787, dtype: object

In [92]:
y_train

array([4, 0, 6, ..., 6, 4, 4])

In [93]:
y_test

array([6, 3, 0, ..., 4, 6, 4])

In [94]:
X_train_NB = X_train.to_numpy()
X_train_NB

array(['When they’re hungry enough, they’ll come in.', 'It do in mine!',
       "You're not... gone?", ..., 'Wow, Rhonda.',
       'I’m just take it to be re-wired.',
       "Oh, you're busy, that's ok, I'll get it. Anybody else want one?"],
      dtype=object)

In [95]:
X_test_NB = X_test.to_numpy()
X_test_NB

array(['You didn’t break up with that fireman?', 'Hey guys!',
       'Hey! Hey! No rough hold in my ER!', ..., 'Oh. Thank you.',
       'Oh my God! It sure didn’t look this way when I live here.',
       'Her answer machine?'], dtype=object)

In [97]:
# 5. NB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

vectorizer = CountVectorizer()
X_train_dtm = vectorizer.fit_transform(X_train_NB)

tfidf_transformer = TfidfTransformer()
tfidf_train = tfidf_transformer.fit_transform(X_train_dtm)

model5 = MultinomialNB()
model5.fit(tfidf_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

X_test_dtm = vectorizer.transform(X_test_NB)
tfidf_test = tfidf_transformer.transform(X_test_dtm)

pred = model5.predict(tfidf_test)
print("\n test accuracy: %.4f" % (accuracy_score(y_test, pred)))


 테스트 정확도: 0.4790


- model5 accuracy 0.4790

## CNN

In [12]:
X_train_l

0       also I be the point person on my company’s tra...
1                        You must’ve have your hand full.
2                                 That I did. That I did.
3          So let’s talk a little bite about your duties.
4                                   My duties? All right.
                              ...                        
8926                                     It make me nuts.
8927                                           You or me?
8928    I get it. Uh, Joey, women don't have Adam's ap...
8929                      You guy be mess with me, right?
8930    That be a good one. For a second there, I be l...
Name: Utterance, Length: 8931, dtype: object

In [13]:
X_test_l

0            Why do all the coffee cup have figure below?
1       Oh. It's so Monica can follow. Of this way, if...
2                                          You know what?
3                          Come on, Lydia, you can do it.
4                                                To push!
                              ...                        
2605    Yeah, I mean, go Ross, no one will even notice...
2606                             They don't listen to me?
2607    Of course, they listen to you! Everyone listen...
2608    Monica, do you really think I should try this ...
2609                               I think you look good.
Name: Utterance, Length: 2610, dtype: object

In [14]:
y_train_l

0       4
1       4
2       4
3       4
4       6
       ..
8926    1
8927    4
8928    4
8929    6
8930    3
Name: Label, Length: 8931, dtype: int64

In [15]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

vocab = {}
X_train_prep = []
stop_words = set(stopwords.words('english'))

for sentence in X_train_l:
    # word_tokenize를 활용하여 토큰화
    X_train_t = word_tokenize(sentence)
    result=[]
    for word in X_train_t:
        # 단어 모두 소문자로 적용
        word = word.lower() 
        # 불용어 제거
        if word not in stop_words: 
            # 단어의 길이가 1인 짧은 단어 제거
            if len(word) > 1:
                result.append(word)
                if word not in vocab:
                    vocab[word] = 0 
                vocab[word] += 1
    X_train_prep.append(result) 
print(X_train_prep)



In [16]:
vocab_sorted = sorted(vocab.items(), key = lambda x:x[1], reverse = True)
vocab_sorted

[('oh', 989),
 ('get', 676),
 ('know', 670),
 ("'s", 658),
 ('go', 576),
 ('okay', 573),
 ('yeah', 545),
 ('well', 527),
 ('na', 481),
 ('right', 440),
 ('think', 405),
 ("n't", 398),
 ('like', 388),
 ('hey', 378),
 ('gon', 365),
 ('look', 331),
 ('...', 299),
 ("'m", 297),
 ('really', 294),
 ('want', 288),
 ('uh', 281),
 ('come', 277),
 ('one', 267),
 ('see', 265),
 ('tell', 265),
 ('mean', 256),
 ('say', 251),
 ('guy', 225),
 ('ross', 212),
 ('sorry', 195),
 ("'re", 193),
 ('make', 190),
 ('joey', 189),
 ('god', 185),
 ('good', 184),
 ('let', 183),
 ('would', 174),
 ('great', 168),
 ('could', 164),
 ('time', 161),
 ("y'know", 159),
 ('umm', 151),
 ('love', 147),
 ('take', 145),
 ('back', 142),
 ('yes', 139),
 ('little', 138),
 ('chandler', 134),
 ('something', 132),
 ('monica', 125),
 ('rachel', 124),
 ('talk', 118),
 ('wan', 116),
 ('phoebe', 115),
 ('give', 111),
 ('wait', 108),
 ('hi', 106),
 ('call', 106),
 ('thing', 100),
 ('us', 100),
 ('ok', 99),
 ('thank', 98),
 ('people', 97

In [17]:
word_to_index = {}
i = 0
for (word, frequency) in vocab_sorted :
    if frequency > 1 :
        i = i + 1
        word_to_index[word] = i

print(word_to_index)

{'oh': 1, 'get': 2, 'know': 3, "'s": 4, 'go': 5, 'okay': 6, 'yeah': 7, 'well': 8, 'na': 9, 'right': 10, 'think': 11, "n't": 12, 'like': 13, 'hey': 14, 'gon': 15, 'look': 16, '...': 17, "'m": 18, 'really': 19, 'want': 20, 'uh': 21, 'come': 22, 'one': 23, 'see': 24, 'tell': 25, 'mean': 26, 'say': 27, 'guy': 28, 'ross': 29, 'sorry': 30, "'re": 31, 'make': 32, 'joey': 33, 'god': 34, 'good': 35, 'let': 36, 'would': 37, 'great': 38, 'could': 39, 'time': 40, "y'know": 41, 'umm': 42, 'love': 43, 'take': 44, 'back': 45, 'yes': 46, 'little': 47, 'chandler': 48, 'something': 49, 'monica': 50, 'rachel': 51, 'talk': 52, 'wan': 53, 'phoebe': 54, 'give': 55, 'wait': 56, 'hi': 57, 'call': 58, 'thing': 59, 'us': 60, 'ok': 61, 'thank': 62, 'people': 63, 'need': 64, 'never': 65, "'ll": 66, 'maybe': 67, 'listen': 68, 'please': 69, 'much': 70, '``': 71, 'feel': 72, 'sure': 73, 'i-i': 74, 'still': 75, "''": 76, 'work': 77, 'two': 78, 'way': 79, 'man': 80, 'ah': 81, 'actually': 82, 'first': 83, 'wow': 84, 'u

In [18]:
vocab_size = 1000
words_frequency = [word for word, index in word_to_index.items() if index >= vocab_size + 1]

for w in words_frequency:
    del word_to_index[w]

print(word_to_index)

{'oh': 1, 'get': 2, 'know': 3, "'s": 4, 'go': 5, 'okay': 6, 'yeah': 7, 'well': 8, 'na': 9, 'right': 10, 'think': 11, "n't": 12, 'like': 13, 'hey': 14, 'gon': 15, 'look': 16, '...': 17, "'m": 18, 'really': 19, 'want': 20, 'uh': 21, 'come': 22, 'one': 23, 'see': 24, 'tell': 25, 'mean': 26, 'say': 27, 'guy': 28, 'ross': 29, 'sorry': 30, "'re": 31, 'make': 32, 'joey': 33, 'god': 34, 'good': 35, 'let': 36, 'would': 37, 'great': 38, 'could': 39, 'time': 40, "y'know": 41, 'umm': 42, 'love': 43, 'take': 44, 'back': 45, 'yes': 46, 'little': 47, 'chandler': 48, 'something': 49, 'monica': 50, 'rachel': 51, 'talk': 52, 'wan': 53, 'phoebe': 54, 'give': 55, 'wait': 56, 'hi': 57, 'call': 58, 'thing': 59, 'us': 60, 'ok': 61, 'thank': 62, 'people': 63, 'need': 64, 'never': 65, "'ll": 66, 'maybe': 67, 'listen': 68, 'please': 69, 'much': 70, '``': 71, 'feel': 72, 'sure': 73, 'i-i': 74, 'still': 75, "''": 76, 'work': 77, 'two': 78, 'way': 79, 'man': 80, 'ah': 81, 'actually': 82, 'first': 83, 'wow': 84, 'u

In [19]:
word_to_index['OOV'] = len(word_to_index) + 1
print(word_to_index)

{'oh': 1, 'get': 2, 'know': 3, "'s": 4, 'go': 5, 'okay': 6, 'yeah': 7, 'well': 8, 'na': 9, 'right': 10, 'think': 11, "n't": 12, 'like': 13, 'hey': 14, 'gon': 15, 'look': 16, '...': 17, "'m": 18, 'really': 19, 'want': 20, 'uh': 21, 'come': 22, 'one': 23, 'see': 24, 'tell': 25, 'mean': 26, 'say': 27, 'guy': 28, 'ross': 29, 'sorry': 30, "'re": 31, 'make': 32, 'joey': 33, 'god': 34, 'good': 35, 'let': 36, 'would': 37, 'great': 38, 'could': 39, 'time': 40, "y'know": 41, 'umm': 42, 'love': 43, 'take': 44, 'back': 45, 'yes': 46, 'little': 47, 'chandler': 48, 'something': 49, 'monica': 50, 'rachel': 51, 'talk': 52, 'wan': 53, 'phoebe': 54, 'give': 55, 'wait': 56, 'hi': 57, 'call': 58, 'thing': 59, 'us': 60, 'ok': 61, 'thank': 62, 'people': 63, 'need': 64, 'never': 65, "'ll": 66, 'maybe': 67, 'listen': 68, 'please': 69, 'much': 70, '``': 71, 'feel': 72, 'sure': 73, 'i-i': 74, 'still': 75, "''": 76, 'work': 77, 'two': 78, 'way': 79, 'man': 80, 'ah': 81, 'actually': 82, 'first': 83, 'wow': 84, 'u

In [21]:
X_train_pre = []

for sentence in X_train_l:
    X_train_t = word_tokenize(sentence)
    result=[]
    for word in X_train_t: 
        word = word.lower() 
        if word not in stop_words: 
            if len(word) > 1:
                result.append(word)
    X_train_pre.append(result) 
print(X_train_pre)



In [22]:
X_train_enc = []

for sentence in X_train_pre:
    X_train_enc_tr = []
    for word in sentence:
        try:
            X_train_enc_tr.append(word_to_index[word])
        except KeyError:
            X_train_enc_tr.append(word_to_index['OOV'])
    X_train_enc.append(X_train_enc_tr)
print(X_train_enc)

[[307, 447, 295, 881, 1001, 1001, 1001, 680], [251, 184, 613], [], [36, 52, 47, 322, 1001], [1001, 10], [185, 176, 1001, 106, 1001], [24], [1001, 681, 63, 448, 1001, 1001], [35, 3], [5, 1001], [1001], [10, 1001, 419, 1001, 11, 27, 1001, 882, 8], [19], [348, 383], [1001, 5, 111, 682], [3, 180], [517, 52], [1001, 12, 82, 12, 3], [61], [10, 8, 17], [7, 73], [14, 400], [1001, 53, 9, 135, 49, 614], [113], [1001, 27, 308, 615], [79], [7, 518, 159, 138, 220, 107], [220, 15, 9, 194], [86], [2, 1001], [14, 117, 420], [1001], [683, 28], [1001, 7, 10], [1, 34, 1, 34, 1001, 50], [], [], [252, 1001], [16, 1001, 1001, 616, 120, 617], [89, 323, 296, 1001, 75], [11, 50, 616, 616, 1001], [1, 166, 1, 34, 1, 296, 160], [332], [14], [57], [], [81, 41, 485, 684, 519, 17], [1], [57], [5], [1, 8, 154, 618, 124, 685, 41, 62, 34, 449, 1001, 125, 52, 570, 349, 2, 160, 486], [686], [], [68, 81, 30, 227, 687, 13, 106, 17], [3], [7], [1001], [1001], [1001, 41], [228], [35, 181, 281, 8, 332, 487, 1001], [13, 19, 13

In [23]:
X_test_pre = []

for sentence in X_test_l:
    X_test_t = word_tokenize(sentence)
    result=[]
    for word in X_test_t: 
        word = word.lower() 
        if word not in stop_words: 
            if len(word) > 1:
                result.append(word)
    X_test_pre.append(result) 
print(X_test_pre)

[['coffee', 'cup', 'figure'], ['oh', "'s", 'monica', 'follow', 'way', 'one', 'missing', 'say', '``', 'number', '27', "''"], ['know'], ['come', 'lydia'], ['push'], ['push', 'push', 'harder', 'harder'], ['push', 'push', 'far'], ['let', "'s", 'take', 'ball', 'really', 'move', 'hey', 'hey', 'ho', 'ho'], ['let', "'s", 'give', 'right', 'yeah'], ['push'], ['push'], ['ok'], ['ross', "n't", 'say', 'elevator'], ['uh', 'yes', "n't", 'let', "'s", 'go'], ['okay', 'go', 'left', 'left', 'left'], ['okay', 'know', 'longer', 'left'], ['oh', 'okay', 'lift', 'directly', 'head'], ['straight', 'head'], [], [], ['ok'], ['get'], ['oh', 'okay', 'understand'], ['waiting', 'look', "'m", 'sorry', "'s", 'never', 'even'], ['howard'], ['yes', 'also', "'s", 'guy'], ['okay', 'well', 'matter', 'bienvenue', 'building'], ['ugh', 'believe', 'guy'], ['yeah', 'really', 'like', 'glasses'], ['ohh'], ['quoi'], ['kicked', 'think', 'baby', 'launched'], ['oh', 'god'], ['oh', 'wait', 'oh', 'elastic', 'underwear', 'break'], ['oh', 

In [24]:
X_test_enc = []

for sentence in X_test_pre:
    X_test_enc_tr = []
    for word in sentence:
        try:
            X_test_enc_tr.append(word_to_index[word])
        except KeyError:
            X_test_enc_tr.append(word_to_index['OOV'])
    X_test_enc.append(X_test_enc_tr)
print(X_test_enc)

[[220, 598, 372], [1, 4, 50, 583, 79, 23, 1001, 27, 71, 275, 1001, 76], [3], [22, 1001], [1001], [1001, 1001, 1001, 1001], [1001, 1001, 749], [36, 4, 44, 334, 19, 128, 14, 14, 1001, 1001], [36, 4, 55, 10, 7], [1001], [1001], [61], [29, 12, 27, 1001], [21, 46, 12, 36, 4, 5], [6, 5, 464, 464, 464], [6, 3, 1001, 464], [1, 6, 1001, 1001, 185], [692, 185], [], [], [61], [2], [1, 6, 292], [1001, 16, 18, 30, 4, 65, 87], [1001], [46, 307, 4, 28], [6, 8, 245, 1001, 898], [378, 88, 28], [7, 19, 13, 1001], [142], [1001], [1001, 11, 99, 1001], [1, 34], [1, 56, 1, 1001, 506, 156], [1, 34, 214, 70, 200, 549, 366, 514, 505, 2], [1, 56, 33, 97, 12, 5, 13], [16, 3, 72, 214, 1001, 40], [320, 286, 1001, 339], [22, 1001, 123, 18, 48], [689], [689], [1001], [69], [21, 5], [483, 214], [1001, 68, 214], [12, 55, 275], [1001, 149], [12, 3, 401, 4, 322, 13, 503], [1], [46, 311, 1001], [1], [4, 1001, 4, 166, 1001], [7, 4, 78, 28, 171, 465, 71], [43, 322, 841, 574, 63], [7, 210, 735, 797, 81, 1001, 363, 1001, 100

In [25]:
y_train=y_train_l.to_numpy()
y_train

array([4, 4, 4, ..., 4, 6, 3])

In [26]:
max_train = max(len(item) for item in X_train_enc)
max_train

37

In [27]:
max_test = max(len(item) for item in X_test_enc)
max_test

23

In [28]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len=50
X_train = pad_sequences(X_train_enc, maxlen=max_len)
X_test = pad_sequences(X_test_enc, maxlen=max_len)

In [29]:
X_train

array([[   0,    0,    0, ..., 1001, 1001,  680],
       [   0,    0,    0, ...,  251,  184,  613],
       [   0,    0,    0, ...,    0,    0,    0],
       ...,
       [   0,    0,    0, ..., 1001,    4, 1001],
       [   0,    0,    0, ...,   28,  451,   10],
       [   0,    0,    0, ...,   71,  146,   76]], dtype=int32)

In [30]:
X_test

array([[   0,    0,    0, ...,  220,  598,  372],
       [   0,    0,    0, ...,  275, 1001,   76],
       [   0,    0,    0, ...,    0,    0,    3],
       ...,
       [   0,    0,    0, ...,   68,  364,   68],
       [   0,    0,    0, ...,   94,   59, 1001],
       [   0,    0,    0, ...,   11,   16,   35]], dtype=int32)

In [31]:
from tensorflow.keras.utils import to_categorical

y_train = to_categorical(y_train, 7)

In [32]:
y_train

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [33]:
# modelling
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

embedding_dim = 128
dropout_ratio = 0.3
num_filters = 128
kernel_size = 3
hidden_units = 128
vocab_size = 1500

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Dropout(dropout_ratio))
model.add(Conv1D(num_filters, kernel_size, padding='valid', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_units, activation='relu'))
model.add(Dropout(dropout_ratio))
model.add(Dense(7, activation='softmax'))

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.fit(X_train, y_train, epochs=5, callbacks=[es, mc])
y_pred = model.predict(X_test)

Epoch 1/5



Epoch 2/5



Epoch 3/5



Epoch 4/5



Epoch 5/5





In [34]:
y_pred

array([[0.07102147, 0.12713797, 0.06690144, ..., 0.3733164 , 0.09877618,
        0.1968756 ],
       [0.04279593, 0.01542677, 0.01024355, ..., 0.538712  , 0.01129833,
        0.08949275],
       [0.11377266, 0.03914519, 0.05248107, ..., 0.41659418, 0.06368942,
        0.19671796],
       ...,
       [0.01800128, 0.00713752, 0.1019754 , ..., 0.21036468, 0.47124106,
        0.03696488],
       [0.09526844, 0.07094365, 0.02807215, ..., 0.14726163, 0.04095801,
        0.5515392 ],
       [0.07362791, 0.03147218, 0.03536427, ..., 0.2003293 , 0.05293241,
        0.30258653]], dtype=float32)

In [35]:
y_pred_m=np.argmax(y_pred, axis=-1)
y_pred_m

array([4, 4, 4, ..., 5, 6, 3])

In [37]:
preds = encoder.inverse_transform(y_pred_m) 

In [38]:
sample

Unnamed: 0,ID,Target
0,TEST_0000,NAN
1,TEST_0001,NAN
2,TEST_0002,NAN
3,TEST_0003,NAN
4,TEST_0004,NAN
...,...,...
2605,TEST_2605,NAN
2606,TEST_2606,NAN
2607,TEST_2607,NAN
2608,TEST_2608,NAN


In [39]:
sample['Target'] = preds
sample

Unnamed: 0,ID,Target
0,TEST_0000,neutral
1,TEST_0001,neutral
2,TEST_0002,neutral
3,TEST_0003,neutral
4,TEST_0004,neutral
...,...,...
2605,TEST_2605,neutral
2606,TEST_2606,neutral
2607,TEST_2607,sadness
2608,TEST_2608,surprise


In [40]:
sample.to_csv('submit.csv', index=False)