In [1]:
import math
import pandas as pd
import kerouz_CNN as kr
import yongyizh as yy
import numpy as np
from keras.preprocessing.text import Tokenizer # https://keras-cn.readthedocs.io/en/latest/preprocessing/text/
from keras.utils import to_categorical


In [2]:
if __name__ == '__main__':

    # Adjustable parameters.
    # Sequence-length naming used in this notebook: l2 is the average sentence
    # length, while l1 (the length handed to kr.toknz) and l3 (the training
    # length) are both l2 + offset.
    embedding_dim = 50
    offset = 10
    # Characters passed to the Keras Tokenizer as its `filters` argument.
    rm_symbols = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'


In [3]:
    # Read the datasets and record where the word-vector dictionary lives.
    # `f` is the path to the GloVe 50-d file that yy.dict_pre reads later.
    f = '../dataset/glove.6B.50d.txt'
    emoji_map = pd.read_csv('../dataset/Mapping.csv')

    # Training set: raw sentences plus one-hot encoded labels.
    data_train = pd.read_csv('../dataset/Train.csv')
    X_train = data_train['TEXT'].values
    Y_train = to_categorical(data_train['Label'].values)

    # Test set: only the raw sentences are read.
    data_test = pd.read_csv('../dataset/Test.csv')
    X_test = data_test['TEXT'].values

In [4]:
    # Clean the training sentences (yy.corpus_pre removes special symbols and
    # stopwords).
    X_rm = yy.corpus_pre(X_train)

    # Segmentation. `filters` lists the characters to strip and `lower=True`
    # converts the text to lowercase; the fitted vocabulary is kept inside
    # the `tokenizer` object.
    tokenizer = Tokenizer(lower=True, split=" ", filters=rm_symbols)
    tokenizer.fit_on_texts(X_rm)

    # Vectorize with padding and truncation.
    # l2: average sentence length in space-separated tokens, rounded up.
    total_tokens = sum(len(sentence.split(" ")) for sentence in X_rm)
    l2 = math.ceil(total_tokens / len(X_rm))
    # l1: the length handed to kr.toknz.
    l1 = offset + l2
    X_pd, tokenizer = kr.toknz(X_rm, l1, tokenizer)

In [5]:
    # Dictionary that assigns a word vector to every word, loaded from the
    # file at path `f`.
    lookup_dict = yy.dict_pre(f)

    # Dictionary that assigns an integer id to every word seen by the tokenizer.
    ind_dict = tokenizer.word_index

    # Weight matrix generated from the two dictionaries above.
    W = yy.lookup(ind_dict, lookup_dict, embedding_dim)


In [6]:
    # Train the model.
    # The sequence length passed here is l1, the same value X_pd was built
    # with in kr.toknz, so the model input length and the padded data cannot
    # drift apart if `offset` is edited without re-running the tokenisation cell.
    # The first argument is the vocabulary size plus one (presumably the extra
    # slot is index 0, which the tokenizer never assigns -- confirm in kr).
    model = kr.model_training(len(ind_dict) + 1, W, l1, X_pd, Y_train,
                              embed_dim=embedding_dim, epochs=5)

    # Sanity check on a few sentences from the training set.
    # Sequential.predict_classes() was removed in TensorFlow 2.6; taking the
    # argmax over the per-class scores is its replacement for a multi-class
    # model (assumes one output score per class, matching the one-hot Y_train).
    print(np.argmax(model.predict(X_pd[1:13]), axis=-1))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 50)            4275000   
_________________________________________________________________
conv1d (Conv1D)              (None, 20, 128)           32128     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 4, 128)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 4, 128)            82048     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1, 128)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1, 128)            82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 1, 128)            0

## Predict on test set

In [7]:
    # Prediction on the test set: clean and vectorize the test sentences with
    # the tokenizer and length l1 that were used for the training data.
    X_test_rm = yy.corpus_pre(X_test)
    X_test_pd, _ = kr.toknz(X_test_rm, l1, tokenizer)

    # Sequential.predict_classes() was removed in TensorFlow 2.6; the argmax
    # over the per-class scores gives the same class ids for a multi-class
    # model (assumes one output score per class, matching the one-hot Y_train).
    label_test = np.argmax(model.predict(X_test_pd), axis=-1)

    # Print the predicted emoji followed by the raw sentence for rows 500-520.
    for i in range(500, 521):
        print(emoji_map['emoticons'][label_test[i]])
        print(X_test[i])

📸
Trek Remedy 9.8 ️ #trekwomen @ Town Run Trail Park - Indy Parks and Recreation

❤
HAPPY BIRTHDAY TO MY DAMN SISTA Loyalty makes our bond special! She's always there when I need…

🎄
New red wine love thank you for the recommendation @user

❤
Happy birthday best friend. @ Main Street Burgers

😂
As I grow older I pay less attention to what men say. I just watch what they do! happy #tgif…

📸
Thank you @user and @user for such an amazing shoot! Can't wait to see the images …

❤
These two @ Gas Works Park

❤
Loving this sassy cut and color! #ohanahairsalon #Nashville #colorandcut #shortstyle #615…

❤
Pure #happiness @user ️ @ Washington Dulles International Airport

😂
I miss the people I peaked with @ Peek N Peak Resort

❤
Family portrait @ Silver Bay Marina

😂
Partners in crime. #stagemanagers #FirstDateCLT #openingnight @user @ Booth Playhouse…

😂
It's been too long (peep the fathead in the back) @ Mount Saint Mary…

❤
me beautiful d8 @ The Greene Turtle Newark De.

❤
We're thankful for 

In [8]:
    # NOTE: X_pd / Y_train come from Train.csv, so the figures printed here are
    # measured on the training data, not on the test set.
    train_scores = model.evaluate(X_pd, Y_train, verbose=1)
    loss, accuracy = train_scores
    print("Accuracy = %f  ;  loss = %f" % (accuracy, loss))

Accuracy = 0.354329  ;  loss = 2.210211


## Predict on user input

In [None]:
    
    # Read one sentence from the user and wrap it in a one-element array so it
    # can go through the same calls as the dataset columns.
    # (For a non-interactive run, assign a fixed sentence such as "I love you".)
    user_str = input("input your sentence:")
    X_user = np.array([user_str])  # input() already returns a str
    print(X_user[0])

In [None]:
    # Clean and vectorize the user's sentence with the tokenizer and length l1
    # that were used for the training data.
    X_user_rm = yy.corpus_pre(X_user)
    X_user_pd, _ = kr.toknz(X_user_rm, l1, tokenizer)

    # Sequential.predict_classes() was removed in TensorFlow 2.6; the argmax
    # over the per-class scores gives the same class id for a multi-class
    # model (assumes one output score per class, matching the one-hot Y_train).
    label_user = np.argmax(model.predict(X_user_pd), axis=-1)

    # Show the predicted emoji followed by the sentence it was predicted for.
    print(emoji_map['emoticons'][label_user[0]])
    print(X_user[0])