# Общий конвейер

In [1]:
import numpy as np
import keras
import pandas as pd

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D

from keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import TweetTokenizer

import pyodbc

from re import sub

Using TensorFlow backend.


## 1. Загрузка данных

In [2]:
import pandas
df_train = pandas.read_csv("data/train.csv")
df_test  = pandas.read_csv("data/test.csv")
df_val   = pandas.read_csv("data/val.csv")

In [3]:
df_train.describe()

Unnamed: 0,id,class
count,181467.0,181467.0
mean,90733.0,0.505993
std,52385.154987,0.499965
min,0.0,0.0
25%,45366.5,0.0
50%,90733.0,1.0
75%,136099.5,1.0
max,181466.0,1.0


## 2. Токенизация
Кодируем слова индексами (вычисленными по частоте употребления).
Using russian http://www.ruscorpora.ru/en/

from keras.preprocessing.text import Tokenizer Убираем, потому что он режет символы пунктуации
#### Step 1. Clear dataset. 
Select words one by one. Symbols are meaningful because of smiles and emotions.

We will create a HashMap by using a Python dictionary to store the word frequencies of a book.
A dictionary is an associative array (also known as hashes).
Any key of the dictionary is associated, or mapped, to a value.
The values of a dictionary can be any Python data type, so dictionaries are unordered key-value-pairs.

By creating the dictionary, we will store the words as the keys and the value will represent the count. By doing this, we can retrieve any word without having to recount every single word.

#### Step 2. Select meaningful words.
#### Step 3. Calculate frequency of each word
#### Step 4. Replace words by indexes

In [4]:
import nltk
import string
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize

hash_map = {}

In [5]:
class SentimentAnalysis:
    max_words = 20000
    batch_size = 32
    epochs = 30
    max_len = 40
    cnstr = 'Trusted_Connection=yes;DRIVER={SQL Server};SERVER=GORDAPC;DATABASE=positive;UID=sa;PWD=49649952'
    map = {}
    x_train, y_train, x_test, y_test = ([] for i in range(4))

    def data_generator(self, constring, query):
        cnxn = pyodbc.connect(constring)
        cursor = cnxn.cursor()
        cursor.execute(query)

        y, t = [], []
        for row in cursor:
            r_text = row.ttext
            r_type = row.ttype
            y.append(r_text)
            t.append(r_type)

            if len(y) == self.batch_size:
                npx = np.array(y)
                npy = np.array(t)
                yield npx, npy
                y, t = [], []
        pyodbc.Connection.close(cnxn)
    
    def tokenize(self, file_text):
        if file_text is not None:
            #firstly let's apply nltk tokenization
            #tokens = nltk.word_tokenize(file_text)
            #упоминания
            file_text = sub(r'@\w+', "", file_text) 
            #ссылки
            file_text = sub(r'http[^\s]+', "", file_text)
            #хэштеги
            file_text = sub(r'#\w+', "", file_text)
            #RT
            file_text = sub(r'RT +: +', "", file_text)
            #\n
            file_text = sub(r'\n', " ", file_text)


            tknzr = TweetTokenizer()
            tokens = tknzr.tokenize(file_text)

            #let's delete punctuation symbols
            stop_words = ([',','\\','/','*','','-','http',';',':','@',',','.','#','"','n','—'])
            tokens = [i for i in tokens if ( i not in stop_words )]

            #deleting stop_words
            stop_words = list(set(stopwords.words('russian'))-set(['не','лучше','больше','никогда','хорошо']))
            #stop_words.extend(['что', 'это', 'так', 'вот', 'быть', 'в', '—', 'к', 'на', 'http', 'чем', 'х','ж','же','\\n','\\nя'
            #                   ,'еще','ещё','d','rt'])

            tokens = [i for i in tokens if ( i not in stop_words )]

            #cleaning words
            #tokens = [i.replace("«", "").replace("»", "") for i in tokens]
            
            return tokens
        else:
            return None
    
    def map_words(self, tokens):
        if tokens is not None:
            for word in tokens:
                word = word.lower()
                # Word Exist?
                if word in hash_map:
                    hash_map[word] = hash_map[word] + 1
                else:
                    hash_map[word] = 1

            return hash_map
        else:
            return None
        
    def frequency_mapping(self):
        hash_map.clear()

        query = "SELECT [ttext], [ttype] FROM [dbo].[mixedmessages]"
        pdg = self.data_generator(self.cnstr, query)


        for current_set in pdg:
            for sentence in current_set[0]:
                words = self.tokenize(sentence)
                self.map = self.map_words(words)
        most_popular_words = sorted(self.map, key=self.map.get, reverse=True)[:self.max_words]
        
        self.map = {key: value for key, value in self.map.items() if key in most_popular_words}
    
    def get_frequency(self, word):
        for word in word_list:
            print( str(self.map.get(word,0)))
    
    def vectorize(self,data):
        max_frequency = max(self.map.values())
        
        x_data = []
        
        for index, row in data.iterrows():
            if pd.isnull(row.text):
                words = sa.tokenize('')
            else:
                words = sa.tokenize(row.text)
            w = []
            #print(words)
            for word in words:
                #print(word, map.get(word,0))
                w.append(self.map.get(word,0))
            x_data.append(w)

        x_data = np.array(x_data)
        x_data = pad_sequences(x_data, maxlen=self.max_len)
        #print(x_train)
        return x_data
    
    def tokenize_message(self, string):
        words = self.tokenize(string)
        res = []
        for word in words:
            res.append(self.map.get(word,0))
        ntm = np.array([res])

        nres = pad_sequences(ntm, maxlen=self.max_len)
            #res.append([word_2_ind[w] for w in sent.split(' ')])
        return nres#np.pad(ntm, (maxlen-len(ntm),0),'constant', constant_values=(0))
    
    def getRating(self, value):
        return {
             value < 0.2: 'Крайне негативная оценка',
             0.2 <= value < 0.4: 'Негативная оценка',
             0.4 <= value < 0.6: 'Нейтральная оценка',
             0.6 <= value < 0.8: 'Положительная оценка',
             0.8 <= value:       'Крайне положительная оценка'
        }[True]
    
    def sentiment(self, model, text):
        inp = np.array(tokenize_message(text), dtype=np.int32)
        p = model.predict(inp)
        return getRating(p[0][0]), p[0][0]

In [6]:
sa = SentimentAnalysis()

создание словаря

In [7]:
sa.frequency_mapping()

In [8]:
from keras.preprocessing import sequence

print('Vectorizing sequence data...')
x_train = sa.vectorize(df_train)
x_test  = sa.vectorize(df_test )
x_val   = sa.vectorize(df_val  )


Vectorizing sequence data...


KeyboardInterrupt: 

In [None]:
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

In [None]:
x_train

In [None]:
print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
num_classes = 2
y_train = keras.utils.to_categorical(df_train["class"], num_classes)
y_val = keras.utils.to_categorical(df_val["class"], num_classes)
print('y_train shape:', y_train.shape)
print('y_val shape:', y_val.shape)

In [None]:
y_val

In [None]:
print('Building model sequentially 1...')
model = Sequential()
model.add(Embedding(input_dim=max(sa.map.values())+1, output_dim=128, input_length=sa.max_len))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [None]:
model.layers

In [None]:
print(model.to_yaml())

In [None]:
import graphviz
import pydot_ng as pydot
print (pydot.find_graphviz())

In [None]:
from keras.utils.vis_utils import plot_model
plot_model(model, to_file='model.png', show_shapes=True)

In [None]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))

In [None]:
from keras.objectives import categorical_crossentropy
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
from keras.callbacks import TensorBoard  
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
from keras.callbacks import EarlyStopping  
early_stopping=EarlyStopping(monitor='val_loss', patience=3)  


history = model.fit(x_train, y_train,
                    batch_size=sa.batch_size,
                    epochs=sa.epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

In [None]:
score = model.evaluate(x_val, y_val, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

In [None]:
results = model.predict(x_test, batch_size=batch_size, verbose=1)

In [71]:
x_text = tokenizer.texts_to_sequences(['Мне не нравится фильм. Сюжет совсем неинтересный и актеры сыграли плохо'])

x_text = pad_sequences(x_text, maxlen=max_len)

model.predict(x_text)


array([[  9.99674082e-01,   3.25976143e-04]], dtype=float32)

In [64]:
print('x_train shape:', x_text.shape)

x_train shape: (19, 40)


In [67]:
df_train['text'][0]

'Делаю пробное печенье по рецепту makeupkaty , пока что без формы) http://t.co/bRZjtMdXyd'