In [1]:
# Import of packages and libraries:
import pandas as pd
import xml.etree.cElementTree as et
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import gensim
from keras.layers import Input
from keras.layers.embeddings import Embedding
from keras import optimizers
from keras.layers import Dense, concatenate, Activation, Dropout
from keras.models import Model
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import GlobalMaxPooling1D
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# XML Parser:

In [2]:
def xml_parser(path):
    d = dict()
    root = et.parse(path)
    rows = root.findall('.//column')
    for row in rows:
        d.setdefault(list(row.attrib.values())[0],[]).append(row.text)
    df = pd.DataFrame.from_dict(d)
    return df

In [3]:
# Parse data:
data_train = pd.DataFrame()
data_test = pd.DataFrame()
path_train = ['tkk_train_2016.xml', 'bank_train_2016.xml']
path_test = ['tkk_test_etalon.xml', 'banks_test_etalon.xml']
for path in path_train:
    df = xml_parser(path)
    data_train = pd.concat([df, data_train], ignore_index=True)
for path in path_test:
    df = xml_parser(path)
    data_test = pd.concat([df, data_test], ignore_index=True)


In [4]:
data_train

Unnamed: 0,alfabank,bankmoskvy,beeline,date,gazprom,id,komstar,megafon,mts,raiffeisen,rostelecom,rshb,sberbank,skylink,tele2,text,twitid,uralsib,vtb
0,0,,,1406224554,,1,,,,,,,,,,http://t.co/YEVHuvVGA1 Взять кредит тюмень аль...,492367586156630000,,
1,,,,1406224691,,2,,,,,,,,,,Мнение о кредитной карте втб 24 http://t.co/SB...,492368160923070000,,0
2,,,,1406224798,,3,,,,0,,,,,,«Райффайзенбанк»: Снижение ключевой ставки ЦБ ...,492368608346260000,,
3,,,,1406225412,,4,,,,,,,0,,,Современное состояние кредитного поведения в р...,492371181946030000,,
4,,,,1406226283,,5,,,,,,,1,,,@sawik_shuster @YevhenS Главное чтоб банки СБЕ...,492374836564750000,,1
5,,0,,1406226450,,6,,,,,,,,,,http://t.co/Qr6JbSVTxY Оформить краткосрочный ...,492375537080600000,,
6,,,,1406226505,,7,,,,,,,,,,Самый выгодный автокредит в втб 24 http://t.co...,492375766907510000,,1
7,,,,1406226960,,9,,,,,,,0,,,Кредит иногородним в москве сбербанк http://t....,492377674535680000,,
8,,,,1406226982,,10,,,,,,0,,,,Кредитный калькулятор россельхозбанк чита http...,492377768232250000,,
9,,,,1406227218,,11,,,,,,,,,,http://t.co/h6r6GdBe4H Легко можно получить де...,492378757769220000,,1


# Data Preparation:

In [5]:
data_train.drop(['date', 'id', 'twitid'], axis=1, inplace = True, errors='ignore')
data_train.head()

Unnamed: 0,alfabank,bankmoskvy,beeline,gazprom,komstar,megafon,mts,raiffeisen,rostelecom,rshb,sberbank,skylink,tele2,text,uralsib,vtb
0,0.0,,,,,,,,,,,,,http://t.co/YEVHuvVGA1 Взять кредит тюмень аль...,,
1,,,,,,,,,,,,,,Мнение о кредитной карте втб 24 http://t.co/SB...,,0.0
2,,,,,,,,0.0,,,,,,«Райффайзенбанк»: Снижение ключевой ставки ЦБ ...,,
3,,,,,,,,,,,0.0,,,Современное состояние кредитного поведения в р...,,
4,,,,,,,,,,,1.0,,,@sawik_shuster @YevhenS Главное чтоб банки СБЕ...,,1.0


In [6]:
data_train.fillna(0)

Unnamed: 0,alfabank,bankmoskvy,beeline,gazprom,komstar,megafon,mts,raiffeisen,rostelecom,rshb,sberbank,skylink,tele2,text,uralsib,vtb
0,0,,0,,0,0,0,,0,,,0,0,http://t.co/YEVHuvVGA1 Взять кредит тюмень аль...,,
1,,,0,,0,0,0,,0,,,0,0,Мнение о кредитной карте втб 24 http://t.co/SB...,,0
2,,,0,,0,0,0,0,0,,,0,0,«Райффайзенбанк»: Снижение ключевой ставки ЦБ ...,,
3,,,0,,0,0,0,,0,,0,0,0,Современное состояние кредитного поведения в р...,,
4,,,0,,0,0,0,,0,,1,0,0,@sawik_shuster @YevhenS Главное чтоб банки СБЕ...,,1
5,,0,0,,0,0,0,,0,,,0,0,http://t.co/Qr6JbSVTxY Оформить краткосрочный ...,,
6,,,0,,0,0,0,,0,,,0,0,Самый выгодный автокредит в втб 24 http://t.co...,,1
7,,,0,,0,0,0,,0,,0,0,0,Кредит иногородним в москве сбербанк http://t....,,
8,,,0,,0,0,0,,0,0,,0,0,Кредитный калькулятор россельхозбанк чита http...,,
9,,,0,,0,0,0,,0,,,0,0,http://t.co/h6r6GdBe4H Легко можно получить де...,,1


# Get Labels:

In [7]:
def label(row):
    if (row == '1').any():
        label = 1
    elif (row == '0').any():
            label = 0
    else:
        label = -1
    return(label)

In [8]:
data_train['label'] = data_train.apply(label, axis=1)
data_train.label.value_counts()

 0    11832
-1     4145
 1     2058
Name: label, dtype: int64

In [9]:
data_test['label'] = data_test.apply(label, axis=1)
data_test.label.value_counts()

 0    3251
-1    1769
 1     540
Name: label, dtype: int64

# Clean Text:

In [10]:
def clean_text(text):
    text = text.lower().replace("ё", "е")
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text)
    text = re.sub('@[^\s]+', 'USER', text)
    text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip()

In [11]:
data_train['clean_text'] = data_train.text.apply(clean_text)
data_test['clean_text'] = data_test.text.apply(clean_text)

In [12]:
data_train

Unnamed: 0,alfabank,bankmoskvy,beeline,gazprom,komstar,megafon,mts,raiffeisen,rostelecom,rshb,sberbank,skylink,tele2,text,uralsib,vtb,label,clean_text
0,0,,,,,,,,,,,,,http://t.co/YEVHuvVGA1 Взять кредит тюмень аль...,,,0,URL взять кредит тюмень альфа банк
1,,,,,,,,,,,,,,Мнение о кредитной карте втб 24 http://t.co/SB...,,0,0,мнение о кредитной карте втб 24 URL
2,,,,,,,,0,,,,,,«Райффайзенбанк»: Снижение ключевой ставки ЦБ ...,,,0,райффайзенбанк снижение ключевой ставки цб на ...
3,,,,,,,,,,,0,,,Современное состояние кредитного поведения в р...,,,0,современное состояние кредитного поведения в р...
4,,,,,,,,,,,1,,,@sawik_shuster @YevhenS Главное чтоб банки СБЕ...,,1,1,USER USER главное чтоб банки сбер и втб
5,,0,,,,,,,,,,,,http://t.co/Qr6JbSVTxY Оформить краткосрочный ...,,,0,URL оформить краткосрочный кредит оао банк москвы
6,,,,,,,,,,,,,,Самый выгодный автокредит в втб 24 http://t.co...,,1,1,самый выгодный автокредит в втб 24 URL
7,,,,,,,,,,,0,,,Кредит иногородним в москве сбербанк http://t....,,,0,кредит иногородним в москве сбербанк URL
8,,,,,,,,,,0,,,,Кредитный калькулятор россельхозбанк чита http...,,,0,кредитный калькулятор россельхозбанк чита URL
9,,,,,,,,,,,,,,http://t.co/h6r6GdBe4H Легко можно получить де...,,1,1,URL легко можно получить денежный кредит ы втб...


# TF-IDF Vectorizer: 

In [13]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(data_train.text)
X_train.shape

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(18035, 35957)

In [14]:
X_test = vectorizer.transform(data_test.text)
X_test.shape

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(5560, 35957)

# Logistic Regression Model:

In [15]:
model = LogisticRegression(multi_class='ovr')

In [16]:
model.fit(X_train, data_train.label)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
score = model.score(X_test, data_test.label)
print('Model Accuracy:', (score*100), '%')

Model Accuracy: 70.70143884892086 %


# Word2Vec:

In [18]:
# Merge train & test data:
data = pd.concat([data_train, data_test], ignore_index=True)

# Split text:
split_text = data.clean_text.apply(lambda sent: sent.lower().split())

## Training W2V on tweet texts:

In [19]:
w2v = gensim.models.Word2Vec(sentences=split_text, sg=1, min_count=1)

In [20]:
# Check # of unique words:
keys = len(list(w2v.wv.vocab.keys()))
keys

28652

In [21]:
split_text.apply(lambda row: len(row)).value_counts()

6     2386
7     1937
8     1876
5     1848
9     1681
10    1463
11    1411
12    1085
13    1040
16     947
17     927
14     911
15     903
18     880
19     783
20     713
4      696
21     601
22     426
23     312
3      212
24     203
25     116
2       95
26      75
27      33
28      16
29       8
1        5
30       4
32       2
Name: clean_text, dtype: int64

## Transforming Data:

In [22]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data.clean_text)
X_train = tokenizer.texts_to_sequences(data_train.clean_text)
X_test = tokenizer.texts_to_sequences(data_test.clean_text)

In [23]:
X_train = pad_sequences(X_train, maxlen=30)
X_test = pad_sequences(X_test, maxlen=30)

In [24]:
X_train

array([[    0,     0,     0, ...,   924,    31,    14],
       [    0,     0,     0, ...,    13,    26,     1],
       [    0,     0,     0, ...,  1996,   208, 13012],
       ...,
       [    0,     0,     0, ...,    20,   601,   532],
       [    0,     0,     0, ...,    20,   601,   532],
       [    0,     0,     0, ...,    20,   601,   532]], dtype=int32)

## Embedding Matrix:

In [25]:
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 100))
vec = []
for word, i in tokenizer.word_index.items():
    if word in w2v.wv.vocab:
        embedding_vector = w2v.wv.get_vector(word)
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        vec.append(word)

In [26]:
embedding_matrix.shape

(28653, 100)

## CNN Model:

In [27]:
n = 30
tweet_input = Input(shape=(n,), dtype='int32')
inp = Embedding(keys+1, 100, input_length=n,
                weights=[embedding_matrix], trainable=False)(tweet_input)

In [28]:
#del branches

In [29]:
branches = []

for size, filters_count in [(2, 10), (3, 10), (4, 10), (5, 10)]:
    for i in range(filters_count):
        # Add Conv. layer
        branch = Conv1D(filters=1, kernel_size=size, padding='valid', activation='relu')(inp)
        # Subsampling layer
        branch = GlobalMaxPooling1D()(branch)
        branches.append(branch)
x = concatenate(branches, axis=1) 
drop_1 = Dropout(0.2)(x)

# Now flatten to 1D, apply FC > Softmax (with dropout) > softmax
hidden = Dense(30, activation='softmax')(drop_1)
drop_3 = Dropout(0.2)(hidden)
out = Dense(3, activation='softmax')(drop_3)

# Initiate Model
model = Model(input=tweet_input, output=out) 

# Compile Model
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy']) 
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 30, 100)      2865300     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 29, 1)        201         embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 29, 1)        201         embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_3 (



In [30]:
# Fit Model
model.fit([X_train], y=to_categorical(data_train.label.as_matrix()+1), verbose=1, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x12feeccc0>

### Results:

In [31]:
model.evaluate(X_test, to_categorical(data_test.label.as_matrix()+1))



[0.7394580670183512, 0.6841726618705036]

In [35]:
data_train.label.as_matrix()+1
y_ = model.predict(X_test)
y_[-1] = [0,0,1]

In [38]:
f_macro = f1_score(to_categorical([np.argmax(x) for x in y_]), to_categorical(data_test.label.as_matrix()+1), average='macro')
f_micro = f1_score(to_categorical([np.argmax(x) for x in y_]), to_categorical(data_test.label.as_matrix()+1), average='micro')
print('F1 Score (Macro):', f_macro)
print('F1 Score (Micro):', f_micro)

F1 Score (Macro): 0.47042022436657494
F1 Score (Micro): 0.6843525179856115
