In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from collections import defaultdict

import keras
import keras.backend as K
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import load_model

np.random.seed(69)

Using TensorFlow backend.


In [2]:
train_data = pd.read_csv('data/train_data.csv')
train_data.drop(train_data.columns[0], axis=1,inplace=True)
dev_data = pd.read_csv('data/dev_data.csv')
dev_data.drop(dev_data.columns[0], axis=1,inplace=True)

#data = pd.read_csv('data/all_data.csv')
#data.drop(data.columns[0], axis=1,inplace=True)
print('Process: Data reading is completed.')

#train_data['author_Id'] = '__label__'+train_data['author_Id'].astype(str)
train_data['is_retweet'] = 1*train_data['is_retweet']
train_data['is_modified_retweet'] = 1*train_data['is_modified_retweet']

#dev_data['author_Id'] = '__label__'+dev_data['author_Id'].astype(str)
dev_data['is_retweet'] = 1*dev_data['is_retweet']
dev_data['is_modified_retweet'] = 1*dev_data['is_modified_retweet']

#data['is_retweet'] = 1*data['is_retweet']
#data['is_modified_retweet'] = 1*data['is_modified_retweet']

Process: Data reading is completed.


In [None]:
#dev_data.to_csv('fastText/dev.txt', index=False, sep=' ', header=None)
#train_data.to_csv('fastText/train.txt', index=False, sep=' ', header=None)

In [3]:
#from sklearn.preprocessing import LabelEncoder
#labelencoder = LabelEncoder()
#train_data['author_Id_label'] = train_data['author_Id']
#dev_data['author_Id_label'] = dev_data['author_Id'])
#print(train_data.author_Id.max())
#print(dev_data.author_Id.max())
#print(train_data.author_Id_label.max())
#print(dev_data.author_Id_label.max())
#print(train_data.author_Id.head())
#print(train_data.author_Id_label.head())
#print(dev_data.author_Id.head())
#print(dev_data.author_Id_label.head())

y_train = to_categorical(train_data['author_Id'])
X_train = train_data['tweet']
#X_train = train_data.drop('author_Id', axis=1)

y_dev = to_categorical(dev_data['author_Id'])
X_dev = dev_data['tweet']
#X_dev = dev_data.drop('author_Id', axis=1)

#y = to_categorical(data['author_Id'])
#X = data['tweet']

print(len(y_train),len(X_train),len(y_dev),len(X_dev))
#print(X.shape)
#print(y.shape)
#print(data.shape)

246699 246699 82233 82233


In [4]:
def preprocess(text):
    text = text.replace("' ", " ' ")
    signs = set(',.:;"?!')
    prods = set(text) & signs
    if not prods:
        return text

    for sign in prods:
        text = text.replace(sign, ' {} '.format(sign) )
    return text

In [5]:
def create_docs(df, n_gram_max=2):
    def add_ngram(q, n_gram_max):
            ngrams = []
            for n in range(2, n_gram_max+1):
                for w_index in range(len(q)-n+1):
                    ngrams.append('--'.join(q[w_index:w_index+n]))
            return q + ngrams
        
    docs = []
    for doc in df:
        if type(doc) == float:
            print(doc)
            doc = str(doc)
        doc = preprocess(doc).split()
        docs.append(' '.join(add_ngram(doc, n_gram_max)))
    return docs

In [6]:
min_count = 3

docs_train = create_docs(X_train)
tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts(docs_train)
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

tokenizer = Tokenizer(num_words=num_words, lower=False, filters='')
tokenizer.fit_on_texts(docs_train)
docs_train = tokenizer.texts_to_sequences(docs_train)

maxlen = 256

docs_train = pad_sequences(sequences=docs_train, maxlen=maxlen)

print(docs_train[:100])

[[     0      0      0 ...   5096    148     77]
 [     0      0      0 ...    295  19225   5474]
 [     0      0      0 ...  11757  50442    911]
 ...
 [     0      0      0 ...  40561  86732  21234]
 [     0      0      0 ...    403   2281  24621]
 [     0      0      0 ... 157601  54855     45]]


In [7]:
min_count = 3

docs_dev = create_docs(X_dev)
tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts(docs_dev)
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

tokenizer = Tokenizer(num_words=num_words, lower=False, filters='')
tokenizer.fit_on_texts(docs_dev)
docs_dev = tokenizer.texts_to_sequences(docs_dev)

maxlen = 128

docs_dev = pad_sequences(sequences=docs_dev, maxlen=maxlen)

print(docs_dev[:100])

[[    0     0     0 ... 27682     1 37924]
 [    0     0     0 ...   808  1967   758]
 [    0     0     0 ... 10768  8477  1555]
 ...
 [    0     0     0 ... 27723  4100   465]
 [    0     0     0 ...  7111  5719   136]
 [    0     0     0 ...  1470   211 60103]]


In [8]:
#print(y_train[:100])
print(np.max(docs_train))
print(np.max(docs_dev))

220177
85097


In [9]:
input_dim = np.max(docs_train)+1
print('input_dim = ' + str(input_dim))
embedding_dims = 20

def create_model(embedding_dims=20, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(y_train.shape[1], activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

input_dim = 220178


In [None]:
#X_train, X_dev, y_train, y_dev = train_test_split(docs, y, test_size=0.25)

In [11]:
epochs = 10

model = create_model()
hist = model.fit(docs_train, y_train,
                 batch_size=64,
                 validation_data=(docs_dev, y_dev),
                 verbose=1,
                 epochs=epochs,
                 callbacks=[EarlyStopping(patience=3, monitor='val_loss', min_delta=0.0001)])

Train on 246699 samples, validate on 82233 samples
Epoch 1/10
Epoch 2/10
   192/246699 [..............................] - ETA: 4:11:27 - loss: 8.9535 - acc: 0.0000e+00



Epoch 3/10
   128/246699 [..............................] - ETA: 44:33 - loss: 8.8012 - acc: 0.0000e+00  



Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [1]:
model.save_weights('fastTextmodel_weights.h5')
model.save('fastTextmodel.h5')

NameError: name 'model' is not defined

In [None]:
#from sklearn.naive_bayes import MultinomialNB
#model = MultinomialNB()
#model = model.fit(text_bow_train, y_train)

In [17]:
hist = model.fit(docs_train, y_train,
                 batch_size=64,
                 validation_data=(docs_dev, y_dev),
                 verbose=1,
                 epochs=5,
                 callbacks=[EarlyStopping(patience=3, monitor='val_loss', min_delta=0.0001)])

Train on 246699 samples, validate on 82233 samples
Epoch 1/5
Epoch 2/5
   128/246699 [..............................] - ETA: 1:44:12 - loss: 7.4041 - acc: 0.0312



Epoch 3/5
Epoch 4/5
Epoch 5/5
