In [None]:
import pandas as pd
import csv
# Load the tab-separated training file. There is no header row, and
# QUOTE_NONE makes the parser treat quote characters as ordinary text.
training = pd.read_csv(
    'train-small.tsv',
    sep="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
)
training.columns = ['label', 'text']
# Keep only the rows whose text field is present.
training = training[training['text'].notna()]
training.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the examples as a dev set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_dev, y_train_str, y_dev_str = train_test_split(
    training['text'],
    training['label'],
    test_size=0.2,
    random_state=42,
)
# Keras `fit` in TF 2.x cannot pair a NumPy feature matrix with a plain Python
# list of labels ("Failed to find data adapter"), so materialise the integer
# labels as NumPy arrays rather than lists.
y_train = y_train_str.astype(int).to_numpy()
y_dev = y_dev_str.astype(int).to_numpy()

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Vocabulary cap handed to the tokenizer as `num_words`.
V = 10_000
mytokenizer = Tokenizer(num_words=V)

In [None]:
# Build the vocabulary from the training split only, so that words and word
# frequencies from the held-out dev set do not leak into the index.
mytokenizer.fit_on_texts(X_train)
# Number of distinct words seen while fitting (not capped by num_words).
print(len(mytokenizer.word_index))

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Map every text to its list of integer word indices.
X_train_seq = mytokenizer.texts_to_sequences(X_train)
X_dev_seq = mytokenizer.texts_to_sequences(X_dev)
# Size the padded width from the tokenizer's own output rather than from
# str.split(): the tokenizer filters punctuation and drops words outside the
# num_words cap, so whitespace counts can be too small (pad_sequences would
# then silently truncate) or too large (needless padding).
max_length = max(len(seq) for seq in X_train_seq + X_dev_seq)
# Zero-pad at the end of each sequence up to the common length.
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_dev_padded = pad_sequences(X_dev_seq, maxlen=max_length, padding='post')

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
# Baseline classifier: embedding layer -> LSTM -> single sigmoid unit.
emb_dim = 100
model = Sequential([
    Embedding(input_dim=V, output_dim=emb_dim, input_length=max_length),
    LSTM(units=100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 10 epochs in batches of 128, validating on the dev split.
model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=128,
    epochs=10,
    verbose=2,
    validation_data=(X_dev_padded, y_dev),
)

In [None]:
# Score two extra sentences, using the same tokenise-and-pad steps that were
# applied to the training data.
more_tests = [
    "Mae'n ddiwrnod hyfryd yn y gymdogaeth.",
    "Rwy'n drist iawn bod fy nghyrsiau i gyd ar-lein.",
]
more_tests_seq = mytokenizer.texts_to_sequences(more_tests)
more_tests_padded = pad_sequences(more_tests_seq, padding='post', maxlen=max_length)
model.predict(more_tests_padded)


In [None]:
from sklearn.metrics import classification_report, accuracy_score
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a model with
# a single sigmoid output the equivalent is to threshold the predicted
# probability at 0.5, which yields the same (n, 1) int32 array of 0/1 labels.
y_classes = (model.predict(X_dev_padded) > 0.5).astype("int32")
print(accuracy_score(y_dev, y_classes))
print(classification_report(y_dev, y_classes))

In [None]:
import numpy as np
# Load pretrained word vectors into a {word: float32 vector} lookup.
# Each data line is read as "<word> <v1> <v2> ...", split on whitespace.
embeddings = dict()
# Open explicitly as UTF-8 instead of the platform default so non-ASCII tokens
# decode the same way everywhere (assumes the file is UTF-8 -- TODO confirm).
# The context manager closes the file even if parsing raises.
with open('cy-vectors.txt', 'r', encoding='utf-8') as f:
    for line in f:
        pieces = line.split()
        if len(pieces) < 3:
            # Blank line or a "<count> <dim>"-style header: not a word vector.
            continue
        # Parse to floats up front rather than keeping an array of strings;
        # a non-numeric component raises ValueError here.
        embeddings[pieces[0]] = np.asarray(pieces[1:], dtype='float32')

In [None]:
# One row per tokenizer index; the extra row accounts for index 0, which
# word_index never assigns to a word.
totalV = len(mytokenizer.word_index) + 1
embedding_matrix = np.zeros((totalV, emb_dim))
# Copy in the pretrained vector for every vocabulary word that has one;
# words without a pretrained vector keep an all-zero row.
for word, row in mytokenizer.word_index.items():
    if embeddings.get(word) is not None:
        embedding_matrix[row] = embeddings[word]

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.initializers import Constant
# Second classifier: same LSTM architecture, but the embedding layer starts
# from the pretrained vectors in `embedding_matrix` and is frozen.
model = Sequential()
model.add(Embedding(
    input_dim=totalV,
    output_dim=emb_dim,
    # Seed the layer through the documented initializer hook (using the
    # `Constant` initializer this cell already imports) instead of the legacy
    # `weights=[...]` keyword, which is not part of the documented Embedding
    # signature in every Keras release.
    embeddings_initializer=Constant(embedding_matrix),
    input_length=max_length,
    # Keep the pretrained vectors fixed during training.
    trainable=False,
))
model.add(LSTM(units=100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 10 epochs in batches of 128, validating on the dev split.
model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=128,
    epochs=10,
    verbose=2,
    validation_data=(X_dev_padded, y_dev),
)

In [None]:
from sklearn.metrics import classification_report, accuracy_score
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a model with
# a single sigmoid output the equivalent is to threshold the predicted
# probability at 0.5, which yields the same (n, 1) int32 array of 0/1 labels.
y_classes = (model.predict(X_dev_padded) > 0.5).astype("int32")
print(accuracy_score(y_dev, y_classes))
print(classification_report(y_dev, y_classes))