# Capstone project - Deep learning

## Imports

In [None]:
import pandas as pd
import keras
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, LeakyReLU, Dropout
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.compat.v1.losses import sparse_softmax_cross_entropy

## Dataset

In [None]:
# Load the spell-checked dataset from the Excel workbook into a DataFrame.
df = pd.read_excel(io="spell_checked_data.xlsx")

## Label encoding, Tokenization and Padding

In [None]:
# Encode the string class labels as consecutive integer ids.
label_encoder = LabelEncoder()
df['class_encoded'] = label_encoder.fit_transform(df['class'])

# Blank Excel cells are read back as NaN (a float) and purely numeric cells as
# numbers; the Tokenizer expects strings, so coerce the column once up front.
# For data that is already all strings this is a no-op.
texts = df['lemmatized_tweet'].fillna('').astype(str)

# NOTE(review): the vocabulary is fitted on the full dataset, i.e. before the
# train/test split — confirm this is acceptable for the reported test scores.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Convert each text to a list of word indices, then pad/truncate every
# sequence to exactly 15 tokens so the model receives a fixed-width matrix.
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, maxlen=15)
y = df['class_encoded'].values

## Splitting the data

In [None]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## CNN model

In [None]:
# Build and compile a 1-D convolutional text classifier over the padded
# token-index sequences.

# +1 because the Tokenizer's word indices start at 1; index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1
# Take the sequence width from the data instead of repeating the padding constant.
sequence_length = X_train.shape[1]
# Size the output head from the labels the encoder actually saw.
num_classes = len(label_encoder.classes_)

model_cnn = Sequential()
model_cnn.add(Embedding(input_dim=vocab_size, output_dim=300, input_length=sequence_length))
model_cnn.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
# NOTE(review): the convolution already applies ReLU and max-pooling keeps the
# values non-negative, so this LeakyReLU only ever sees inputs >= 0 and acts as
# the identity. Kept to preserve the original layer stack — confirm the intent.
model_cnn.add(LeakyReLU(alpha=0.01))
model_cnn.add(Dropout(0.5))

if num_classes <= 2:
    # Binary problem: a single sigmoid unit with binary cross-entropy.
    model_cnn.add(Dense(units=1, activation='sigmoid'))
    loss_name = 'binary_crossentropy'
else:
    # More than two integer-encoded classes: a single sigmoid unit cannot
    # represent them, so use one softmax unit per class with the sparse
    # (integer-label) categorical loss.
    model_cnn.add(Dense(units=num_classes, activation='softmax'))
    loss_name = 'sparse_categorical_crossentropy'

opt = keras.optimizers.Adam(learning_rate=0.0001)
model_cnn.compile(loss=loss_name, metrics=['accuracy'], optimizer=opt)

## Fitting the model and early stopping

In [None]:
# Stop training once the validation loss has not improved for 5 consecutive
# epochs, and roll the model back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Train for at most 20 epochs in mini-batches of 32, holding out 20% of the
# training data for validation (this is what 'val_loss' is computed on).
model_cnn.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=20,
    callbacks=[early_stopping],
)

## Evaluation on the test set

In [None]:
# Score the trained model on the held-out test split. The model is compiled
# with a single metric, so evaluate() yields the loss followed by the accuracy.
loss, accuracy = model_cnn.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")
print(f"Test Loss: {loss}")