In [1]:
import tensorflow as tf

In [2]:
import keras

In [3]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

In [4]:
train = pd.read_table('train.txt', delimiter = ';', header = None, )
val = pd.read_table('val.txt', delimiter = ';', header = None, )
test = pd.read_table('test.txt', delimiter = ';', header = None, )

In [5]:
data = pd.concat([train, val, test])
data.columns = ['text', 'label']

In [6]:
data.shape

(20000, 2)

In [7]:
data.isna().any(axis = 1).sum()

0

In [8]:
#text preprocessing
ps = PorterStemmer()

def preprocess(line):
    review = re.sub('[^a-zA-Z]', ' ', line)      #leave only characters from a to z
    review = review.lower()                      #lower the text
    review = review.split()                      #turn string into list of words
    
    #apply stemming
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]  #delete stop words
    
    #turn list into sentences
    return " ".join(review)

In [9]:
data ['text'] = data['text'].apply(lambda x: preprocess(x))

In [10]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
data['N_label'] = label_encoder.fit_transform(data['label'])

In [11]:
data['text']

0                                       didnt feel humili
1       go feel hopeless damn hope around someon care ...
2                    im grab minut post feel greedi wrong
3          ever feel nostalg fireplac know still properti
4                                            feel grouchi
                              ...                        
1995    keep feel like someon unkind wrong think get b...
1996              im feel littl cranki neg doctor appoint
1997                feel use peopl give great feel achiev
1998    im feel comfort derbi feel though start step s...
1999    feel weird meet w peopl text like dont talk fa...
Name: text, Length: 20000, dtype: object

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000, ngram_range = (1,3))
data_cv = cv.fit_transform(data['text']).toarray()

In [13]:
data_cv

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data_cv, data['N_label'], test_size = 0.25, random_state = 42)

In [15]:
# first neural network with Keras tutorial
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np  # Make sure to import numpy for working with arrays

# Define the keras model
model = Sequential()
model.add(Dense(12, input_shape=(x_train.shape[1],), activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(6, activation='softmax'))

# Compile the keras model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the keras model on the dataset
model.fit(x_train, y_train, epochs=10, batch_size=10)  # Corrected 'epoch' to 'epochs'

# Evaluate the keras model
_, accuracy = model.evaluate(x_train, y_train)
print('Accuracy: %.2f' % (accuracy * 100))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 99.65


In [16]:
_, accuracy = model.evaluate(x_test, y_test)
print('accuracy: %.2f' % (accuracy*100))

accuracy: 85.06


In [17]:
text = 'I feel sad'
text = preprocess(text)
array = cv.transform([text]).toarray()
pred = model.predict(array)
a = np.argmax(pred, axis=1)
label_encoder.inverse_transform(a)[0]



'sadness'

In [18]:
tf.keras.models.save_model(model, 'my_model.h5')

  tf.keras.models.save_model(model, 'my_model.h5')


In [19]:
import pickle
pickle.dump(label_encoder, open('encoder.pkl','wb'))
pickle.dump(cv, open('CountVectorizer.pkl','wb'))
pickle.dump(preprocess, open('preprocess.pkl', 'wb'))