In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
sw = stopwords.words('english')
sw.remove('not')

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow as tf
from tensorflow.keras.layers import Dense,Embedding,Dropout,Conv1D,MaxPooling1D,Dropout,LSTM
from tensorflow.keras.models import Model, Sequential



In [2]:
DATASET_COLUMNS=['target','ids','date','flag','user','text']
DATASET_ENCODING = "ISO-8859-1"
data = pd.read_csv("/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv",encoding=DATASET_ENCODING, names=DATASET_COLUMNS)
data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
print("Length of data", len(data))

Length of data 1600000


In [4]:
data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [5]:
data['target'] = data['target'].replace(4, 1)
data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [6]:
def clean_texts(text):
    text = re.sub(r'(@[A-Za-z0–9_]+)|[^\w\s]|#|http\S+', '', text)
    text = " ".join([word for word in text.split() if word not in sw])
    text = text.lower()
    return text

In [7]:
data['text'] = data['text'].apply(clean_texts)

In [8]:
data['text'][:10]

0    awww thats bummer you shoulda got david carr t...
1    upset cant update facebook texting might cry r...
2    i dived many times ball managed save 50 the re...
3                     whole body feels itchy like fire
4                       not behaving im mad i cant see
5                                       not whole crew
6                                             need hug
7    hey long time see yes rains bit bit lol im fin...
8                                           nope didnt
9                                            que muera
Name: text, dtype: object

In [9]:
x = data['text']
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [10]:
print('Length train data', len(X_train))
print('Length test data', len(X_test))
print('Length val data', len(X_val))

Length train data 1152000
Length test data 160000
Length val data 288000


In [11]:
tokenizer = Tokenizer(num_words=40000)
tokenizer.fit_on_texts(list(X_train))
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(X_test)

In [12]:
len(tokenizer.word_index)

365662

In [13]:
max_words = 100
X_train = pad_sequences(X_train, maxlen=max_words, padding = 'post')
X_val = pad_sequences(X_val, maxlen=max_words, padding = 'post')
X_test = pad_sequences(X_test, maxlen=max_words, padding = 'post')

In [14]:
y_train = np.array(y_train)
y_test = np.array(y_test)
y_val = np.array(y_val)

In [15]:
model = Sequential()
model.add(Embedding(40000, 10, input_length=X_train.shape[1]))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 10)           400000    
                                                                 
 conv1d (Conv1D)             (None, 100, 32)           992       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 50, 32)           0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 50, 32)            3104      
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 25, 32)           0         
 1D)                                                             
                                                                 
 conv1d_2 (Conv1D)           (None, 25, 32)            3

In [16]:
model.fit(X_train, y_train, validation_data=(X_val,y_val), epochs=10, batch_size=256, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 

In [17]:
test_metrics = model.evaluate(X_test, y_test)
print(f"Test accuracy - {test_metrics[1]*100:.2f}%")

Test accuracy - 80.27%


In [18]:
test_input = "you are the worst person i have met."
test_input = [clean_texts(test_input)]
test_input = tokenizer.texts_to_sequences(test_input)
test_input = pad_sequences(test_input, maxlen=max_words, padding = 'post')

sentiment = 'Positive' if model.predict(test_input)[0][0] >= 0.5 else 'Negative'
print(f"Sentiment: {sentiment}")

Sentiment: Negative


In [19]:
model.save("sentiment_analysis.keras")

In [20]:
import io
import json

In [21]:
tokenizer_json = tokenizer.to_json()
with io.open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))