In [None]:

%matplotlib inline

import itertools
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import tensorflow
ps = PorterStemmer()

In [None]:
# Read the labelled training set into a DataFrame, decoding the file as ISO-8859-1.
data = pd.read_csv(
    'train.csv',
    encoding="ISO-8859-1",
)
# The held-out test file is currently not loaded:
# test_data = pd.read_csv('test.csv', encoding="ISO-8859-1")

# Show the first rows as the cell output.
data.head()

In [None]:
# Patterns are compiled once here instead of being re-parsed on every call.
_URL_RE = re.compile(r'((www\.[\S]+)|(https?://[\S]+))')
_HANDLE_RE = re.compile(r'@[\w]*')
_REPEAT_RE = re.compile(r'(.)\1{3,}')
_NON_LETTER_RE = re.compile(r'[^A-Za-z]')


def preProcessString(text, stemmer=None):
    """Normalise one raw tweet into a space-separated string of stemmed words.

    Steps, in order:
      1. URLs (``www.…`` / ``http(s)://…``) are replaced by a space.
      2. Twitter handles (``@user``) are replaced by a space.
      3. Any character repeated four or more times in a row is collapsed
         to a single occurrence (``jusssssst`` -> ``just``).
      4. Every character that is not an ASCII letter becomes a space.
      5. The text is lower-cased, split on whitespace and each word stemmed.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN are treated as missing and give ``''``;
        any other non-string value is converted with ``str()``.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to apply to each word. Defaults to the notebook-level
        ``ps`` (a ``PorterStemmer`` instance).

    Returns
    -------
    str
        The stemmed, lower-case words joined by single spaces.
    """
    # Missing values would otherwise fail on the first string operation.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    if stemmer is None:
        stemmer = ps

    # URLs must go first: removing handles first would cut a URL such as
    # https://host/@user/page in two and leave "page" behind as a word.
    text = _URL_RE.sub(' ', text)
    text = _HANDLE_RE.sub(' ', text)
    text = _REPEAT_RE.sub(r'\1', text)
    # Quotes, digits, dots and all other punctuation become spaces here, so no
    # separate quote-stripping or dot-collapsing step is needed; the split()
    # below discards the resulting runs of whitespace.
    text = _NON_LETTER_RE.sub(' ', text)

    words = text.lower().split()
    return ' '.join(stemmer.stem(word) for word in words)

In [13]:
# Clean every raw tweet element-wise and keep the result in a new column
# next to the original text.
data['Preprocessed_data'] = data['SentimentText'].map(preProcessString)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split

# Fixed seed so the shuffled 80/20 split selects the same rows on every run;
# without it the train/test membership changes each time the cell is executed.
SPLIT_SEED = 42
train, test = train_test_split(data, test_size=0.20, random_state=SPLIT_SEED)

# NOTE: despite its name this is a TF-IDF vectoriser (capped at 20000 features),
# not a plain count vectoriser. It is only constructed here, not fitted.
Count_vectorization = TfidfVectorizer(use_idf=True, max_features=20000)

In [16]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
from keras.datasets import imdb
from tensorflow.keras.preprocessing.text import one_hot, hashing_trick

# Embedding
max_features = 2000      # number of hash buckets == Embedding input dimension
maxlen = 200             # every sequence is padded/truncated to this length
embedding_size = 128

# Convolution
kernel_size = 3
filters = 250
pool_size = 2

# LSTM
lstm_output_size = 70

# Training
batch_size = 32
epochs = 3

# NOTE(review): this seeds NumPy's global RNG only. Seeding the TensorFlow
# backend is version-specific and is not configured here -- TODO.
SEED = 42
np.random.seed(SEED)


def encode_texts(texts):
    """Turn each text into a list of integer word ids via the hashing trick.

    `one_hot` hashes words with Python's built-in `hash`, which the Keras
    documentation describes as not stable across runs, so the same word
    could receive a different id after a kernel restart. Using the 'md5'
    hash keeps the word -> id mapping identical between runs; tokenisation
    (default filters, lower-casing, split on spaces) is otherwise the same.

    Parameters
    ----------
    texts : iterable of str
        Pre-processed documents.

    Returns
    -------
    list of list of int
        One id sequence per document, each id in [1, max_features - 1].
    """
    return [hashing_trick(doc, max_features, hash_function='md5') for doc in texts]


print('Loading data...')
x_train = encode_texts(train["Preprocessed_data"])
y_train = np.asarray(train["Sentiment"])
x_test  = encode_texts(test["Preprocessed_data"])
y_test  = np.asarray(test["Sentiment"])

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')

# Embedding -> dropout -> 1-D convolution -> max-pooling -> LSTM -> sigmoid unit.
model = Sequential()
model.add(Embedding(max_features, embedding_size, input_length=maxlen))
model.add(Dropout(0.25))
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(LSTM(lstm_output_size))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
# 20% of the training rows are held back by Keras for per-epoch validation.
model.fit(x_train, y_train,
          batch_size=batch_size,
          validation_split=0.2, epochs=epochs)
# Final check on the separate test split created earlier.
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Loading data...
Pad sequences (samples x time)
x_train shape: (79991, 200)
x_test shape: (19998, 200)
Build model...
Train...
Train on 63992 samples, validate on 15999 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.5073296641013493
Test accuracy: 0.7564256425404122
