In [0]:
# Mount Google Drive at /content/drive; the dataset path used later lives under it.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
!pip install tensorflow==1.14.0

In [0]:
#import libraries
import time
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import matplotlib.pyplot as plt
import seaborn as sns
#Data Cleaning
from nltk.corpus import stopwords
from keras.layers.embeddings import Embedding
from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer
from keras.layers.core import Dense, Dropout, Activation, Lambda
import string
#Model Evaluation
from sklearn.metrics import accuracy_score
from sklearn import metrics
#Model Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.models import Sequential

In [0]:
#Stemmer and Lemmatizer initialization
porter = PorterStemmer()
lancaster=LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
# English stopword list, consulted by datacleaning() when remove_stopwords is set
stopwords_en = stopwords.words("english")
# Punctuation characters, consulted by datacleaning() when removePuncs is set
punctuations="?:!.,;'\"-()*"

In [0]:
import re
def newdataclean(uncleandata):
  """Lower-case each phrase, blank out every non-letter and trim both ends.

  uncleandata must expose ``.values`` holding strings (e.g. a pandas Series).
  Returns a list of cleaned strings in the same order as the input.
  """
  non_letters = re.compile('[^a-zA-Z]')
  return [non_letters.sub(' ', phrase.lower()).strip()
          for phrase in uncleandata.values]

In [0]:
#utility function to clean the dataset
def datacleaning(remove_stopwords,useStemming,useLemma,removePuncs,newdata):
  """Tokenise and normalise every text in ``newdata``.

  Args:
    remove_stopwords: drop tokens found in the English stopword list.
    useStemming: stem each kept token with the Lancaster stemmer.
    useLemma: lemmatise each kept token with the WordNet lemmatizer.
    removePuncs: drop tokens that occur in ``punctuations``.
    newdata: object exposing ``.values`` with one text per entry
      (e.g. a pandas Series).

  Returns:
    A list with one space-joined, lower-cased string per input text.
  """
  # Set lookup instead of scanning the stopword list for every token.
  stopword_set = set(stopwords_en)
  cleanReview=[]
  for review in newdata.values:
    tmpReview=[]
    for w in nltk.word_tokenize(review):
        newWord = str(w).lower() #Lower-cased form of the token
        # Compare the lower-cased token: the stopword list is lower-case, so
        # testing the raw token let capitalised stopwords such as "The" through.
        if remove_stopwords and (newWord in stopword_set):
            continue #skip the word and don't add it to the normalized review
        if removePuncs and (w in punctuations):#token occurs in the punctuation string
            continue #skip the word and don't add it to the normalized review
        if useStemming:
            #Keep one stemmer commented out
            #newWord = porter.stem(newWord) #Use Porter stemmer
            newWord = lancaster.stem(newWord) #Use Lancaster stemmer
        if useLemma:
            newWord = wordnet_lemmatizer.lemmatize(newWord)
        tmpReview.append(newWord) #Add normalized word to the tmp review
    cleanReview.append(' '.join(tmpReview))
  return cleanReview

In [0]:
# Path of the combined news / DJIA CSV on the mounted Drive
url="/content/drive/My Drive/NLP/Combined_News_DJIA.csv"
# Read the file, decoding it as ISO-8859-1
dataset = pd.read_csv(url,encoding = "ISO-8859-1")

In [0]:
# (rows, columns) of the loaded dataset
dataset.shape

In [0]:
# Preview the first five rows
dataset.head(5)

In [0]:
#Count of rows for each Label value (sentiment class)
# NOTE(review): df_plot is not referenced again in the cells visible here.
df_plot=dataset['Label'].value_counts()

In [0]:
#Plot the sentiment class count
plt.style.use("ggplot")
plt.figure(figsize=(5,4))
# Bar height is the number of non-null Top1 entries within each Label group.
# NOTE(review): `fig` receives whatever plot.bar returns (presumably a matplotlib Axes, not a Figure) — confirm before reusing it.
fig = dataset.groupby('Label').Top1.count().plot.bar(ylim=0, width=0.4)
plt.title('Label Count')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Sentiment', fontsize=12)
plt.show()

In [0]:
#Split the dataset into train and test sets
# Features are every column from position 2 onward; the target is the 'Label' column.
# 30% of the rows are held out, with a fixed random_state so the split is repeatable.
# NOTE(review): if the rows are chronological, a shuffled split can place later rows in training — confirm this is intended.
X_train, X_test, Y_train, Y_test = train_test_split(dataset.iloc[:,2:],dataset['Label'], test_size=0.3, random_state=2003)

In [0]:
# Class balance of the training labels
Y_train.value_counts()

In [0]:
# (rows, columns) of the training features
X_train.shape

Train Data Preprocessing

In [0]:
#Combine all the top headlines of each row into one space-joined string per row
# A previously added 'Combined' column is skipped, so re-running this cell after
# that column exists does not fold the already-joined text back into itself.
headlines = [
    ' '.join(str(x) for x in row)
    for row in X_train.drop(columns=['Combined'], errors='ignore').values
]

In [0]:
#Add the list as column to the X_train dataframe
# assign() returns a new frame rather than writing into the object produced by
# train_test_split, which can trigger pandas' SettingWithCopyWarning.
X_train = X_train.assign(Combined=headlines)

In [0]:
# First two training rows, now including the 'Combined' column
X_train.head(2)

In [0]:
#Final train data frame: the combined headline text next to its label
train_data=pd.concat([X_train['Combined'],Y_train],axis=1)

In [0]:
# (rows, columns) of the combined-text / label frame
train_data.shape

In [0]:
# First two rows of the combined-text / label frame
train_data.head(2)

In [0]:
#Normalise the combined headlines (stopwords and punctuation removed,
#lemmatised, no stemming) and store the result as train_data['Cleaned']
temp_train_phrase = datacleaning(
    remove_stopwords=True,
    useStemming=False,
    useLemma=True,
    removePuncs=True,
    newdata=train_data['Combined'],
)
train_data['Cleaned'] = temp_train_phrase

In [0]:
# First two rows, now including the 'Cleaned' column
train_data.head(2)

In [0]:
# Class balance of the labels in train_data
train_data['Label'].value_counts()

In [0]:
#Training parameters initialization
# Vocabulary size: passed to the Tokenizer and used as the Embedding input dimension
max_features = 20000
# Length every token sequence is padded to
maxlen = 200

Feature Generation

In [0]:
# Turn the cleaned training text into sequences of integer token ids
from keras.preprocessing.text import Tokenizer
# num_words bounds the vocabulary (by token frequency) to max_features;
# it replaces the deprecated Keras 1 argument name `nb_words`.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_data['Cleaned'])
sequences_train = tokenizer.texts_to_sequences(train_data['Cleaned'])

In [0]:
#Padding the training data
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.optimizers import SGD,Adam,Nadam
from keras.layers.recurrent import LSTM, SimpleRNN, GRU
from keras.layers import SpatialDropout1D
# Bring every training token sequence to a common length of maxlen
X_train_pad = sequence.pad_sequences(sequences_train, maxlen=maxlen)

# One-hot encode the labels into 2 columns
Y_train_pad = np_utils.to_categorical(train_data['Label'], 2)
print('X_train shape:', X_train_pad.shape)


Model Training

In [0]:
from keras import backend as K
def recall_m(y_true, y_pred):
  """Recall metric: rounded true positives over rounded actual positives.

  K.epsilon() is added to the denominator to avoid dividing by zero.
  """
  hits = K.sum(K.round(K.clip(y_true*y_pred, 0, 1)))
  actual_positives = K.sum(K.round(K.clip(y_true, 0 ,1)))
  return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
  """Precision metric: rounded true positives over rounded predicted positives.

  K.epsilon() is added to the denominator to avoid dividing by zero.
  """
  hits = K.sum(K.round(K.clip(y_true*y_pred, 0, 1)))
  flagged_positives = K.sum(K.round(K.clip(y_pred, 0 ,1)))
  return hits / (flagged_positives + K.epsilon())

def f1_m(y_true, y_pred):
  """F1 metric: harmonic mean of precision_m and recall_m, epsilon-stabilised."""
  p = precision_m(y_true, y_pred)
  r = recall_m(y_true, y_pred)
  harmonic_half = (p*r)/(p+r+K.epsilon())
  return 2*(harmonic_half)

In [0]:
#opt = SGD(lr=0.001, momentum=0.5)
#opt = Adam(lr=0.0001)
# Embedding -> spatial dropout -> LSTM -> 2-way classifier
model = Sequential()
model.add(Embedding(max_features, 64))
model.add(SpatialDropout1D(0.5))
model.add(LSTM(64, dropout=0.5, recurrent_dropout=0.5))
# The targets are one-hot vectors over two mutually exclusive classes, so the
# head is a softmax trained with categorical cross-entropy. With two independent
# sigmoids and binary_crossentropy, Keras resolves 'accuracy' to element-wise
# binary accuracy, which does not measure how often the predicted class is right.
model.add(Dense(2))
model.add(Activation('softmax'))
#compile the model
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy',f1_m,precision_m,recall_m])
#Train the model, holding out the last 20% of the training rows for validation
modelhistory=model.fit(X_train_pad, Y_train_pad, batch_size=32, epochs=3,validation_split=0.2)


In [0]:
# Model Diagram

# Write the layer graph (with shapes and layer names) to a PNG on Drive,
# then display that same file inline.
from keras.utils.vis_utils import plot_model  
plot_model(model, to_file='/content/drive/My Drive/NLP/model_plot.png', show_shapes=True, show_layer_names=True) 
from IPython.display import Image
Image(retina=True, filename='/content/drive/My Drive/NLP/model_plot.png')


Model Test

In [0]:
# First two rows of the held-out features
X_test.head(2)

In [0]:
# (rows, columns) of the held-out features
X_test.shape

Test Data Preprocessing

In [0]:
#Combine the top headlines of each test row into one space-joined string per row
# A previously added 'Combined' column is skipped, so re-running this cell after
# that column exists does not fold the already-joined text back into itself.
headlines_test = [
    ' '.join(str(x) for x in row)
    for row in X_test.drop(columns=['Combined'], errors='ignore').values
]

In [0]:
#Adding the combined headlines as a column to the test data
# assign() returns a new frame rather than writing into the object produced by
# train_test_split, which can trigger pandas' SettingWithCopyWarning.
X_test = X_test.assign(Combined=headlines_test)

In [0]:
#New dataframe holding only the required data: combined headline text and its label
test_data=pd.concat([X_test['Combined'],Y_test],axis=1)

In [0]:
# First two rows of the combined-text / label test frame
test_data.head(2)

In [0]:
# Shape of X_test after the 'Combined' column was added
X_test.shape

In [0]:
# (rows, columns) of the test frame
test_data.shape

In [0]:
# Class balance of the test labels
test_data['Label'].value_counts()

In [0]:
#Normalise the combined test headlines (stopwords and punctuation removed,
#lemmatised, no stemming) and store the result as test_data['Cleaned']
temp_test_phrase = datacleaning(
    remove_stopwords=True,
    useStemming=False,
    useLemma=True,
    removePuncs=True,
    newdata=test_data['Combined'],
)
test_data['Cleaned'] = temp_test_phrase

In [0]:
# First two test rows, now including the 'Cleaned' column
test_data.head(2)

In [0]:
#Tokenise and pad the test data
# The tokenizer fitted on the training text is reused; it is not refitted here.
sequences_test = tokenizer.texts_to_sequences(test_data['Cleaned'])
X_test_pad = sequence.pad_sequences(sequences_test, maxlen=maxlen)
# One-hot encode the test labels into 2 columns
Y_test_pad = np_utils.to_categorical(test_data['Label'], 2)
print('X_test_pad shape:', X_test_pad.shape)


In [0]:
# Inspect the padded test sequences
X_test_pad

Model test and performance

In [0]:
# Print the metric names, then evaluate on the held-out test set; the cell
# output is whatever model.evaluate returns for those metrics.
print(model.metrics_names)
model.evaluate(X_test_pad,Y_test_pad,batch_size=32)

For Saving and Loading the model

In [0]:
# For saving the model, uncomment the below lines
#!apt-get install libhdf5-serial-dev
#import h5py
#model.save('/content/drive/My Drive/NLP/NLP_Project/Model/10_project_2_TT.h5')

In [0]:
# For loading the model, uncomment the below lines
#from keras.models import load_model
#!apt-get install libhdf5-serial-dev
#import h5py
#model = load_model('/content/drive/My Drive/NLP/NLP_Project/Model/10_project_2_TT.h5', custom_objects={'f1_m': f1_m,'precision_m':precision_m,'recall_m':recall_m})