# CNN

## Global Imports
Keras - deep learning library

In [41]:
import pandas as pd
import re
import csv
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.layers import Dense, Embedding, Conv1D, Flatten
from keras import layers
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dropout
import datetime

## Importing dataset
Imports a certain number of lines from the dataset and adds the sentiment column, according to the number of stars given in the review.

In [30]:
filename = 'y2.csv'
# Importing 30000 lines from the CSV
data = pd.read_csv(filename, error_bad_lines=False)[:30000]
# Remove unnecessary columns
data = data.drop(['user_id','business_id','date','funny','cool','useful'],axis=1) 
# Adds the sentiment column
data['sentiment'] = ['pos' if (x > 3) else 'neutral' if (x == 3) else 'neg' for x in data['stars']]
data.head()

Unnamed: 0,review_id,stars,text,sentiment
0,vkVSCC7xljjrAI4UGfnKEQ,5,Super simple place but amazing nonetheless. It...,pos
1,n6QzIUObkYshz4dz2QRJTw,5,Small unassuming place that changes their menu...,pos
2,MV3CcKScW05u5LVfF6ok0g,5,Lester's is located in a beautiful neighborhoo...,pos
3,IXvOzsEMYtiJI0CARmj77Q,4,Love coming here. Yes the place always needs t...,pos
4,L_9BTb55X0GDtThi6GlZ6w,4,Had their chocolate almond croissant and it wa...,pos


## Pre-processing
Remove special characters and tokenize the text, to break the sentences into tokens, without pontuaction or spacing.
The tokenizer is a class from Keras that prepares text for deep learning.

**Arguments:**
- **num_words:** the maximum number of words to keep, based on word frequency. Only the most common num_words words will be kept.
- **lower:** boolean. Whether to convert the texts to lowercase.
- **split:** str. Separator for word splitting.

**Outputs:**
- **word_counts:** A dictionary of words and their counts.
- **word_docs:** A dictionary of words and how many documents each appeared in.
- **word_index:** A dictionary of words and their uniquely assigned integers.
- **document_count:** An integer count of the total number of documents that were used to fit the Tokenizer.

In [36]:
# Remove special characters
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

# Tokenizes the text (2500 words, turn lowercase and split on spacing)
tokenizer = Tokenizer(num_words=2500, lower=True,split=' ')

tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X) # Ensures that all sequences have the same length

# Summarize what was learned
print(tokenizer.word_counts)



## CNN Model
Model with:
- 1 Embedding Layer (Turns positive integers (indexes) into dense vectors of fixed size)
- 1-Dimensional CNN Layer with 1-Dimension
- 1 Flatten Layer
- 1 Dense Layer with ReLu activation (Rectified Linear Unit) [0..∞)
- 1 Dense Layer with sigmoid activation [0..1]

In [None]:
batch_size= 32 # Iterate the training data in batches of size 32
input_dim = X.shape[1]  # Size of the vocabulary

embedding_dim = 100 # Size of the output of the embedding

model = Sequential()
model.add(Embedding(2500, embedding_dim, input_length=input_dim)) # Embedding layer
model.add(Conv1D(128, 5, activation='relu')) # 1-Dimensional CNN layer
model.add(Flatten()) # Flattens the input, without this layer, it can't connects to the dense layer
model.add(Dense(10, activation='relu')) # Dense layer
model.add(Dense(3, activation='sigmoid')) # Dense layer
model.compile(optimizer='adam', # required argument
              loss='binary_crossentropy', #required argument
              metrics=['accuracy']) # judge the performance of the model
model.summary()

## Training
80% of the samples used for training and 20% used for testing. Running with only 1 epoch.

In [34]:
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20)

# Trains the model for a given number of epochs (iterations on a dataset).
model.fit(X_train, Y_train, batch_size = batch_size, epochs = 1, verbose = 1, validation_split=0.1)

# Measuring score and accuracy on validation set
score,acc = model.evaluate(X_test, Y_test, verbose = 0, batch_size = batch_size)
print("Loss score: %.2f" % (score))
print("Test Accuracy: %.2f" % (acc*100))

Train on 21600 samples, validate on 2400 samples
Epoch 1/1
Loss score: 0.33
Test Accuracy: 86.99


## Prediction
Predicting all the training data, that is 20% of the input

In [44]:
start = datetime.datetime.now()
classes = model.predict(X_test, batch_size=32) # Prediction
end = datetime.datetime.now()
print(end - start)
print(classes)

0:00:02.571298
[[  9.95109258e-06   4.27116221e-03   9.98824120e-01]
 [  3.17367911e-01   2.71065980e-01   4.50607181e-01]
 [  3.13119620e-01   2.69455105e-01   4.53755111e-01]
 ..., 
 [  1.16437860e-03   3.00427768e-02   9.75125313e-01]
 [  1.88625939e-02   9.01869014e-02   8.65160406e-01]
 [  7.68696191e-04   2.54035573e-02   9.80868578e-01]]
