# Notebook 2: CNN

## Overview: 

1. Begin by importing the required libraries and loading the word embeddings and word-to-index mappings we created in [Notebook 1: Embed Words](Notebook_1_Embed_Words.ipynb).


In [1]:
import cPickle as pickle
from tensorflow.models.rnn.rnn_cell import BasicLSTMCell, LSTMCell 
import itertools
from collections import Counter
import csv
import re
import numpy as np
import string

In [2]:
# Placeholders so the names are bound before the files are read
embeddings, mappings, rows = None, None, None

# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a trusted source.
with open("word_embeddings.pkl", "rb") as embeddings_file:
    embeddings = pickle.load(embeddings_file)

with open("word_mappings.pkl", "rb") as mappings_file:
    mappings = pickle.load(mappings_file)
    

# Tokens that start with "<scheme>://<host>..." are treated as URLs.
# Raw string: the pattern is full of regex escapes (\w, \d, \s, \/, \.) that
# are not valid Python string escapes, so a plain literal triggers
# invalid-escape warnings on newer interpreters. The pattern text is unchanged.
urlFinder = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')

# Matches an @-mention anywhere inside a token; group 1 is the bare name.
atNameFinder = re.compile(r'@([A-Za-z0-9_]+)')

# Running count of @-mentions seen so far; used to number the placeholders.
atNameCounter = 0

# Characters removed from ordinary (non-URL, non-mention) tokens.
# NOTE(review): under Python 2 the curly-quote entries are multi-byte str
# values, so a per-character membership test on a byte string would never
# match them -- confirm whether curly quotes are meant to be stripped.
exclude_punc = {
    "!",
    "?",
    ".",
    ",",
    ":",
    ";",
    "'",
    "\"",
    "“",
    "’",
    "-",
}

# Accumulators filled while reading the CSV
sentences = []  # one token list per row
labels = []     # one-hot label per row: [0, 1] for "example", otherwise [1, 0]
x = []
y = []
_y = []         # integer label per row: 1 for "example", otherwise 0

with open('data.csv', 'rb') as f:
    reader = csv.reader(f, delimiter=',')

    for row in reader:
        # Column 1 holds the text: trim it and drop the HTML entities
        text = row[1].strip()
        for entity in ("&amp;", "&gt;", "&lt;"):
            text = text.replace(entity, "")

        words = []
        for word in text.lower().split():
            if urlFinder.match(word):
                # Every URL collapses to one shared placeholder
                token = "<URL/>"
            elif atNameFinder.search(word):
                # Each @-mention gets its own numbered placeholder
                token = "<AT_NAME_%s/>" % atNameCounter
                atNameCounter += 1
            else:
                # Ordinary token: strip the excluded punctuation characters
                token = ''.join(ch for ch in word if ch not in exclude_punc)
            words.append(token)

        # Column 0 holds the class name
        is_example = row[0] == "example"
        sentences.append(words)
        labels.append([0, 1] if is_example else [1, 0])
        _y.append(1 if is_example else 0)


# Pad every sentence on the right with "<PAD/>" up to the longest sentence
sequence_length = max(len(tokens) for tokens in sentences)
padded_sentences = [
    tokens + ["<PAD/>"] * (sequence_length - len(tokens))
    for tokens in sentences
]
    
 
# Count every token (padding included) across all padded sentences
word_counts = Counter(itertools.chain.from_iterable(padded_sentences))

# Mapping from index to word, most frequent word first
vocabulary_inv = [entry[0] for entry in word_counts.most_common()]
# Mapping from word to index
vocabulary = {token: position for position, token in enumerate(vocabulary_inv)}

# Encode each padded sentence as a row of vocabulary indices
x = np.array([
    [vocabulary[token] for token in padded]
    for padded in padded_sentences
])
y = np.array(labels)


In [3]:
import numpy as np

# Original Hold-out
#
# Shuffle once with a fixed seed, then cut the shuffled data at a single
# index so the train and dev sets are disjoint and together cover every row.

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
# Train takes the first n//2 - 1 shuffled rows and dev takes the rest.
# `//` keeps the index an integer on both Python 2 and Python 3.
split_index = len(y_shuffled) // 2 - 1
x_train, x_dev = x_shuffled[:split_index], x_shuffled[split_index:]
y_train, y_dev = y_shuffled[:split_index], y_shuffled[split_index:]

# Positive rows are one-hot [0, 1]; negative rows are [1, 0]
train_pos = len([a for a in y_train if a[0] == 0 and a[1] == 1])
dev_pos = len([a for a in y_dev if a[0] == 0 and a[1] == 1])
train_neg = len([a for a in y_train if a[0] == 1 and a[1] == 0])
dev_neg = len([a for a in y_dev if a[0] == 1 and a[1] == 0])

print("Vocabulary Size: {:d}".format(len(vocabulary)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
print("Train Pos/Dev Pos Split {:d}/{:d}".format(train_pos, dev_pos))
print("Train Neg/Dev Neg Split {:d}/{:d}".format(train_neg, dev_neg))

    


Vocabulary Size: 11555
Train/Dev split: 2429/2432
Train Pos/Dev Pos Split 347/359
Train Neg/Dev Neg Split 2082/2073


In [4]:
from sklearn.cross_validation import StratifiedKFold

num_folds = 2
skf = StratifiedKFold(_y, n_folds=num_folds)

# Walk the folds in order; each item is a (train indices, dev indices) pair
for fold, idx in enumerate(skf):
    train_idx, dev_idx = idx[0], idx[1]
    x_train, y_train = x[train_idx], y[train_idx]
    x_dev, y_dev = x[dev_idx], y[dev_idx]

    # Positive rows are one-hot [0, 1]; negative rows are [1, 0]
    train_pos = sum(1 for a in y_train if a[0] == 0 and a[1] == 1)
    dev_pos = sum(1 for a in y_dev if a[0] == 0 and a[1] == 1)
    train_neg = sum(1 for a in y_train if a[0] == 1 and a[1] == 0)
    dev_neg = sum(1 for a in y_dev if a[0] == 1 and a[1] == 0)

    print("Fold #%s" % fold)
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    print("Train Pos/Dev Pos Split {:d}/{:d}".format(train_pos, dev_pos))
    print("Train Neg/Dev Neg Split {:d}/{:d}".format(train_neg, dev_neg))
    

Fold #0
Train/Dev split: 2430/2431
Train Pos/Dev Pos Split 353/353
Train Neg/Dev Neg Split 2077/2078
Fold #1
Train/Dev split: 2431/2430
Train Pos/Dev Pos Split 353/353
Train Neg/Dev Neg Split 2078/2077


In [5]:
from sklearn.cross_validation import StratifiedShuffleSplit

# Single stratified 50/50 split with a fixed seed
sss = StratifiedShuffleSplit(_y, 1, test_size=0.5, random_state=0)
for train, test in sss:
    # Shuffle the row indices once per subset and use the SAME order for the
    # features and the labels. Permuting x and y independently would pair
    # each sentence with a label belonging to some other sentence.
    shuffled_train = np.random.permutation(train)
    x_train = x[shuffled_train]
    y_train = y[shuffled_train]

    shuffled_test = np.random.permutation(test)
    x_dev = x[shuffled_test]
    y_dev = y[shuffled_test]

# Positive rows are one-hot [0, 1]; negative rows are [1, 0]
train_pos = len([a for a in y_train if a[0] == 0 and a[1] == 1])
dev_pos = len([a for a in y_dev if a[0] == 0 and a[1] == 1])
train_neg = len([a for a in y_train if a[0] == 1 and a[1] == 0])
dev_neg = len([a for a in y_dev if a[0] == 1 and a[1] == 0])

print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
print("Train Pos/Dev Pos Split {:d}/{:d}".format(train_pos, dev_pos))
print("Train Neg/Dev Neg Split {:d}/{:d}".format(train_neg, dev_neg))



Train/Dev split: 2430/2431
Train Pos/Dev Pos Split 353/353
Train Neg/Dev Neg Split 2077/2078
