In [19]:
# Pandas implementation
import pandas as pd
import numpy as np
import time
import pickle
from collections import Counter
import itertools
import re
import warnings

# Dask implementation
from numba import njit
from dask import compute, delayed
import dask.dataframe as dd
import datashader as ds
from dask.distributed import Client

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
import gensim

# Silences every warning category for the rest of the session (no category/module filter is given),
# so deprecation notices raised by the libraries used below will not be displayed.
warnings.filterwarnings('ignore')

In [20]:
# global variables
DATASET_ENCODING = "ISO-8859-1"
# Column names supplied to read_csv because the CSV is loaded without a header row.
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
# Pattern of text to strip from tweets. Alternatives, in order: @mentions, http(s) links,
# a `http?:\S` branch, and any run of non-alphanumeric characters.
# Written as a raw string so `\S` is passed to the regex engine verbatim instead of relying on
# Python keeping an unrecognised string escape (which emits an invalid-escape warning).
# NOTE(review): the `http?:\S` branch (optional "p", a single trailing character) looks like a
# typo of the preceding link pattern; kept unchanged to preserve the cleaning behaviour.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# KERAS
# Length every tokenised tweet is padded/truncated to, and the embedding layer's input length.
SEQUENCE_LENGTH = 300
# Presumably the training-loop settings; not used in the cells shown here — confirm.
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
# Presumably (lower, upper) score cut-offs for labelling predictions — confirm against the caller.
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# EXPORT
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

# NOTE(review): machine-specific absolute path; point this at the local copy of the dataset.
path = '/Users/chrislouie/Documents/Python/random/training.1600000.processed.noemoticon.csv'

In [12]:
# Fetch the NLTK 'stopwords' corpus, which the later `stopwords.words('english')` call reads.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chrislouie/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Pandas + Word2Vec

### TODO: %%time every cell in our preprocessing pipeline all the way to training


In [7]:
%%time

# Load the raw tweet CSV; column names are passed explicitly via DATASET_COLUMNS.
df = pd.read_csv(
    path,
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)

CPU times: user 3.3 s, sys: 264 ms, total: 3.57 s
Wall time: 3.58 s


In [8]:
# Load the raw tweet CSV (column names passed explicitly) and preview the first rows.
df = pd.read_csv(
    path,
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [9]:
# Numeric target code -> sentiment name.
decode_map = dict(zip((0, 2, 4), ("NEGATIVE", "NEUTRAL", "POSITIVE")))


def decode_sentiment(label):
    """Translate a numeric target code into its sentiment name.

    `label` is coerced with `int()` before the lookup, so numeric strings and
    floats are accepted. A code missing from `decode_map` raises `KeyError`.
    """
    code = int(label)
    return decode_map[code]

In [10]:
%%time
# Replace the numeric target codes with their sentiment names, element by element.
df["target"] = df["target"].map(decode_sentiment)

CPU times: user 495 ms, sys: 8.6 ms, total: 503 ms
Wall time: 510 ms


In [13]:
# English stop-word list; preprocess() drops any token found in it.
stop_words = stopwords.words('english')
# English Snowball stemmer; preprocess() applies it only when called with stem=True.
stemmer = SnowballStemmer('english')

In [14]:
def preprocess(text, stem=False):
    """Clean one tweet and return it as a space-joined string of tokens.

    The input is converted to `str`, lower-cased, and every match of
    TEXT_CLEANING_RE is replaced by a space. Tokens found in `stop_words`
    are dropped; when `stem` is true the surviving tokens are stemmed
    with `stemmer`.
    """
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (word for word in cleaned.split() if word not in stop_words)
    if stem:
        kept = (stemmer.stem(word) for word in kept)
    return " ".join(kept)

In [15]:
%%time
# Clean every tweet with preprocess() using its default (no stemming).
df["text"] = df["text"].map(preprocess)

CPU times: user 43.2 s, sys: 233 ms, total: 43.5 s
Wall time: 43.6 s


In [16]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
df_train, df_test = train_test_split(df, random_state=42, test_size=0.25)
for caption, subset in (("TRAIN size:", df_train), ("TEST size:", df_test)):
    print(caption, len(subset))

TRAIN size: 1200000
TEST size: 400000


In [23]:
%%time

# word2vec

# One whitespace-split token list per training tweet.
documents = [tweet.split() for tweet in df_train["text"]]

CPU times: user 2.23 s, sys: 200 ms, total: 2.43 s
Wall time: 2.43 s


In [24]:
# One whitespace-split token list per training tweet.
documents = [tweet.split() for tweet in df_train["text"]]

In [28]:
# Word2Vec vars

W2V_SIZE = 300       # passed as `size`; also the column count of the embedding matrix
W2V_WINDOW = 7       # passed as `window`
W2V_EPOCH = 32       # epochs handed to w2v_model.train()
W2V_MIN_COUNT = 10   # passed as `min_count`

# Untrained model: the vocabulary is built and training is run in the following cells.
w2v_model = gensim.models.word2vec.Word2Vec(
    min_count=W2V_MIN_COUNT,
    window=W2V_WINDOW,
    size=W2V_SIZE,
)

In [29]:
# Build the Word2Vec vocabulary from the tokenised training tweets before training.
w2v_model.build_vocab(documents)

In [30]:
%%time
# Train on the tokenised training tweets for W2V_EPOCH epochs; the returned tuple is the cell's output.
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

CPU times: user 12min 31s, sys: 4.24 s, total: 12min 35s
Wall time: 4min 10s


(246357451, 276847840)

In [33]:
# Query the word vectors through `.wv` (as the embedding-matrix cell does); calling
# most_similar directly on the model is deprecated in gensim and removed in later releases.
w2v_model.wv.most_similar("tesla")

[('logitech', 0.3429398536682129),
 ('bjp', 0.3017113208770752),
 ('chevy', 0.3003014922142029),
 ('components', 0.2916417717933655),
 ('scheme', 0.2903280258178711),
 ('2008', 0.28540724515914917),
 ('bmw', 0.28179261088371277),
 ('customization', 0.2806711494922638),
 ('ibm', 0.2800779938697815),
 ('installation', 0.2799241542816162)]

In [35]:
# Query the word vectors through `.wv` (as the embedding-matrix cell does); calling
# most_similar directly on the model is deprecated in gensim and removed in later releases.
w2v_model.wv.most_similar("facebook")

[('fb', 0.5869807004928589),
 ('twitter', 0.5807549357414246),
 ('myspace', 0.5275101065635681),
 ('hotmail', 0.4754437804222107),
 ('profiles', 0.47502514719963074),
 ('friendster', 0.46950381994247437),
 ('bebo', 0.4628342390060425),
 ('orkut', 0.4625546336174011),
 ('flickr', 0.4566538333892822),
 ('lj', 0.45159757137298584)]

In [38]:
%%time
# Fit a fresh Keras tokenizer on the cleaned training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train["text"])

# One more than the number of distinct words
# (presumably so index 0 stays free for padding — confirm).
vocab_size = 1 + len(tokenizer.word_index)
print("Total words", vocab_size)

Total words 278491
CPU times: user 17.1 s, sys: 120 ms, total: 17.2 s
Wall time: 17.2 s


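Note: for some reason, when the `%%time` cell magic is used in the same cell as a variable assignment, the assigned variable does not seem to be kept in the kernel's memory afterwards (this may be specific to the IPython version in use — unless it is being kept and I'm doing something wrong). As a workaround, each timed cell is followed by an identical cell without `%%time`.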

In [39]:
# Fit a fresh Keras tokenizer on the cleaned training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train["text"])

# One more than the number of distinct words
# (presumably so index 0 stays free for padding — confirm).
vocab_size = 1 + len(tokenizer.word_index)
print("Total words", vocab_size)

Total words 278491


In [40]:
%%time
# Convert each split's tweets to integer sequences and pad them to SEQUENCE_LENGTH
# (train first, then test).
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(split.text), maxlen=SEQUENCE_LENGTH)
    for split in (df_train, df_test)
)

CPU times: user 25.2 s, sys: 985 ms, total: 26.2 s
Wall time: 26.6 s


In [41]:
# Convert each split's tweets to integer sequences and pad them to SEQUENCE_LENGTH
# (train first, then test).
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(split.text), maxlen=SEQUENCE_LENGTH)
    for split in (df_train, df_test)
)

In [42]:
# Distinct sentiment names seen in the training targets, with NEUTRAL appended at the end.
labels = [*df_train.target.unique().tolist(), NEUTRAL]
labels

['NEGATIVE', 'POSITIVE', 'NEUTRAL']

In [43]:
# Fit the label encoder on the training targets only (fit() returns the encoder itself).
encoder = LabelEncoder().fit(df_train.target.tolist())

# Encode both splits and reshape each result into a single column.
y_train = encoder.transform(df_train.target.tolist()).reshape(-1, 1)
y_test = encoder.transform(df_test.target.tolist()).reshape(-1, 1)

for caption, encoded in (("y_train", y_train), ("y_test", y_test)):
    print(caption, encoded.shape)

y_train (1200000, 1)
y_test (400000, 1)


In [44]:
# One row per tokenizer index; rows stay all-zero for words the Word2Vec model does not contain.
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))

for token, row in tokenizer.word_index.items():
    if token not in w2v_model.wv:
        continue
    embedding_matrix[row] = w2v_model.wv[token]

print(embedding_matrix.shape)

(278491, 300)


In [46]:
# Non-trainable embedding layer whose weights are the Word2Vec-derived embedding matrix.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=W2V_SIZE,
    weights=[embedding_matrix],
    input_length=SEQUENCE_LENGTH,
    trainable=False,
)

### At this point the preprocessing is done 
### How do we know preprocessing is ready?
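- The train and test sets are encoded and padded to `SEQUENCE_LENGTH`
- We have an embedding matrix: row *i* holds the Word2Vec vector of the word with tokenizer index *i* (rows stay all-zero for words missing from the Word2Vec vocabulary)
- An embedding layer has been created using the embedding matrix as its (non-trainable) weights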

# Dask + Keras

### TODO: Dask pipeline below 

In [22]:
%%time
# The CSV has no header row, so pass the column names explicitly — as the pandas load of the
# same file does. Without `names`, the first tweet would be consumed as the header.
# NOTE(review): this rebinds `df`, replacing the pandas DataFrame created earlier.
df = dd.read_csv(path, encoding=DATASET_ENCODING, names=DATASET_COLUMNS)

CPU times: user 10.8 ms, sys: 4.36 ms, total: 15.2 ms
Wall time: 14 ms


In [24]:
# The CSV has no header row, so pass the column names explicitly — as the pandas load of the
# same file does. Without `names`, the first tweet would be consumed as the header.
# NOTE(review): this rebinds `df`, replacing the pandas DataFrame created earlier.
df = dd.read_csv(path, encoding=DATASET_ENCODING, names=DATASET_COLUMNS)