In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from collections import defaultdict
import os
import sys
import re
import numpy as np
import preprocessor as p
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import operator
import pickle

# Configure the tweet `preprocessor` module (imported as `p`) with the URL,
# NUMBER, RESERVED and MENTION options; `p.clean` is later called on each tweet
# in load_sentences_from_df. Presumably these are the token classes p.clean
# strips — confirm against the tweet-preprocessor documentation.
p.set_options(p.OPT.URL, p.OPT.NUMBER, p.OPT.RESERVED, p.OPT.MENTION)

# Path of the pre-trained GloVe Twitter vectors read when building the
# embedding index.
# NOTE(review): the leading "." with no "/" names a hidden file in the current
# directory; this may be a typo for "./glove.twitter.27B.200d.txt" — confirm.
TWEET_EMBEDDING_PATH = ".glove.twitter.27B.200d.txt"

# Length every token sequence is padded/truncated to (passed to pad_sequences).
MAX_SEQUENCE_LENGTH = 40
# Vocabulary cap passed to the Keras Tokenizer and used to size the embedding matrix.
MAX_NUM_WORDS = 40000
# Width of each embedding vector; matches the "200d" GloVe file above.
EMBEDDING_DIM = 200


def load_sentences_from_df(train_df, id_field = 'id', sentiment_label='label', tweet_field='tweet', lower=True, clean=True):
    """
    Tokenizes labeled tweets and counts per-word document frequencies.

    Rows whose id has already been seen are skipped, so each tweet id
    contributes at most one sentence.

    :param train_df: pandas.DataFrame containing labeled tweets.
    :param id_field: name of the column holding the tweet id.
    :param sentiment_label: name of the column holding the sentiment; values
        must be one of 'negative', 'neutral', 'positive'.
    :param tweet_field: name of the column holding the tweet text.
    :param lower: lowercase the tweet before any other processing.
    :param clean: run p.clean (the module-level tweet preprocessor) and then
        drop non-ASCII characters and literal ``\\uXXXX`` escape sequences.
    :return: (sents, word_df, labels) where sents is a list of
        (list_of_words, label_id) tuples, word_df maps each word to the number
        of tweets it appears in, and labels lists the label names by label id.
    :raises KeyError: if a row carries a sentiment outside the three known labels.
    """
    lbl = {'negative':0,
           'neutral':1,
           'positive':2}
    sents = []
    seen_ids = set()
    word_df = defaultdict(int)
    for _, row in train_df.iterrows():
        tweet_id = row[id_field]
        if tweet_id in seen_ids:
            continue
        seen_ids.add(tweet_id)

        tweet = row[tweet_field]
        sentiment = row[sentiment_label]

        # The original fell back to an undefined name (`text`) when
        # lower=False, raising NameError; the untouched tweet is what is meant.
        clean_text = tweet.lower() if lower else tweet
        if clean:
            clean_text = p.clean(clean_text)
            # Strip non-ASCII characters first, then any literal "\uXXXX" escapes.
            clean_text = re.sub(r'(\\u[0-9A-Fa-f]+)',r'', re.sub(r'[^\x00-\x7f]',r'', clean_text))

        words = clean_text.split()
        # Document frequency: count each word once per tweet.
        for word in set(words):
            word_df[word] += 1
        sents.append((words, lbl[sentiment]))

    # Invert the label mapping into a list indexed by label id.
    labels = [0] * len(lbl)
    for name, idx in lbl.items():
        labels[idx] = name

    return sents, word_df, labels

def split(df, pct):
    """
    Cuts ``df`` into consecutive positional slices, one per fraction in ``pct``.

    Each slice holds ``int(len(df) * fraction)`` rows and starts where the
    previous one ended. Because sizes are truncated, rows left over at the
    end are not included in any slice.

    :param df: object supporting ``len`` and ``.iloc`` slicing (a DataFrame).
    :param pct: sequence of fractions, one per output slice.
    :return: list of slices, in the order of ``pct``.
    """
    pieces = []
    lower = 0
    for fraction in pct:
        upper = lower + int(len(df) * fraction)
        pieces.append(df.iloc[lower:upper])
        lower = upper
    return pieces

def df_from_tsv(path):
    """
    Reads a tab-separated file of labeled tweets into a DataFrame.

    Only lines with exactly three tab-separated fields are kept; any other
    line is silently skipped. All values are kept as strings.

    :param path: path of the TSV file to read.
    :return: pandas.DataFrame with columns ['id', 'label', 'tweet'].
    """
    data = []
    with open(path, "r") as f:
        for line in f:
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would eat a real character of the tweet when
            # the final line of the file has no trailing newline.
            rec = line.rstrip('\n').split('\t')
            if len(rec) == 3:
                data.append(rec)
    return pd.DataFrame(columns=['id', 'label', 'tweet'], data=data)

def createDatasets(df, labels, column, pct, shuffle=False):
    """
    Builds label-stratified partitions of ``df``.

    The rows of each label are cut with ``split`` according to ``pct``; the
    k-th partition returned is the concatenation of the k-th slice of every
    label, so each partition keeps roughly the label proportions of ``df``.

    :param df: pandas.DataFrame to partition.
    :param labels: label values to keep; rows with other values are dropped.
    :param column: name of the column holding the label.
    :param pct: sequence of fractions, one per output partition.
    :param shuffle: if True, reorder each partition's rows with a fixed-seed
        permutation of its index (deterministic across runs).
        NOTE(review): reordering goes through ``reindex``, which presumably
        requires unique index labels — confirm for frames built with pd.concat.
    :return: list with one DataFrame per entry of ``pct``.
    """
    # One list of slices per label: per_label_parts[label_pos][partition_pos].
    per_label_parts = [split(df[df[column] == label], pct) for label in labels]

    result = []
    # Iterate over partitions (len(pct)), not labels: the original looped over
    # len(labels) here and so only worked when both lengths happened to match.
    for part_idx in range(len(pct)):
        frames = [parts[part_idx] for parts in per_label_parts]
        # Single concat instead of repeated DataFrame.append (removed in
        # pandas 2.0 and quadratic in the number of frames).
        d = pd.concat(frames) if frames else pd.DataFrame()
        if shuffle:
            d = d.reindex(np.random.RandomState(seed=2).permutation(d.index))
        result.append(d)

    return result


Using TensorFlow backend.


# Load data

In [2]:
# Read the six training shards (twitter-train0.txt ... twitter-train5.txt)
# and stack them into a single frame.
train_paths = ["./data/train/twitter-train"+str(i)+".txt" for i in range(6)]
train_dfs = [df_from_tsv(shard_path) for shard_path in train_paths]
train_df = pd.concat(train_dfs)
print("Train with dup has " + str(len(train_df)) + " records")

# Collapse rows sharing a tweet id to one row each (first value per column).
no_dup = train_df.groupby(by=['id'], as_index=False).first()
n_removed = len(train_df) - len(no_dup)
print("Train with no dup has " + str(len(no_dup)) + " records, "+str(n_removed)+" less.")
no_dup.head()

Train with dup has 21826 records
Train with no dup has 21240 records, 586 less.


Unnamed: 0,id,label,tweet
0,100000794790727680,positive,One Night like In Vegas I make dat Nigga Famous
1,100000831528632320,positive,Walking through Chelsea at this time of day is...
2,100000950005145600,neutral,"""And on the very first play of the night, Aaro..."
3,100000974885748736,neutral,"""Drove the bike today, about 40 miles. Felt li..."
4,100001038454624257,negative,looking at the temp outside....hpw did it get ...


In [3]:
# createDatasets can carve the data into several partitions (e.g. test, train
# and validation). With fractions [0., 1., 0.] the whole dataset lands in the
# middle partition, which is used as the training set.
partition_fractions = [0.,1.,0.]
tweet_df = createDatasets(no_dup, ['positive','negative','neutral'], 'label', partition_fractions, shuffle=True)
train = tweet_df[1]

# Tokenize the training tweets and report corpus statistics.
train_sents, word_df, train_labels = load_sentences_from_df(train)
max_l = max(len(tokens) for tokens, _ in train_sents)
print('------- Train -------')
print( "number of sentences: %d" % len(train_sents))
print( "vocab size: %d" % len(word_df))
print( "max sentence length: %d" % max_l)

------- Train -------
number of sentences: 21240
vocab size: 48625
max sentence length: 33


# Build embedding index from GloVe Twitter 27B (200d)

In [4]:
# Build the embedding index, keeping only the words that occur in our tweets.
# The file is opened in text mode: in binary mode every token is a `bytes`
# object, which never matches the `str` keys of word_df (or of the tokenizer's
# word_index later on), so the index would silently stay empty.
embeddings_index = {}
with open(TWEET_EMBEDDING_PATH, mode='r', encoding='utf-8') as f:
    for line in f:
        # Split on single spaces only, so a token that is itself an unusual
        # whitespace character does not shift the vector columns.
        values = line.rstrip('\n').split(' ')
        word = values[0]
        if word in word_df: # keep the vector only for words seen in the tweets
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

In [6]:
# Unzip the (words, label_id) pairs into two parallel lists.
text = [pair[0] for pair in train_sents]
labls = [pair[1] for pair in train_sents]

print('Found %s tweets.' % len(train_sents))

Found 21240 tweets.


# Tokenization

In [6]:
# Fit a Keras tokenizer on the already-split tweets and expose its vocabulary.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, lower=True)
tokenizer.fit_on_texts(text)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Turn every tweet into a sequence of word ids padded to MAX_SEQUENCE_LENGTH.
sequences = tokenizer.texts_to_sequences(text)
train_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the integer label ids.
label_ids = np.asarray(labls)
train_labels = to_categorical(label_ids)
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', train_labels.shape)

Found 42130 unique tokens.
Shape of data tensor: (16991, 40)
Shape of label tensor: (16991, 3)


# Build embedding matrix for the lookup layer

In [7]:
# Keras Tokenizer word ids start at 1 (0 is reserved for padding), so the
# matrix needs len(word_index) + 1 rows to hold every id when the vocabulary
# is smaller than MAX_NUM_WORDS; without the +1 the last word had no row.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
high = 2.38 / np.sqrt(len(text) + EMBEDDING_DIM) # see (Bottou '88)
# Fixed seed so the random rows are identical on every Restart & Run All.
rng = np.random.RandomState(seed=2)
for word, i in word_index.items():
    if i >= num_words:
        # ids at or beyond the cap have no row in the matrix
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # word has a pre-trained GloVe vector: copy it in
        embedding_matrix[i] = embedding_vector
    else:
        # words not found in the embedding index get a small uniform random
        # vector in [-high, high); only row 0 (padding) stays all-zeros
        embedding_matrix[i] = rng.uniform(-high, high, EMBEDDING_DIM)


In [8]:
# Persist the artifacts for the training notebook. Each file is opened in a
# `with` block so the handle is flushed and closed deterministically; the
# original passed a bare open(...) to pickle.dump and never closed it.
with open('embedding_matrixG200', 'wb') as f:
    pickle.dump(obj = embedding_matrix, file = f)
with open('train_data', 'wb') as f:
    pickle.dump(obj = train_data, file = f)
with open('train_labels', 'wb') as f:
    pickle.dump(obj = train_labels, file = f)