In [128]:
import sys
import os
import numpy as np
import tensorflow as tf
sys.path.append('../src/')

In [22]:
from preprocessors.pretrained_embeddings import Pretrained


In [24]:
# Build the project's pretrained-embedding helper and populate it from the GloVe Twitter file.
# NOTE(review): 200 is presumably the embedding dimension (the file is the 200d variant) and
# 'glove27b' a label for this embedding set — confirm against Pretrained's signature.
pre_embeds = Pretrained(200, 'glove27b')
pre_embeds.create_from_file('/embeddings/glove.twitter.27B.200d.txt')

In [27]:
# Spot-check the loaded embeddings: show the first 50 components stored for the token 'pippa'.
# NOTE(review): if embeddings_index is a plain dict, .get() returns None for an unknown token
# and the slice would then raise TypeError — confirm the token exists before relying on this.
pre_embeds.embeddings_index.get('pippa')[:50]

array([-0.17313 , -0.083534,  0.094943,  0.27862 , -0.09849 , -0.64505 ,
       -0.034571, -0.033253, -0.14127 ,  0.77595 , -0.50909 ,  0.48752 ,
       -0.11292 ,  0.12668 ,  0.43857 ,  0.25239 ,  0.034386, -0.19116 ,
        0.1735  , -0.081495,  0.15889 , -0.46626 , -0.036711,  0.30366 ,
        0.26433 ,  0.59701 ,  0.45844 , -0.24775 , -0.35584 ,  0.037122,
        0.18221 ,  0.46557 , -0.44061 , -0.17489 ,  0.10444 ,  0.42832 ,
        0.37302 , -0.017115, -0.33439 , -0.071264,  0.77562 , -0.6526  ,
        0.47253 ,  0.32325 ,  0.095131,  0.13935 ,  0.074671,  0.31263 ,
       -0.53981 ,  0.041234], dtype=float32)

In [115]:
# List the files available in the data directory.
os.listdir('/data')

['joy_train.txt', 'joy_test.txt']

In [76]:
# Parse the tab-separated training file into two parallel lists:
# `inputs` holds the 'Tweet' column and `labels` the 'Intensity Class' column.
inputs = []
labels = []
# Decode explicitly as UTF-8: the tweets contain non-ASCII characters and the
# default encoding used by open() depends on the platform/locale.
with open('/data/joy_train.txt', encoding='utf-8') as f:
    contents = f.read()
# The file is fully read at this point, so parsing happens after it is closed.
lines = contents.split("\n")
# Skip empty lines instead of blindly discarding the final element: slicing with
# `[1:-1]` lost the last record whenever the file did not end with a newline, and
# a blank line inside the file produced a one-field row that failed on indexing.
data = [row.split('\t') for row in lines if row != '']
header, data = data[0], data[1:]
input_index, label_index = header.index('Tweet'), header.index('Intensity Class')
for row in data:
    inputs.append(row[input_index])
    labels.append(row[label_index])
    

In [77]:
# Display the tweet texts collected by the parsing cell above.
# NOTE(review): this renders every element of the list; a slice such as inputs[:10]
# would keep the saved output short.
inputs

['@david_garrett Quite saddened.....no US dates, no joyous anticipation of attending a DG concert (since 2014). Happy you are keeping busy.',
 "2 days until #GoPackGo and 23 days until #GoGipeGo..... I'm so excited! ",
 'Positive #psychology research shows salespeople who score in the top 10% for #optimism have 88% &gt; sales than those in top 10% for pessimism.',
 'As the birds chirp and the cows moo we need to listen to the sound of nature to ensure that all is well.',
 'Howling with laughter at “WELL DONE BEZZA!” #bakeoff #GBBO',
 '@StephaliciousD afternoon delight',
 '@UKLittleKitchen Defo a hearty root veg gratin. Nice comfort food as Autumn kicks in',
 'As I remember and reflect on Reginald Denny on a night much like this (like #Charlotte) I think to myself, the gun lobby must be rejoicing.',
 "@hesham786 that's the spirit #optimism",
 '@simmy_hanley @Schrise also a ',
 '@scottlocker220 Thank you for that, the notification came in the nick of time to cheer me up - have to re-twee

In [131]:
class TextData:
    """Load labelled text from delimited files and expose it as tf.data pipelines.

    The ``*_csv`` loaders append to plain Python lists (``train_inputs``,
    ``train_labels``, ``test_inputs``, ``test_labels``). Calling a loader more
    than once therefore accumulates rows rather than replacing them.
    ``create_training_tensors`` converts the lists into batched datasets.
    """

    def __init__(self, batch_size=64, buffer_size=10000):
        """Store the pipeline settings and start with empty train/test lists.

        Args:
            batch_size: Number of examples per batch in both datasets.
            buffer_size: Shuffle buffer size; applied to the training dataset only.
        """
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.train_inputs = []
        self.train_labels = []
        self.test_inputs = []
        self.test_labels = []

    def load_from_csv(self, path, delim, has_header, text_col, label_col, array, label,
                      encoding='utf-8'):
        """Read a delimited text file and append its text/label columns to two lists.

        Lines are split on ``"\\n"`` and empty lines are ignored. Fields are split
        with ``str.split(delim)``; no quoting or escaping rules are applied.

        Args:
            path: File to read.
            delim: Field separator passed to ``str.split``.
            has_header: When true, the first non-empty line is a header and
                ``text_col`` / ``label_col`` are column names looked up in it.
                Otherwise they are used directly as positional indices.
            text_col: Header name (or index) of the text column.
            label_col: Header name (or index) of the label column.
            array: List that receives the text values (mutated in place).
            label: List that receives the label values (mutated in place).
            encoding: Text encoding used to decode the file. Defaults to UTF-8 so
                the result does not depend on the platform's default encoding.

        Raises:
            ValueError: If ``has_header`` is true and a column name is not in the header.
            IndexError: If the file has no rows while a header is expected, or a
                row has fewer fields than the selected columns require.
        """
        with open(path, encoding=encoding) as f:
            contents = f.read()

        rows = [line.split(delim) for line in contents.split("\n") if line != '']

        if has_header:
            header, rows = rows[0], rows[1:]
            input_index, label_index = header.index(text_col), header.index(label_col)
        else:
            input_index, label_index = text_col, label_col

        # Extract every pair before touching the output lists. Appending row by
        # row would leave `array` one element longer than `label` if a short row
        # raised IndexError half-way through, silently misaligning texts and labels.
        pairs = [(row[input_index], row[label_index]) for row in rows]
        array.extend(text for text, _ in pairs)
        label.extend(target for _, target in pairs)

    def train_data_csv(self, path, delim, has_header, text_col, label_col, encoding='utf-8'):
        """Append the rows of ``path`` to the training lists (see ``load_from_csv``)."""
        self.load_from_csv(path, delim, has_header, text_col, label_col,
                           self.train_inputs, self.train_labels, encoding=encoding)

    def test_data_csv(self, path, delim, has_header, text_col, label_col, encoding='utf-8'):
        """Append the rows of ``path`` to the test lists (see ``load_from_csv``)."""
        self.load_from_csv(path, delim, has_header, text_col, label_col,
                           self.test_inputs, self.test_labels, encoding=encoding)

    def create_training_tensors(self):
        """Build ``self.training_data`` and ``self.testing_data`` from the loaded lists.

        The training dataset is shuffled with ``buffer_size``, batched and
        prefetched; the test dataset is batched and prefetched without shuffling.
        Values are passed through exactly as loaded, so labels read by the
        loaders remain strings.
        """
        train_slices = tf.data.Dataset.from_tensor_slices((self.train_inputs, self.train_labels))
        self.training_data = (
            train_slices
            .shuffle(self.buffer_size)
            .batch(self.batch_size)
            .prefetch(tf.data.AUTOTUNE)
        )

        test_slices = tf.data.Dataset.from_tensor_slices((self.test_inputs, self.test_labels))
        self.testing_data = test_slices.batch(self.batch_size).prefetch(tf.data.AUTOTUNE)
        

In [132]:
# Both calls use identical parsing options, so they are declared once and unpacked.
csv_options = dict(
    delim='\t',
    has_header=True,
    text_col='Tweet',
    label_col='Intensity Class',
)
text = TextData()
text.train_data_csv('/data/joy_train.txt', **csv_options)
text.test_data_csv('/data/joy_test.txt', **csv_options)

In [133]:
# Report how many examples each split contains.
n_train_samples = len(text.train_inputs)
n_test_samples = len(text.test_inputs)
print("number of training samples: {}".format(n_train_samples))
print("number of testing samples:  {}".format(n_test_samples))

number of training samples: 1616
number of testing samples:  290


In [122]:
# Scratch check: np.char.encode turns a list of str into a NumPy array of bytes.
# NOTE(review): numpy is already imported in the first cell, so this import is redundant.
import numpy as np
np.char.encode(['james is aweomse', 'bain'])

array([b'james is aweomse', b'bain'], dtype='|S16')

In [134]:
# Build the batched tf.data pipelines from the loaded lists, then display the
# training dataset so its element structure can be inspected.
text.create_training_tensors()
text.training_data

<PrefetchDataset shapes: ((None,), (None,)), types: (tf.string, tf.string)>

In [136]:
# Pull a single batch from the training pipeline and print its first three texts.
# `label` is unpacked from each (text, label) element but is not used here.
for example, label in text.training_data.take(1):
    print('texts:', example.numpy()[:3])

texts: [b"Ahh when my voice gets better I've been asked to play :D id be more than #happy to do that"
 b'Getting my comedic relief w/ @SofiaVergara during season premiere of #ModernFamily. Just what a girl needs! '
 b"Take my kindness for weakness when you acting silly keeping it 100 ain't your fort\xc3\xa9  #ChrisBrown #TeamBreezy"]
