In [12]:
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import string
import json
import os
import re

In [13]:
stop_words = stopwords.words('english')
stemmer = SnowballStemmer("english")

In [14]:
# load training data
with open("data/training_set.json", 'r') as f:
    train_datas = json.load(f)

In [15]:
# load testing data
with open("data/test_set.json", 'r') as f:
    test_datas = json.load(f)

In [16]:

def preprocess(datas, rm_stop_words=False):
    idx = 0
    neutral_idxs = []
    texts = []
    senti_class = []
    for data in datas:
        
        snippet = data["snippet"]
        
        if type(snippet) == list:
            snippet = ' '.join(snippet)
        
        elif snippet == [] or snippet == "": # some data have empty snippet
            snippet = data["tweet"]
            
        raw_snippet = snippet

        # remove strange token
        snippet = re.sub('https?://[A-Za-z0-9./]+', ' ', snippet)
        snippet = re.sub(u"(?:[^a-zA-Z0-9\+\-\$'#\/])", ' ', snippet)

        snippet = snippet.lower().split(' ')
    
        # monitor if empty
        if snippet == []:
            print raw_snippet
 

        # remove <num>, only leave +<num>, -<num>
        snippet = [re.sub("\d+.*", "<num>", word) for word in snippet]         
        snippet = [re.sub(r'\+<num>.*', '_pos_num', word) for word in snippet]
        snippet = [re.sub(r'\-<num>.*', '_neg_num', word) for word in snippet]
        snippet = [word for word in snippet if "<num>" not in word]
        
        # remove <company>  
        snippet = [word for word in snippet if word.startswith('$') == False]
                
        if rm_stop_words:
            snippet = [word for word in snippet if word not in stopwords.words('english')] # remove stopwords
        
        snippet = [stemmer.stem(word) for word in snippet] # stem words
        snippet = [word for word in snippet if word != '' ]
    
#         print raw_snippet
#         print snippet
        
#         print ' '.join(snippet)
        
#         print snippet
#         print raw_snippet
    
          # check empty snippet after preprocessing
        if snippet == []:
            print "->" + raw_snippet
  
        # ignore empty training data
        if snippet != []:
            texts.append(snippet)
            
            if data["sentiment"] < 0.0:
                senti_class.append(0)
            elif data["sentiment"] == 0.0:
                senti_class.append(1)
            elif data["sentiment"] > 0.0:
                senti_class.append(2)
            
            if data["sentiment"] == 0.0:
                neutral_idxs.append(idx)
                
            idx += 1
            
#             senti_scores.append(data["sentiment"])
        
    return texts, senti_class, neutral_idxs

In [17]:
def preprocess_pipe(_rm_stop_words):
    train_texts, y_train, train_neutral_idxs = preprocess(train_datas, rm_stop_words=_rm_stop_words)
    test_texts, y_test, test_neutral_idxs = preprocess(test_datas, rm_stop_words=_rm_stop_words)

    print "before preprocesing, len(train_datas): %d" % len(train_datas)
    print "after preprocesing, len(train_texts): %d" % len(train_texts)
    print "before preprocesing, len(test_datas): %d" % len(test_datas)
    print "after preprocesing, len(test_texts): %d" % len(test_texts)

    c = np.array(y_train)
    y_test = np.array(y_test)

    np.save("data/y_train", y_train)
    np.save("data/y_test", y_test)
    
    return train_texts, test_texts, y_train, y_test, train_neutral_idxs, test_neutral_idxs

In [18]:
train_texts, test_texts, y_train, y_test, train_neutral_idxs, test_neutral_idxs = preprocess_pipe(_rm_stop_words=False)

->$YOKU 0
->15%
before preprocesing, len(train_datas): 1396
after preprocesing, len(train_texts): 1394
before preprocesing, len(test_datas): 634
after preprocesing, len(test_texts): 634


In [19]:
train_neutral_idxs

[71, 126, 150, 227, 285, 311, 321, 396, 618, 619, 728, 734]

In [20]:
distribution = [0,0,0]
for s in y_train:
    distribution[s] += 1
print "training -> bullish:%d, neutral:%d, bearish:%d" % (distribution[0], distribution[1], distribution[2])

distribution = [0,0,0]
for s in y_test:
    distribution[s] += 1
print "testing -> bullish:%d, neutral:%d, bearish:%d" % (distribution[0], distribution[1], distribution[2])


training -> bullish:265, neutral:12, bearish:1117
testing -> bullish:221, neutral:12, bearish:401


In [21]:
# predict
from sklearn.feature_extraction.text import CountVectorizer
from mlp import predict
import tensorflow as tf

f1_scores = []
ngram_range = (1, 3)
max_features = 3200

vectorizer = CountVectorizer(max_features=max_features, ngram_range=ngram_range)

corpus = [' '.join(text) for text in train_texts]
x_train = vectorizer.fit_transform(corpus)
x_train = x_train.toarray()

corpus = [' '.join(text) for text in test_texts]
x_test = vectorizer.transform(corpus)
x_test = x_test.toarray()

np.save("data/x_train", x_train)
np.save("data/x_test", x_test)

    