In [1]:
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import string
import json
import os
import re

In [2]:
# English Snowball stemmer used when normalising tokens.
stemmer = SnowballStemmer('english')
# English stop-word list taken from the NLTK stopwords corpus.
stop_words = stopwords.words("english")

In [4]:
# Read the training and test records from their JSON files.
with open("data/training_set.json") as train_file:
    train_datas = json.load(train_file)

with open("data/test_set.json") as test_file:
    test_datas = json.load(test_file)

In [5]:
def preprocess(datas, rm_stop_words=False):
    """Tokenise, normalise and stem the text of every record in ``datas``.

    Each record is a dict read by key: ``"snippet"`` (a string or a list of
    strings), ``"sentiment"`` (the label, returned unchanged) and ``"tweet"``
    (used as the text when the snippet is empty).

    Steps applied to each text, in order:
      1. URLs and every character other than ASCII letters, digits and
         ``+ - $ ' # /`` are replaced by a space; the text is lower-cased and
         split on single spaces.
      2. Each token is cut at its first digit and the tail replaced by a
         ``<num>`` placeholder.  ``+<num>`` then becomes ``_pos_num`` and
         ``-<num>`` becomes ``_neg_num``; any token still containing the
         placeholder is dropped.
      3. Tokens starting with ``$`` (e.g. ``$YOKU``) are dropped.
      4. If ``rm_stop_words`` is true, English stop words are dropped.
      5. The remaining tokens are stemmed with the module-level ``stemmer``
         and empty strings are removed.

    A record that ends up with no tokens is reported on stdout (its raw text
    prefixed with ``->``) and skipped, so both returned lists stay aligned.

    Args:
        datas: iterable of record dicts as described above.
        rm_stop_words: when true, remove NLTK English stop words before
            stemming.

    Returns:
        A tuple ``(texts, senti_scores)``: ``texts`` is a list of token lists
        and ``senti_scores`` the matching list of ``"sentiment"`` values.
    """
    # Build the stop-word lookup once per call; evaluating
    # stopwords.words('english') inside the comprehension would repeat the
    # lookup for every single token.
    if rm_stop_words:
        stop_set = frozenset(stopwords.words('english'))
    else:
        stop_set = frozenset()

    texts = []
    senti_scores = []
    for data in datas:
        snippet = data["snippet"]

        if isinstance(snippet, list):
            snippet = ' '.join(snippet)

        # Some records have an empty snippet ("" or []): fall back to the
        # tweet text.  The check runs *after* the join so that an empty list
        # is caught too.  A missing tweet leaves the text empty, and the
        # record is then reported and skipped below.
        if not snippet or not snippet.strip():
            snippet = data.get("tweet") or ""

        raw_snippet = snippet

        # Replace URLs and unsupported characters with spaces.
        snippet = re.sub(r'https?://[A-Za-z0-9./]+', ' ', snippet)
        snippet = re.sub(r"[^a-zA-Z0-9+\-$'#/]", ' ', snippet)

        words = snippet.lower().split(' ')

        # Collapse numbers: keep only signed ones as _pos_num / _neg_num.
        words = [re.sub(r"\d+.*", "<num>", word) for word in words]
        words = [re.sub(r'\+<num>.*', '_pos_num', word) for word in words]
        words = [re.sub(r'-<num>.*', '_neg_num', word) for word in words]
        words = [word for word in words if "<num>" not in word]

        # Drop company tickers written as $SYMBOL.
        words = [word for word in words if not word.startswith('$')]

        if rm_stop_words:
            words = [word for word in words if word not in stop_set]

        words = [stemmer.stem(word) for word in words]
        # split(' ') yields '' for consecutive spaces; discard those.
        words = [word for word in words if word != '']

        # Report and skip records with no usable tokens.
        if not words:
            print("->" + raw_snippet)
            continue

        texts.append(words)
        senti_scores.append(data["sentiment"])

    return texts, senti_scores

In [7]:
train_texts, y_train = preprocess(train_datas, rm_stop_words=False)
test_texts, y_test = preprocess(test_datas, rm_stop_words=False)

# Every kept text must still be paired with its sentiment label.
assert len(train_texts) == len(y_train)
assert len(test_texts) == len(y_test)

# Compare record counts before and after preprocessing; preprocess skips
# records whose text ends up with no tokens.
# print(...) with a single argument behaves the same on Python 2 and 3.
print("before preprocesing, len(train_datas): %d" % len(train_datas))
print("after preprocesing, len(train_texts): %d" % len(train_texts))
print("before preprocesing, len(test_datas): %d" % len(test_datas))
print("after preprocesing, len(test_texts): %d" % len(test_texts))

->$YOKU 0
->15%
before preprocesing, len(train_datas): 1396
after preprocesing, len(train_texts): 1394
before preprocesing, len(test_datas): 634
after preprocesing, len(test_texts): 634


In [11]:
# Turn the token lists into dense bag-of-words count matrices.
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer settings: n-gram range and vocabulary size cap.
ngram_range = (1, 3)
max_features = 3200

vectorizer = CountVectorizer(ngram_range=ngram_range, max_features=max_features)

# The vocabulary is fitted on the training texts only ...
corpus = [' '.join(text) for text in train_texts]
x_train = vectorizer.fit_transform(corpus).toarray()

# ... and then reused as-is to encode the test texts.
corpus = [' '.join(text) for text in test_texts]
x_test = vectorizer.transform(corpus).toarray()


In [12]:
# Persist the encoded features and labels.  np.save appends ".npy" to a
# file name that does not already end with it.
np.save("data/x_train", x_train)
np.save("data/x_test", x_test)
np.save("data/y_train", y_train)
np.save("data/y_test", y_test)

# print(...) with a single argument behaves the same on Python 2 and 3.
print("create file: data/x_train")
print("create file: data/x_test")
print("create file: data/y_train")
print("create file: data/y_test")

create file: data/x_train
create file: data/x_test
create file: data/y_train
create file: data/y_test
