In [1]:
# Task 1. Sentiment analysis
# @author: dimitris.paraschakis@mah.se

import re
from sys import stdout

import numpy as np
from autocorrect import spell #https://github.com/phatpiglet/autocorrect
from nltk.stem.snowball import EnglishStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import MultinomialNB
try:
    from sklearn.cross_validation import StratifiedKFold
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20; fall back to
    # its replacement so this cell still imports on current releases.
    from sklearn.model_selection import StratifiedKFold



In [9]:
# Download the WordNet corpus through NLTK's downloader (one-off; the data is
# stored in the local nltk_data directory, as the cell output shows).
# Presumably required by the WordNetLemmatizer that process() relies on.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ae0670\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [17]:
# deal with repeated characters
def merge_multichars(token):
    """Collapse exaggerated character runs in ``token`` to a single character.

    Every run of more than three identical consecutive characters (such as the
    ``ooooo`` in ``sooooo``) is replaced by one occurrence of that character.
    Runs of three or fewer characters are left untouched, so ordinary double
    letters (``cool``, ``happy``) survive.

    Each run is evaluated on its own length. The previous implementation kept a
    single counter per character, so a second repeated run of the same letter
    anywhere in the token inflated the count and no replacement happened at all.

    Args:
        token: the string to normalise; an empty string is returned unchanged.

    Returns:
        The token with every long run squeezed down to one character.
    """
    # (.) captures one character and \1{3,} demands at least three more copies
    # directly after it, i.e. a run of four or more. DOTALL keeps the rule
    # character-agnostic, as the original comparison-based loop was.
    return re.sub(r'(.)\1{3,}', r'\1', token, flags=re.DOTALL)

In [14]:
# prepare the data for sentiment analysis
def process(data):
    """Load a tab-separated sentiment file and normalise its text.

    Each non-blank line must contain at least two tab-separated fields: a label
    followed by the text. Only these first two fields are read. The label
    becomes 1.0 when the first field is exactly ``'neg'`` and 0.0 otherwise.

    The text is split on whitespace. Tokens containing ``http`` or ``@`` are
    dropped, ``&quot;`` entities are removed, and the remaining word-like
    pieces are passed through ``merge_multichars``, ``spell``, the WordNet
    lemmatizer and the English Snowball stemmer, in that order. Rows whose text
    is empty after this cleaning are discarded together with their label.

    Progress is written to stdout on a single, continuously overwritten line.

    Args:
        data: path of the file to read.

    Returns:
        A tuple ``(corpus, y)``: ``corpus`` is a NumPy array holding one
        space-joined string of normalised words per kept row, and ``y`` is a
        float NumPy array with the matching labels.

    Raises:
        IndexError: if a non-blank line has no tab separator.
    """
    st = EnglishStemmer()
    lm = WordNetLemmatizer()
    labels = []
    corpus = []
    # The context manager guarantees the file is closed, even if a row fails.
    with open(data) as sentiment_file:
        for line in sentiment_file:
            # The counter is the number of rows kept so far.
            stdout.write('\rprocessing row: %d...' % len(corpus))
            stdout.flush()
            if not line.strip():
                # A blank line (e.g. trailing newline at end of file) carries
                # neither label nor text; skip it instead of crashing.
                continue
            fields = line.split('\t')
            words = []
            for token in fields[1].split():
                # Links and @-mentions are dropped entirely.
                if 'http' in token or '@' in token:
                    continue
                token = token.replace('&quot;', '')
                # findall only yields non-empty matches, so no length check
                # is needed before normalising each piece.
                for subtoken in re.findall(r"[\w']+", token):
                    subtoken = merge_multichars(subtoken)
                    words.append(st.stem(lm.lemmatize(spell(subtoken))))
            sentence = ' '.join(words).strip()
            if not sentence:
                # Nothing usable left in this row: keep neither text nor label.
                continue
            labels.append(int(fields[0] == 'neg'))
            corpus.append(sentence)
    y = np.array(labels, dtype=float)
    stdout.write('\n')
    return np.array(corpus), y

In [15]:
# calculate scores based on TF-IDF encoding via cross-validation
def cross_validate(corpus, y, model, folds, random_state=None):
    """Estimate ROC AUC of ``model`` on ``corpus`` with stratified k-fold CV.

    For every fold a fresh ``TfidfVectorizer(sublinear_tf=True, use_idf=False)``
    is fitted on the training documents only and then applied to the held-out
    documents. ``model`` is refitted in place on each training fold, and the
    second column of ``model.predict_log_proba`` is stored as the score of the
    held-out rows. A single ROC AUC is computed over all out-of-fold scores.

    Args:
        corpus: indexable collection of documents (strings).
        y: NumPy array of labels aligned with ``corpus``; it is indexed with
            the integer index arrays produced by the splitter.
        model: estimator exposing ``fit`` and ``predict_log_proba``.
        folds: number of cross-validation folds.
        random_state: optional seed for the fold shuffling. The default
            ``None`` keeps the previous unseeded behaviour; pass an integer to
            make the reported score reproducible.

    Returns:
        The ROC AUC computed from the out-of-fold scores.
    """
    try:
        # scikit-learn >= 0.18: the splitter is configured first and then
        # applied to the data. The legacy module was removed in 0.20.
        from sklearn.model_selection import StratifiedKFold as _ModelSelectionSKF
    except ImportError:
        # Older releases only provide the legacy class imported at file level,
        # which takes the labels up front and is itself iterable. Keyword
        # arguments are used because the positional order changed over time.
        splits = StratifiedKFold(y, n_folds=folds, shuffle=True,
                                 random_state=random_state)
    else:
        splitter = _ModelSelectionSKF(n_splits=folds, shuffle=True,
                                      random_state=random_state)
        splits = splitter.split(corpus, y)
    y_pred = np.zeros_like(y, dtype=float)
    for i_train, i_test in splits:
        X_train = [corpus[i] for i in i_train]
        X_test = [corpus[i] for i in i_test]
        y_train = y[i_train]
        # Fit the vocabulary on the training fold only so that no information
        # from the held-out documents leaks into the features.
        vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=False)
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)
        model.fit(X_train, y_train)
        # Log-probabilities are a monotonic transform of probabilities, so
        # they rank the samples identically for ROC AUC.
        y_pred[i_test] = model.predict_log_proba(X_test)[:,1]
    return roc_auc_score(y, y_pred)

In [16]:
# perform sentiment analysis of tweets:
#   1. load and normalise the labelled rows from data/sentiment.tsv
#   2. score a multinomial Naive Bayes model (fit_prior=False, so class priors
#      are not learned from the data) with 10-fold cross-validation
#   3. print the resulting ROC AUC with three decimals
corpus, y = process('data/sentiment.tsv')
classifier = MultinomialNB(fit_prior=False)
roc_auc = cross_validate(corpus, y, classifier, folds=10)
print('auc_roc = %.3f' % (roc_auc))

processing row: 1996...
auc_roc = 0.805
