In [1]:
from sklearn.naive_bayes import MultinomialNB
import csv
import re
import nltk
import time
import pandas as pd

In [2]:
# Read both training files fully into memory; every line of a file becomes
# one list entry (pos_train.txt -> pos_data, neg_train.txt -> neg_data).
with open('./pos_train.txt', 'r') as pos_file:
    pos_data = pos_file.read().splitlines()
with open('./neg_train.txt', 'r') as neg_file:
    neg_data = neg_file.read().splitlines()

In [3]:
def checkNumber(word):
    """Return '<number>' for numeric-looking tokens, otherwise the token itself.

    A token counts as numeric when only digits remain after every '.', ',',
    '/', '%' and '-' character has been dropped (e.g. '3.5', '1,000', '50%').
    Tokens consisting solely of those separators, and the empty string, are
    returned unchanged.
    """
    digitsOnly = word.translate(str.maketrans('', '', '.,/%-'))
    return '<number>' if digitsOnly.isdigit() else word
    
def removeNumbers(words):
    """Run checkNumber over every token and return the results as a new list."""
    return [checkNumber(token) for token in words]

In [4]:
from hashtag_separator import infer_spaces

def checkHashtag(word):
    """Tell whether a token begins with '#'; the empty string never does."""
    return word.startswith('#')

def replaceHashtags(words):
    """Expand hashtag tokens into a '<hashtag>' marker followed by their words.

    Each token accepted by checkHashtag is replaced by '<hashtag>' plus the
    pieces obtained by splitting infer_spaces(token without its first
    character) on single spaces. All other tokens are copied through as-is.
    Returns a new list.
    """
    expanded = []
    for token in words:
        if checkHashtag(token):
            expanded.append('<hashtag>')
            expanded.extend(infer_spaces(token[1:]).split(' '))
        else:
            expanded.append(token)
    return expanded
        
        

In [5]:
from autocorrect import spell
#https://stackoverflow.com/questions/10072744/remove-repeating-characters-from-words
def emphasize(word):
    """Normalise dot runs and flag elongated tokens.

    * A token starting with '..' is returned as '...'.
    * A token in which some character occurs three or more times in a row has
      every such run collapsed to one character, is passed through spell(),
      and comes back wrapped as '<<corrected>>'.
    * Any other token is returned unchanged.
    """
    if word.startswith('..'):
        return '...'

    # subn reports how many runs were collapsed; zero means nothing changed.
    collapsed, runCount = re.subn(r'(.)\1{2,}', r'\1', word)
    if runCount == 0:
        return word
    return '<<'+spell(collapsed)+'>>'

def emphasizeWords(words):
    """Run emphasize over every token and return the results as a new list."""
    return [emphasize(token) for token in words]

In [6]:
def emphasizePunctiation(words):
    """Collapse runs of '!' / '?' tokens into a single '<emphasis>' token.

    Two or more consecutive tokens that are each exactly '!' or '?' are
    replaced by one '<emphasis>'. An isolated '!' or '?' is kept as it is,
    and every other token is copied through unchanged.
    """
    marks = ('!', '?')
    total = len(words)
    result = []
    pos = 0
    while pos < total:
        current = words[pos]
        if current not in marks:
            result.append(current)
            pos += 1
            continue

        # Walk to the last token of this punctuation run.
        runEnd = pos
        while runEnd + 1 < total and words[runEnd + 1] in marks:
            runEnd += 1

        # Only runs of length >= 2 are collapsed.
        result.append('<emphasis>' if runEnd > pos else current)
        pos = runEnd + 1

    return result

In [7]:
from nltk.corpus import stopwords
stopWords = stopwords.words('english')
# `x in stopWords` on the list scans it linearly for every token. Keep a
# frozenset snapshot next to the list so each lookup is a hash probe.
# The list itself is left untouched for any other code that uses it; if it is
# modified later, rebuild this snapshot.
_stopWordSet = frozenset(stopWords)

def removeStopwords(words):
    """Return the tokens of ``words`` that are not in the stopword list.

    Tokens are compared exactly as given (no case folding happens here).
    Order is preserved and a new list is returned.
    """
    return [token for token in words if token not in _stopWordSet]

In [8]:
# Module-level NLTK instances created once and reused across calls.
# NOTE(review): lancasterStemmer is not referenced by any cell visible in this
# notebook — confirm it is unused before removing it.
lancasterStemmer = nltk.stem.lancaster.LancasterStemmer()
# Used by lemmatize() to reduce each token to its WordNet lemma.
wordNetLemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize(word):
    """Return the lower-cased WordNet lemma of ``word``.

    This is best effort: if the lemmatizer raises an ordinary exception the
    token is returned exactly as it was passed in (not lower-cased).
    Only ``Exception`` subclasses are caught, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate and a long preprocessing run can be
    interrupted.
    """
    try:
        return wordNetLemmatizer.lemmatize(word).lower()
    except Exception:
        # Deliberate fallback: keep the original token rather than abort the
        # whole preprocessing run over one problematic word.
        return word

def normalizeWords(words):
    """Run lemmatize over every token and return the results as a new list."""
    return [lemmatize(token) for token in words]

In [9]:
#https://github.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107
# Sentiment lexicons: every line of each file becomes one entry.
# In the visible cells they are only used for `word in ...` membership tests,
# which run once per token of every tweet. Holding them in frozensets makes
# each test a hash lookup instead of a scan over the whole lexicon.
with open('./positive-words.txt', 'r') as f:
    positiveWords = frozenset(f.read().splitlines())
with open('./negative-words.txt', 'r') as f:
    negativeWords = frozenset(f.read().splitlines())

In [10]:
def checkPositiveNegative(words):
    """Prefix sentiment-bearing tokens with 'positively' / 'negatively'.

    For lookup purposes a token starting with '<<' has its first and last two
    characters removed (the '<<...>>' wrapper). If the lookup form is in
    positiveWords the token is preceded by 'positively'; otherwise, if it is
    in negativeWords, by 'negatively'. The token itself is always kept in its
    original form. Returns a new list.
    """
    tagged = []
    for token in words:
        lookup = token[2:-2] if token[:2] == '<<' else token

        if lookup in positiveWords:
            tagged.extend(('positively', token))
        elif lookup in negativeWords:
            tagged.extend(('negatively', token))
        else:
            tagged.append(token)

    return tagged
    

In [11]:
def parseTweet(t):
    """Preprocess one tweet and return it as a space-joined string.

    The text is split on single spaces, pushed through the token-level stages
    in the fixed order listed below, and joined back with single spaces.
    """
    stages = (
        removeNumbers,
        replaceHashtags,
        emphasizeWords,
        emphasizePunctiation,
        removeStopwords,
        normalizeWords,
        checkPositiveNegative,
    )
    tokens = t.split(' ')
    for stage in stages:
        tokens = stage(tokens)
    return ' '.join(tokens)

In [12]:
def parseData(data):
    """Run parseTweet over every item of ``data`` and return the results.

    Prints 'i/len(data) elapsed_seconds' each time the index reaches a
    positive multiple of 5000, and the total elapsed seconds once finished.
    """
    began = time.time()
    results = []
    for idx, tweet in enumerate(data):
        if idx and not idx % 5000:
            print(str(idx)+'/'+str(len(data)), time.time()-began)
        results.append(parseTweet(tweet))
    print(time.time()-began)

    return results

In [19]:
from textblob.classifiers import NaiveBayesClassifier

In [33]:
# Preprocess only the first 5000 entries of each training list; parseData
# prints the elapsed seconds for each call.
parsed_pos = parseData(pos_data[:5000])
parsed_neg = parseData(neg_data[:5000])

8.556854009628296
7.729357719421387


In [34]:
# One frame per class: a 'text' column holding the preprocessed tweets and a
# constant 'label' column ('pos' / 'neg').
posDF = pd.DataFrame({'text': parsed_pos}).assign(label='pos')
negDF = pd.DataFrame({'text': parsed_neg}).assign(label='neg')

In [35]:
# Combine both classes and shuffle the rows; the fixed seed makes the shuffle
# (and therefore the written file) reproducible across kernel restarts.
df = pd.concat([posDF, negDF]).sample(frac=1, random_state=42).reset_index(drop=True)
# Write data rows only. This file is loaded by TextBlob with format='csv',
# which treats every row as a (text, label) training sample, so a
# 'text,label' header row would be learned as a tweet with the class 'label'.
df.to_csv('parsed_test.csv', index=False, header=False)

In [36]:
# parsed_test.csv is written by pandas, which encodes as UTF-8 by default, so
# decode it as UTF-8 rather than with the platform's default encoding.
# newline='' is what the csv module documentation asks for when a file object
# is handed to a CSV reader.
with open('parsed_test.csv', encoding='utf-8', newline='') as f:
    cl = NaiveBayesClassifier(f, format='csv')

In [37]:
# The classifier was trained on parseTweet output, so the probe sentence has
# to go through the same preprocessing before it is classified.
cl.classify(parseTweet('I suck dick'))

'pos'

In [38]:
# Read the file to classify; every line becomes one entry of test_data.
with open('./test_data.txt', 'r') as test_file:
    test_data = test_file.read().splitlines()

In [48]:
# Classify every test line and collect [id, predicted label] rows.
cl_data = []
start_time = time.time()
for i, line in enumerate(test_data):
    if (i>0 and i%500==0):
        print(str(i)+'/'+str(len(test_data)), time.time()-start_time)
    # Each line is '<id>,<tweet>'; split on the first comma only because the
    # tweet text may itself contain commas.
    fields = line.split(',', 1)
    id_ = fields[0]
    tweet = fields[1]
    # The model was trained on parseTweet output, so apply the same
    # preprocessing here; classifying raw text would present it with tokens
    # it never saw in that form during training.
    cl_data.append([id_, cl.classify(parseTweet(tweet))])
print(time.time()-start_time)
df = pd.DataFrame(cl_data, columns=['Id', 'Prediction'])

100/10000 7.3515729904174805
200/10000 14.485529899597168
300/10000 22.302922010421753
400/10000 29.161931037902832
500/10000 36.11663603782654
600/10000 43.951600074768066
700/10000 51.47758078575134
800/10000 58.543980836868286
900/10000 65.5224220752716
1000/10000 72.61669516563416
1100/10000 79.66771697998047
1200/10000 86.70493912696838
1300/10000 93.74563312530518
1400/10000 100.66223812103271
1500/10000 107.4223120212555
1600/10000 114.47362279891968
1700/10000 121.5224449634552
1800/10000 128.64934515953064
1900/10000 135.6797730922699
2000/10000 142.72301506996155
2100/10000 150.5981569290161
2200/10000 157.54157304763794
2300/10000 165.45492315292358
2400/10000 174.4098241329193
2500/10000 181.92578792572021
2600/10000 189.03197503089905
2700/10000 197.71762108802795
2800/10000 205.74543809890747
2900/10000 213.60946488380432
3000/10000 224.17110681533813
3100/10000 232.73848605155945
3200/10000 240.20346093177795
3300/10000 248.08762311935425
3400/10000 255.29530882835388
35

In [54]:
# Map class labels to the numeric submission format: 'pos' -> 1, anything
# else -> -1. An existing 1 is also accepted so that re-running this cell on
# the already converted column leaves it unchanged instead of turning every
# value into -1.
df['Prediction'] = df['Prediction'].apply(lambda x : 1 if x in ('pos', 1) else -1)

In [55]:
# Write the Id/Prediction frame to submission.csv, leaving out the index column.
df.to_csv('submission.csv', index=False)

In [None]:
def classify(model, test_data, preprocess=None, path='submission.csv'):
    """Classify '<id>,<tweet>' lines, write the predictions to CSV and return them.

    Parameters
    ----------
    model : object exposing ``classify(text)`` that returns a label.
    test_data : iterable of str, each of the form '<id>,<tweet>'. Only the
        first comma separates the fields, so the tweet may contain commas.
    preprocess : callable applied to each tweet before classification.
        Defaults to ``parseTweet`` so inference sees text in the same form
        the notebook's training data was given. Pass ``lambda s: s`` for a
        model trained on raw text.
    path : destination of the CSV file (columns 'Id', 'Prediction', no index).

    Returns
    -------
    pandas.DataFrame with columns 'Id' and 'Prediction'. For empty
    ``test_data`` it has those columns and no rows.

    Raises
    ------
    IndexError if a line contains no comma.
    """
    if preprocess is None:
        # Resolved at call time so the function can be defined before or
        # after parseTweet.
        preprocess = parseTweet

    cl_data = []
    for line in test_data:
        fields = line.split(',', 1)
        id_ = fields[0]
        tweet = fields[1]
        cl_data.append([id_, model.classify(preprocess(tweet))])

    # Passing the columns to the constructor also covers the empty case, where
    # assigning two column names to a zero-column frame would raise.
    df = pd.DataFrame(cl_data, columns=['Id', 'Prediction'])
    df.to_csv(path, index=False)
    return df

In [22]:
# Walk one sample tweet through the preprocessing stages, printing the raw
# text, the initial tokens, the final tokens and the re-joined result.
t = '<user> i .... this sucks ? ! ! haaappyyyyy dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15\n'
print('original:\n', t)
t = t.replace('\n', '')
words = t.split(' ')
print(words)

# Same stages, in the same order, as parseTweet.
for stage in (removeNumbers, replaceHashtags, emphasizeWords,
              emphasizePunctiation, removeStopwords, normalizeWords,
              checkPositiveNegative):
    words = stage(words)
print(words)

t = ' '.join(words)
print('\nafter:\n', t)

original:
 <user> i .... this sucks ? ! ! haaappyyyyy dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15

['<user>', 'i', '....', 'this', 'sucks', '?', '!', '!', 'haaappyyyyy', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', '.', 'only', 'justin', 'and', 'god', 'knows', 'about', 'that', ',', 'but', 'i', 'hope', 'you', 'will', 'follow', 'me', '#believe', '15']
['<user>', '...', 'negatively', 'suck', '<emphasis>', 'positively', '<<happy>>', 'dunno', 'justin', 'read', 'mention', '.', 'justin', 'god', 'know', ',', 'hope', 'follow', '<hashtag>', 'believe', '<number>']

after:
 <user> ... negatively suck <emphasis> positively <<happy>> dunno justin read mention . justin god know , hope follow <hashtag> believe <number>
