In [1]:
import pandas as pd
import re
from re import sub
from wordEmbedders import AverageClassifier, Word2Vec, WESCScore
from tqdm.notebook import tqdm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from gensim.models.phrases import Phrases, Phraser
tqdm.pandas()

In [2]:
stops = stopwords.words('english')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [3]:
def cleanText(text, stopwords = None, stemming = None, bigram = None, hashtags = False):
    text = sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text, flags=re.MULTILINE) #remove links
    text = text.lower() #lowercase
    if(hashtags):
        text = sub(r'#\w+', '', text) #hashtags
        text = sub(r'@\w+', '', text) #usernames
    text = sub("'s", ' is', text) #english weird stuff
    text = sub("'nt", ' not', text)
    text = sub("'re", ' are', text)
    text = sub(r'[^a-z]', ' ', text) #remove characters
    words = text.split()
    if(stopwords):
        words = [w for w in words if w not in stops]
    if(stemming):
        words = [stemming(w) for w in words]
    if(bigram):
        words = bigram[words]
    return ' '.join(words)

In [4]:
def bigramFromCorpus(corpus):
    sent = [row.split() for row in df['text']]
    phrases = Phrases(sent, min_count=1)
    bigram = Phraser(phrases)
    return bigram

In [5]:
dataset = 'Sentiment140'

In [6]:
positiveWords = ["good", "nice", "cool", "lovely", "wonderful", "great", "awesome", "fantastic", "amazing", "fun", "excellent"]
negativeWords = ["bad", "horrible", "terrible", "awful", "worst", "shitty", "crappy", "sucks", "hate"]

In [7]:
dataFile = f"../data/{dataset}/Data-Compiled.csv"
df = pd.read_csv(dataFile)

In [12]:
def testCleaning(name, fun):
    data = pd.DataFrame()
    data['sentiment'] = df['sentiment']
    data['text'] = df['text'].progress_map(fun)
    word2vec = Word2Vec(data['text'])
    word2vec.train(tqdm, epochs=30)
    pos = fun(' '.join(positiveWords)).split()
    neg = fun(' '.join(negativeWords)).split()
    classifier = AverageClassifier(word2vec.model.wv, pos, neg, None, None)
    result = classifier.predict(data)
    print(name)
    print(result.balancedAccuracy)
    print(result.f1Score)
    result.save(name + '.csv')

In [9]:
testCleaning('plain', lambda text: cleanText(text))
testCleaning('stopwords', lambda text: cleanText(text, stopwords=stopwords))
testCleaning('stemming', lambda text: cleanText(text, stemming=stemmer.stem))
testCleaning('lemma', lambda text: cleanText(text, stemming=lemmatizer.lemmatize))

clean = df['text'].map(cleanText)
bigram = bigramFromCorpus(clean)
testCleaning('bigrams', lambda text: cleanText(text, bigram=bigram))
testCleaning('hashtags', lambda text: cleanText(text, hashtags=True))

  0%|          | 0/1600000 [00:00<?, ?it/s]

Word2Vec epochs:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1600000 [00:00<?, ?it/s]

plain
0.615583125
0.4543006626641245


  0%|          | 0/1600000 [00:00<?, ?it/s]

Word2Vec epochs:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1600000 [00:00<?, ?it/s]

stopwords
0.59140625
0.4340503039463683


  0%|          | 0/1600000 [00:00<?, ?it/s]

Word2Vec epochs:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1600000 [00:00<?, ?it/s]

stemming
0.61459
0.47292647111615593


  0%|          | 0/1600000 [00:00<?, ?it/s]

Word2Vec epochs:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1600000 [00:00<?, ?it/s]

lemma
0.594851875
0.38353030010527506


  0%|          | 0/1600000 [00:00<?, ?it/s]

Word2Vec epochs:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1600000 [00:00<?, ?it/s]

bigrams
0.5904825
0.38353182032865607


  0%|          | 0/1600000 [00:00<?, ?it/s]

Word2Vec epochs:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1600000 [00:00<?, ?it/s]

hashtags
0.6517831249999999
0.6001301925399818


In [10]:
result = WESCScore.load('./plain.csv')

In [11]:
print(result.confusionMatrix)

256025 | 71092 
-------+-------
543975 | 728908


In [13]:
testCleaning('plain', lambda text: cleanText(text))
testCleaning('stemming', lambda text: cleanText(text, stemming=stemmer.stem))
testCleaning('hashtags', lambda text: cleanText(text, hashtags=True))

  0%|          | 0/1600000 [00:00<?, ?it/s]

Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/1600000 [00:00<?, ?it/s]

plain
0.6858593749999999
0.6865733999731861


  0%|          | 0/1600000 [00:00<?, ?it/s]

Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/1600000 [00:00<?, ?it/s]

stemming
0.689024375
0.6828824017167502


  0%|          | 0/1600000 [00:00<?, ?it/s]

Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/1600000 [00:00<?, ?it/s]

hashtags
0.6218925
0.704827201261544
