In [320]:
import numpy as np
import nltk
from nltk.sentiment import vader
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer 
import re
import string
from sklearn.svm import SVR
from scipy.stats import spearmanr, pearsonr
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

In [80]:
!git clone https://github.com/kdhingra307/nlp_data

Cloning into 'nlp_data'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 18 (delta 2), reused 18 (delta 2), pack-reused 0[K
Unpacking objects: 100% (18/18), done.


In [264]:
class tweet:

  def lexicon(self, file):
      return [e.split("\t") for e in open(file).read().split("\n")[:-1]]

  def __init__(self):
      self.vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english', tokenizer=LemmaTokenizer(), preprocessor=self.preprocess)
      self.lemmatizer = WordNetLemmatizer()
      self.emoticon_score = {e[0]:float(e[1]) for e in self.lexicon("nlp_data/lexicons/emoticon.txt")}
      self.emotion_score = {self.preprocess(e[0]):np.array(e[1:]).astype(np.float32) for e in self.lexicon("nlp_data/lexicons/emotion_score.csv")[1:]}
      self.emotion_count = {self.preprocess(e[0, 0]):e[:, -1].astype(np.float32) for e in np.array(self.lexicon("nlp_data/lexicons/emotion_count.txt")).reshape([-1, 10, 3])}
      #self.emotion_hashtag = {self.preprocess(e[0]):float(e[1]) for e in self.lexicon("nlp_data/lexicons/hashtag.txt")}
      self.polarity_hashtag = {self.preprocess(e[0]):float(e[1]) for e in self.lexicon("nlp_data/lexicons/hashtag.txt")}
      self.polarity_sentiment = {self.preprocess(e[0]):np.array([float(e[2]), float(e[3])])  for e in self.lexicon("nlp_data/lexicons/polarity_sentiment.txt")}
      # self.polarity_sentiword = {self.preprocess(e[0]):np.array([float(e[2]), float(e[3])])  for e in self.lexicon("nlp_data/lexicons/polarity_sentiment.txt")}
      mapp = {"positive":0, "negative":1, "neutral":2, "both":3}
      self.pwc_mpqa = {self.preprocess(e[0]):mapp[e[1]]  for e in self.lexicon("nlp_data/lexicons/pwc_mpqa.txt")}
      self.pwc_bing = {self.preprocess(e[0]):mapp[e[1]]  for e in self.lexicon("nlp_data/lexicons/pwc_bing_liu.csv")}



  def preprocess(self, text):
      remove_numbers = re.sub(r'\d+', '', text)
      remove_punctuations = "".join([char.lower() for char in remove_numbers if char not in string.punctuation]) 
      remove_extra_spaces = re.sub('\s+', ' ', remove_punctuations).strip()
      return " ".join([self.lemmatizer.lemmatize(t) for t in word_tokenize(remove_extra_spaces)])

  def feature(self, x):
      output = np.zeros([10+10+1+2+1+1])
      x = preprocess(x)
      for e in x:
          if e in self.emotion_score:
            output[:10] += self.emotion_score[e]
          
          if e in self.emotion_count:
            output[10:20] += self.emotion_count[e]
          
          if e in self.polarity_hashtag:
            output[20] += self.polarity_hashtag[e]
          
          if e in self.polarity_sentiment:
            output[21:23] += self.polarity_sentiment[e]
          
          if e in self.pwc_mpqa:
            output[23] += self.pwc_mpqa[e]
          
          if e in self.pwc_bing:
            output[24] += self.pwc_bing[e]
      
      return np.concatenate([output, list(vader_analyser.polarity_scores(x).values())])

In [265]:
tweet_parser = tweet()

In [266]:
tweet_parser.feature(train_data[0][0])

array([ 2.56046070e+00,  2.82955130e+00,  1.83647573e+00,  1.84195930e+00,
        1.66854285e+00,  7.75093326e+00,  7.42493064e+00,  1.15074075e+00,
        1.25660393e+00,  5.64127346e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -1.86178000e+02,  2.31200000e+03,  3.81200000e+03,  0.00000000e+00,
        0.00000000e+00,  3.17000000e-01,  6.83000000e-01,  0.00000000e+00,
       -7.57900000e-01])

In [263]:
list(vader_analyser.polarity_scores(train_data[0][0]).values())

[0.0, 1.0, 0.0, 0.0]

In [251]:
def process_data(file_name):
  X = []
  y = []
  for e in open(file_name).read().split("\n")[:-1]:
    tab_split = e.split("\t")
    X.append(tab_split[1])
    y.append(tab_split[3])
  
  return np.array(X), np.array(y).astype(np.float32)

In [324]:
train_data =  process_data("nlp_data/data/joy_train")
val_data =  process_data("nlp_data/data/joy_test")

In [325]:
tweet_parser.vectorizer.fit(train_data[0])



CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2),
                preprocessor=<bound method tweet.preprocess of <__main__.tweet object at 0x7f5a2f959978>>,
                stop_words='english', strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<__main__.LemmaTokenizer object at 0x7f5a2f959438>,
                vocabulary=None)

In [326]:
def lex_feature(data):
  return np.concatenate([tweet_parser.vectorizer.transform(data).todense(), np.array([tweet_parser.feature(e) for e in data])], axis=1)

In [327]:
train_x = lex_feature(train_data[0])
val_x = lex_feature(val_data[0])

In [328]:
svm_regression = SVR()
svm_regression.fit(train_x, train_data[1])

print(svm_regression.fit_status_)

output = svm_regression.predict(train_x)
val_output = svm_regression.predict(val_x)

print(pearsonr(val_output, val_data[1]))
print(pearsonr(output, train_data[1]))

print(spearmanr(val_output, val_data[1]))
print(spearmanr(output, train_data[1]))

0
(0.10555881060711204, 0.004749298691680515)
(0.15552319692330957, 7.388987894317273e-06)
SpearmanrResult(correlation=0.10490512251543137, pvalue=0.005016365695322027)
SpearmanrResult(correlation=0.1595378778463356, pvalue=4.238072807795342e-06)


(0.07570299004284718, 0.03692909544883846)

In [331]:
svm_regression = DecisionTreeRegressor()
svm_regression.fit(train_x, train_data[1])

output = svm_regression.predict(train_x)
val_output = svm_regression.predict(val_x)

print(pearsonr(val_output, val_data[1]))
print(pearsonr(output, train_data[1]))

print(spearmanr(val_output, val_data[1]))
print(spearmanr(output, train_data[1]))

(0.3272297242811444, 2.78710263423923e-19)
(0.9998963535393343, 0.0)
SpearmanrResult(correlation=0.31837228500844894, pvalue=2.77808686692216e-18)
SpearmanrResult(correlation=0.9998546133040691, pvalue=0.0)


In [330]:
svm_regression = MLPRegressor()
svm_regression.fit(train_x, train_data[1])

output = svm_regression.predict(train_x)
val_output = svm_regression.predict(val_x)

print(pearsonr(val_output, val_data[1]))
print(pearsonr(output, train_data[1]))

print(spearmanr(val_output, val_data[1]))
print(spearmanr(output, train_data[1]))

(0.07639710078362456, 0.04127106073465666)
(0.2915021825375017, 1.3899664797161746e-17)
SpearmanrResult(correlation=0.02633519253239857, pvalue=0.4823139480997083)
SpearmanrResult(correlation=0.30736817589681953, pvalue=1.8182462269132478e-19)


In [12]:
import nltk

In [52]:
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
vader_analyser.