In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

In [2]:
# Location of the tab-separated training data.
tr_f = './Data/train.tsv'
# `DataFrame.from_csv` was removed in pandas 1.0. `read_csv` with `index_col=0`
# keeps the first column as the index, which is what `from_csv` did by default.
# `from_csv` also tried to parse the index as dates; the index displayed in this
# notebook is an integer PhraseId, so that option is intentionally not carried over.
train = pd.read_csv(tr_f, sep='\t', index_col=0)

In [4]:
def vader_classify(row):
    """Score ``row.text`` with VADER and reduce the scores to a label via ``vote``.

    Args:
        row: Object exposing a ``text`` attribute holding the string to score.

    Returns:
        Whatever ``vote`` returns for the 'pos' and 'neg' scores of the text.

    NOTE(review): a second ``vader_classify`` taking the text directly is
    defined later in this notebook and replaces this one once that cell runs.
    NOTE(review): the frames displayed in this notebook expose the text as
    ``Phrase``, not ``text`` -- confirm this row-based variant is still needed.
    """
    # A new analyzer is built for every call, exactly as before.
    scores = SentimentIntensityAnalyzer().polarity_scores(row.text)
    return vote(scores['pos'], scores['neg'])

def keep_first(group):
    """Reduce a group to the 'Phrase' and 'Sentiment' values of its first row.

    Args:
        group: DataFrame with at least 'Phrase' and 'Sentiment' columns.

    Returns:
        pd.Series indexed by 'Phrase' and 'Sentiment' holding the values found
        at position 0 of the corresponding columns.
    """
    kept_columns = ("Phrase", "Sentiment")
    return pd.Series({column: group[column].iloc[0] for column in kept_columns})

def mean_sentiment(row):
    """Average the 'Sentiment' of every ``train`` row belonging to one sentence.

    Args:
        row: Object whose ``name`` attribute is compared against
            ``train.SentenceId`` (for a DataFrame row this is its index label).

    Returns:
        Mean of ``train['Sentiment']`` over the rows whose SentenceId equals
        ``row.name``. Reads the module-level ``train`` frame.
    """
    same_sentence = train["SentenceId"] == row.name
    return train.loc[same_sentence, "Sentiment"].mean()

In [31]:
# Collapse `train` to one row per SentenceId, keeping the Phrase and Sentiment
# of the first row in each group (see `keep_first`).
sentences = (
    train
    .groupby("SentenceId")[["Phrase", "Sentiment"]]
    .apply(keep_first)
)
sentences.head()

Unnamed: 0_level_0,Phrase,Sentiment
SentenceId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,1
2,"This quiet , introspective and entertaining in...",4
3,"Even fans of Ismail Merchant 's work , I suspe...",1
4,A positively thrilling combination of ethnogra...,3
5,Aggressive self-glorification and a manipulati...,1


In [32]:
# Copy the SentenceId index into a regular column so it is still available
# after the index is replaced.
sentences = sentences.assign(SentenceId=sentences.index)

In [33]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old index
# values instead of inserting them as a column.
sentences = sentences.reset_index(drop=True)
# Preview the first rows.
sentences.head()

Unnamed: 0,Phrase,Sentiment,SentenceId
0,A series of escapades demonstrating the adage ...,1,1
1,"This quiet , introspective and entertaining in...",4,2
2,"Even fans of Ismail Merchant 's work , I suspe...",1,3
3,A positively thrilling combination of ethnogra...,3,4
4,Aggressive self-glorification and a manipulati...,1,5


In [4]:
def vote(pos, neg):
    """Turn a positive and a negative score into a binary polarity label.

    Args:
        pos: Positive score.
        neg: Negative score.

    Returns:
        1 when ``pos >= neg`` (ties, including two zeros, count as positive),
        otherwise 0.

    The previous ``if``/``elif`` pair had no fallback branch, so inputs for
    which neither comparison is true (e.g. a NaN score) left the result
    unbound and raised UnboundLocalError. Such inputs now yield 0.
    """
    return 1 if pos >= neg else 0

# Shared analyzer, created on first use so that classifying many phrases does
# not rebuild a SentimentIntensityAnalyzer for every single call.
_VADER_ANALYZER = None


def vader_classify(text):
    """Classify ``text`` with VADER and reduce the scores to a label via ``vote``.

    Args:
        text: String to score.

    Returns:
        The result of ``vote`` applied to the 'pos' and 'neg' entries of the
        analyzer's polarity scores.
    """
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    polarity = _VADER_ANALYZER.polarity_scores(text)
    return vote(polarity['pos'], polarity['neg'])

def intensity(row):
    """Binarize a row's sentiment, deferring the value 2 to its sentence's label.

    Args:
        row: Object with ``Sentiment`` and ``SentenceId`` attributes.

    Returns:
        int: ``int(row.Sentiment > 2)`` when Sentiment is not 2; otherwise the
        'NewSentiment' value stored in the module-level ``sentences`` frame for
        the row's SentenceId.

    Raises:
        ValueError: if ``sentences`` does not hold exactly one entry for the
            row's SentenceId.
    """
    if row.Sentiment != 2:
        return int(row.Sentiment > 2)

    matches = sentences.loc[sentences.SentenceId == row.SentenceId, "NewSentiment"]
    if len(matches) != 1:
        raise ValueError(
            "expected exactly one sentence with SentenceId %r, found %d"
            % (row.SentenceId, len(matches))
        )
    # Pull the scalar out explicitly: calling int() on a one-element Series is
    # deprecated in recent pandas releases.
    return int(matches.iloc[0])

In [40]:
# Label every sentence with `vader_classify`. Only the Phrase column is needed,
# so apply over that column directly instead of building a row object per
# record with DataFrame.apply(axis=1).
sentences["NewSentiment"] = sentences["Phrase"].apply(vader_classify)

In [41]:
# Preview the per-sentence frame, which now carries the NewSentiment column.
sentences.head()

Unnamed: 0,Phrase,Sentiment,SentenceId,NewSentiment
0,A series of escapades demonstrating the adage ...,1,1,1
1,"This quiet , introspective and entertaining in...",4,2,1
2,"Even fans of Ismail Merchant 's work , I suspe...",1,3,0
3,A positively thrilling combination of ethnogra...,3,4,0
4,Aggressive self-glorification and a manipulati...,1,5,0


In [47]:
# Call `intensity` once per row of `train` and store the result as NewSentiment.
# NOTE(review): `intensity` filters the whole `sentences` frame for each row
# whose Sentiment is 2, so this cell may be slow on large inputs.
train["NewSentiment"] = train.apply(intensity, axis=1)
# Preview the first rows.
train.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,NewSentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,A series of escapades demonstrating the adage ...,1,0
2,1,A series of escapades demonstrating the adage ...,2,1
3,1,A series,2,1
4,1,A,2,1
5,1,series,2,1


In [52]:
# Rename by column name rather than overwriting all labels positionally: the
# positional form depends on column order and fails with a length mismatch if
# this cell is rerun after more columns have been added.
train = train.rename(columns={"NewSentiment": "Binary1"})

In [55]:
# Boolean flag: True where Sentiment is strictly greater than 2.
train["Binary2"] = train["Sentiment"].gt(2)

In [58]:
# Store the flag as integers (0/1) instead of booleans.
train["Binary2"] = train["Binary2"].astype("int64")
train.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Binary1,Binary2
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,A series of escapades demonstrating the adage ...,1,0,0
2,1,A series of escapades demonstrating the adage ...,2,1,0
3,1,A series,2,1,0
4,1,A,2,1,0
5,1,series,2,1,0


In [12]:
# Persist the labelled frame as UTF-8 text. The file is tab-separated despite
# its .csv extension, so it must be read back with sep='\t'.
train.to_csv("./Data/train_bin.csv", sep='\t', encoding='utf-8')

In [2]:
# Reload the saved, tab-separated frame. `DataFrame.from_csv` was removed in
# pandas 1.0; `read_csv` with `index_col=0` keeps the first column as the index,
# matching the old default.
# NOTE(review): the saved output of this cell lists the columns Vader, Naive and
# VaderOnPhrase, while the cells above write Binary1 and Binary2 -- the file on
# disk appears to come from a different run; confirm before relying on it.
train = pd.read_csv("./Data/train_bin.csv", sep='\t', encoding='utf-8', index_col=0)
train.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Vader,Naive,VaderOnPhrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,A series of escapades demonstrating the adage ...,1,0,0,0
2,1,A series of escapades demonstrating the adage ...,2,1,0,1
3,1,A series,2,1,0,1
4,1,A,2,1,0,1
5,1,series,2,1,0,1


In [8]:
def phrase2toPol(row):
    """Binarize a row's sentiment, classifying the phrase itself when it is 2.

    Args:
        row: Object with ``Sentiment`` and ``Phrase`` attributes.

    Returns:
        ``int(row.Sentiment > 2)`` when Sentiment is not 2; otherwise the
        result of ``vader_classify(row.Phrase)``.
    """
    # Guard clause: every value other than 2 is decided by the threshold alone.
    if row.Sentiment != 2:
        return int(row.Sentiment > 2)
    return vader_classify(row.Phrase)

In [9]:
# Call `phrase2toPol` once per row of `train` and store the result as VaderOnPhrase.
train["VaderOnPhrase"] = train.apply(phrase2toPol, axis=1)

In [10]:
# Preview the first rows, including the VaderOnPhrase column.
train.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Vader,Naive,VaderOnPhrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,A series of escapades demonstrating the adage ...,1,0,0,0
2,1,A series of escapades demonstrating the adage ...,2,1,0,1
3,1,A series,2,1,0,1
4,1,A,2,1,0,1
5,1,series,2,1,0,1


In [11]:
# Share of rows whose VaderOnPhrase label is 0.
len(train.loc[train["VaderOnPhrase"].eq(0)]) / len(train)

0.2841022683583237

In [5]:
# Share of rows whose VaderOnPhrase label is 0.
# NOTE(review): this repeats the computation of the preceding cell.
len(train.loc[train["VaderOnPhrase"].eq(0)]) / len(train)

0.2841022683583237