In [1]:
## Standard stuff:
import numpy as np, seaborn as sns, pandas as pd
## For text classification:
import nltk, nltk.sentiment, sklearn
%matplotlib inline

In [35]:
### Download data as pandas dataframe
import requests
path2data = 'https://raw.githubusercontent.com/snorreralund/scraping_seminar/master/english_review_sample.csv'
df = pd.read_csv(path2data)

In [36]:
# Download positive and negative sentiment lexicons, split into words and convert to set representation. 
negative = set(requests.get('http://ptrckprry.com/course/ssd/data/negative-words.txt').text.split(';\n')[-1].split('\n'))
positive = set(requests.get('http://ptrckprry.com/course/ssd/data/positive-words.txt').text.split(';\n')[-1].split('\n'))
print(len(negative),len(positive))

4784 2007


In [65]:
df = pd.read_csv (r'C:\Users\jtoft\Downloads\dataframe_u_more.csv')

In [66]:
# # dropping null value columns to avoid errors 
# df.dropna(inplace = True) 
   
df['country'] = df['location'].str.rsplit(',').str[-1] 
df

Unnamed: 0.1,Unnamed: 0,name,location,bubble,rating,country
0,0,Ate here last night with a group of friends. ...,"Waukesha, Wisconsin","<span class=""ui_bubble_rating bubble_40""></span>",40,Wisconsin
1,1,"Awesome!! Delicious food, nice staff. We had ...","Hong Kong, China","<span class=""ui_bubble_rating bubble_50""></span>",50,China
2,2,We wanted a nice dinner. We could place our o...,"Sollentuna, Sweden","<span class=""ui_bubble_rating bubble_20""></span>",20,Sweden
3,3,Restuarant Tight is located off the main shopp...,"London, United Kingdom","<span class=""ui_bubble_rating bubble_40""></span>",40,United Kingdom
4,4,We concluded our trip to Denmark by stumbling ...,,"<span class=""ui_bubble_rating bubble_50""></span>",50,
5,5,Visited on a Sunday night\nWe booked a table b...,"Newquay, United Kingdom","<span class=""ui_bubble_rating bubble_50""></span>",50,United Kingdom
6,6,"Based on reviews of this establishment, my hus...",,"<span class=""ui_bubble_rating bubble_50""></span>",50,
7,7,"Had a late lunch at Tight, ordering off the lu...",Texas,"<span class=""ui_bubble_rating bubble_50""></span>",50,Texas
8,8,A great dinner with most gracious staff for ou...,"Aurora, Colorado","<span class=""ui_bubble_rating bubble_50""></span>",50,Colorado
9,9,My husband and I ate here for our last dinner ...,"San Francisco, California","<span class=""ui_bubble_rating bubble_50""></span>",50,California


In [68]:
# initialize tokenizer
tokenizer = nltk.tokenize.TweetTokenizer()

# define function
def preprocessing(string):
    return tokenizer.tokenize(string.lower())

In [69]:
documents = df.name.apply(preprocessing)

In [70]:
# Define count function using a list comprehension.
def count_dictionary(tokenized_doc,dictionary):
    return len([word for word in tokenized_doc if word in dictionary])

In [71]:

df['positive_liu'] = documents.apply(count_dictionary,dictionary=positive)
df['negative_liu'] = documents.apply(count_dictionary,dictionary=negative)

In [72]:
df

Unnamed: 0.1,Unnamed: 0,name,location,bubble,rating,country,positive_liu,negative_liu
0,0,Ate here last night with a group of friends. ...,"Waukesha, Wisconsin","<span class=""ui_bubble_rating bubble_40""></span>",40,Wisconsin,3,0
1,1,"Awesome!! Delicious food, nice staff. We had ...","Hong Kong, China","<span class=""ui_bubble_rating bubble_50""></span>",50,China,6,0
2,2,We wanted a nice dinner. We could place our o...,"Sollentuna, Sweden","<span class=""ui_bubble_rating bubble_20""></span>",20,Sweden,3,1
3,3,Restuarant Tight is located off the main shopp...,"London, United Kingdom","<span class=""ui_bubble_rating bubble_40""></span>",40,United Kingdom,3,0
4,4,We concluded our trip to Denmark by stumbling ...,,"<span class=""ui_bubble_rating bubble_50""></span>",50,,4,0
5,5,Visited on a Sunday night\nWe booked a table b...,"Newquay, United Kingdom","<span class=""ui_bubble_rating bubble_50""></span>",50,United Kingdom,3,0
6,6,"Based on reviews of this establishment, my hus...",,"<span class=""ui_bubble_rating bubble_50""></span>",50,,1,1
7,7,"Had a late lunch at Tight, ordering off the lu...",Texas,"<span class=""ui_bubble_rating bubble_50""></span>",50,Texas,4,0
8,8,A great dinner with most gracious staff for ou...,"Aurora, Colorado","<span class=""ui_bubble_rating bubble_50""></span>",50,Colorado,5,0
9,9,My husband and I ate here for our last dinner ...,"San Francisco, California","<span class=""ui_bubble_rating bubble_50""></span>",50,California,1,0


In [73]:
# import nltk.sentiment
import nltk
nltk.download('vader_lexicon')
import nltk.sentiment
# initialize the vader function
vader = nltk.sentiment.vader.SentimentIntensityAnalyzer()
# apply the function and convert to dataframe
vader_df = pd.DataFrame(list(df['name'].apply(vader.polarity_scores)))
# rename columns adding the 'vader_' prefix using a list comprehension
vader_df.columns = ['vader_'+col for col in vader_df.columns]
# merge with original dataframe
df = pd.concat([df,vader_df],axis=1)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jtoft\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [74]:
from afinn import Afinn
afinn = Afinn()
df['afinn'] = df.name.apply(afinn.score)

In [None]:
# # define columns
sentiment_columns = ['afinn','positive_liu','negative_liu']+[col for col in df.columns if 'vader_' in col]
hue = 'rating'
sns.pairplot(df.sample(1000)[sentiment_columns+[hue]],hue=hue)

<seaborn.axisgrid.PairGrid at 0x17027361128>

In [64]:
# Pick documents with positive LIU classification
positive_liu_reviews = df[((df.positive_liu-df.negative_liu)>0)]

# We, now, want to get the indeces of these documents and the words within them LUI classified as positive:
for idx in df.iloc[df[((df.positive_liu-df.negative_liu)>0)].vader_compound.sort_values().index].head().index:
    print(idx,set(documents[idx])&positive)

# # So that, in a final step, we can pick one and look at the whole text to assess why LUI and VADER disagree:
# print(df.name[235])

# #LIU defines 'well' as a positive word. However, 'well' is in the context of 'not' well.  
# print(vader.polarity_scores('the videos are not well organized'))

# # LIU defines 'work' as a positive word. However, here it is in the context of 'hard work'
# print(vader.polarity_scores('find it hard to work through'))

## The VADER understands these negations.  

df.dropna(inplace = True) 
df

AttributeError: 'DataFrame' object has no attribute 'vader_compound'