# 0. Loading the data

In [39]:
import pandas as pd
df = pd.read_csv("peers.csv")[["airline_code","review","recommended"]]

In [2]:
df.head(3)

Unnamed: 0,airline_code,review,recommended
0,air-berlin,it has managed to avoid paying. Florence to Lo...,0.0
1,air-berlin,was one of my favourite airlines. Düsseldorf ...,0.0
2,air-berlin,very positive experience. Milan to New York r...,1.0


In [3]:
germanwings = df[df.airline_code=="germanwings"].review
germanwings.head(5)

3626    Seat was fine with enough legroom.  Dusseldorf...
3627    crew were smiling and good.  Berlin to Dusseld...
3628    only two agents available. Check in process at...
3629    good flight and friendly staff.  Amsterdam to ...
3630    never been treated as badly.  I have been a fr...
Name: review, dtype: object

In [4]:
df.groupby("airline_code", as_index=False)["review"].count()

Unnamed: 0,airline_code,review
0,air-berlin,488
1,british-airways,2863
2,eurowings,275
3,germanwings,146
4,klm-royal-dutch-airlines,1002
5,niki,42
6,ryanair,1610
7,vueling-airlines,967


In [35]:
df = df[df.airline_code!="nikki"]
df.head()

Unnamed: 0,airline_code,review,recommended
0,air-berlin,it has managed to avoid paying. Florence to Lo...,0.0
1,air-berlin,was one of my favourite airlines. Düsseldorf ...,0.0
2,air-berlin,very positive experience. Milan to New York r...,1.0
3,air-berlin,Definitely avoid if you can. Berlin to Prague...,0.0
4,air-berlin,"happy their service. Frankfurt to Berlin, our...",1.0


In [40]:
pressed_reviews = df.groupby("airline_code", as_index=False)["review"].agg({"review": lambda x: "%s" % ' '.join(x)[::]})
pressed_reviews.head()

Unnamed: 0,airline_code,review
0,air-berlin,it has managed to avoid paying. Florence to Lo...
1,british-airways,outstanding courtesy and service. I would like...
2,eurowings,"avoided like the plague. Rome to Vienna, I nee..."
3,germanwings,Seat was fine with enough legroom. Dusseldorf...
4,klm-royal-dutch-airlines,Stay away from KLM. Frankfurt to Saint Petersb...


In [7]:
pressed_germanwings = pressed_reviews[pressed_reviews.airline_code=="germanwings"].review.iloc[0]
pressed_germanwings[:300]

'Seat was fine with enough legroom.  Dusseldorf to Berlin. Eurowings flight operated by Germanwings EW9050. Flight had a slight delay of 15 minutes. Flight was smooth and good. Seat was fine with enough legroom. Food and drinks for puchase, crew was just average. crew were smiling and good.  Berlin t'

In [8]:
extra_stopwords = ['flight','seat','time', 'Airline','klm','passenger','passengers','flight','airport','airline']


<h3>Simple sentiment analysis</h3>
Compute the proportion of positive and negative words in a text

In [9]:
def get_words(url):
    """
    Get sentiment coded words from Hu and Liu's sentiment analysis lexicon.
    """
    import requests
    words = requests.get(url).content.decode('latin-1')
    word_list = words.split('\n')
    index = 0
    while index < len(word_list):
        word = word_list[index]
        if ';' in word or not word:
            word_list.pop(index)
        else:
            index+=1
    return word_list

#Get lists of positive and negative words
p_url = 'http://ptrckprry.com/course/ssd/data/positive-words.txt'
n_url = 'http://ptrckprry.com/course/ssd/data/negative-words.txt'
positive_words = get_words(p_url)
negative_words = get_words(n_url)

In [10]:
positive_words[:5]

['a+', 'abound', 'abounds', 'abundance', 'abundant']

In [11]:
negative_words[:5]

['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable']

In [12]:
df.groupby("airline_code", as_index=False).review.count()

Unnamed: 0,airline_code,review
0,air-berlin,488
1,british-airways,2863
2,eurowings,275
3,germanwings,146
4,klm-royal-dutch-airlines,1002
5,niki,42
6,ryanair,1610
7,vueling-airlines,967


In [13]:
def pos_neg_freq(text):
    """
    Compute sentiment by looking at the proportion of positive and negative words in the text.
    """
    from nltk import word_tokenize
    pos = neg = 0
    for word in word_tokenize(text):
        if word in positive_words:
            pos+=1
        if word in negative_words:
            neg+=1
    return { "Positive": round(pos/len(word_tokenize(text))*100,1),
            "Negative": round(neg/len(word_tokenize(text))*100,1),
            "Net": round((pos-neg)/len(word_tokenize(text))*100,1)}

In [14]:
def pos_neg_freg_documents(documents):
    """
    Compute sentiment dor the data frame by using pos_neg_freq() funciton.
    """
    freq_list = []
    for (review,airline_code) in zip(documents["review"], documents["airline_code"]):
        freq_dict = {"airline_code" : airline_code}
        freq_dict.update(pos_neg_freq(review))
        freq_list.append(freq_dict)
    freq_list = freq_list[-1:] + freq_list[:-1]
    freq = pd.DataFrame.from_dict(freq_list)
    return freq[["airline_code","Positive","Negative","Net"]]
        

In [15]:
pos_neg = pos_neg_freg_documents(pressed_reviews)

In [33]:
pos_neg

Unnamed: 0,airline_code,Positive,Negative,Net
0,vueling-airlines,1.9,2.7,-0.8
1,air-berlin,3.1,2.3,0.8
2,british-airways,3.3,2.4,0.8
3,eurowings,2.1,2.6,-0.6
4,germanwings,3.5,2.1,1.4
5,klm-royal-dutch-airlines,4.1,2.1,2.0
6,niki,5.5,1.8,3.7
7,ryanair,2.5,2.6,-0.1


## Simple sentiment analysis using NRC data
NRC data codifies words with emotions where 14,182 words are coded into 2 sentiments and 8 emotions. For example, the word abandonment is associated with anger, fear, sadness and has a negative sentiment.

In [17]:
"""
Rear the NRC sentiment data.
"""
nrc = "NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt"
count=0
emotion_dict=dict()
with open(nrc,'r') as f:
    all_lines = list()
    for line in f:
        if count < 46:
            count+=1
            continue
        line = line.strip().split('\t')
        if int(line[2]) == 1:
            if emotion_dict.get(line[0]):
                emotion_dict[line[0]].append(line[1])
            else:
                emotion_dict[line[0]] = [line[1]]
    

In [18]:
emotion_dict['abandoned']

['anger', 'fear', 'negative', 'sadness']

In [19]:
def emotion_analyzer(text,emotion_dict=emotion_dict):
    """
    Analize 8 emotions and 2 sentiments using NRC data.
    """
    #Set up the result dictionary
    emotions = {x for y in emotion_dict.values() for x in y}
    # print(type(emotions),emotions)
    emotion_count = dict()
    for emotion in emotions:
        emotion_count[emotion] = 0

    #Analyze the text and normalize by total number of words
    total_words = len(text.split())
    for word in text.split():
        if emotion_dict.get(word):
            for emotion in emotion_dict.get(word):
                emotion_count[emotion] += 1/len(text.split())
    #rounding the values in emotion_count
    return emotion_count

emotion_analyzer(pressed_reviews["review"][3])

{'disgust': 0.004561791539950241,
 'trust': 0.029858999170583427,
 'anticipation': 0.02571191595244684,
 'joy': 0.017348631462538053,
 'negative': 0.023223666021564886,
 'anger': 0.006773569256289754,
 'sadness': 0.012164777439867318,
 'surprise': 0.009469173348078536,
 'fear': 0.008501520597179998,
 'positive': 0.03939729057229758}

In [20]:
def emotion_analyzer_documents(documents):
    """
    Analize emotions and sentiments for data frame.
    """
    freq_list = []
    for (review,airline_code) in zip(documents["review"], documents["airline_code"]):
        freq_dict = {"airline_code" : airline_code}
        freq_dict.update(emotion_analyzer(review))
        freq_list.append(freq_dict)
    freq_list = freq_list[-1:] + freq_list[:-1]
    freq = pd.DataFrame.from_dict(freq_list).round(2)
    return freq

In [21]:
emotions = emotion_analyzer_documents(pressed_reviews)
emotions

Unnamed: 0,airline_code,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,vueling-airlines,0.01,0.02,0.01,0.01,0.01,0.02,0.03,0.01,0.01,0.02
1,air-berlin,0.01,0.02,0.01,0.01,0.02,0.02,0.04,0.01,0.01,0.03
2,british-airways,0.01,0.02,0.01,0.01,0.02,0.02,0.04,0.01,0.01,0.03
3,eurowings,0.01,0.02,0.01,0.01,0.01,0.02,0.03,0.01,0.01,0.02
4,germanwings,0.01,0.03,0.0,0.01,0.02,0.02,0.04,0.01,0.01,0.03
5,klm-royal-dutch-airlines,0.01,0.02,0.0,0.01,0.02,0.02,0.04,0.01,0.01,0.03
6,niki,0.01,0.02,0.01,0.01,0.02,0.02,0.04,0.01,0.01,0.04
7,ryanair,0.01,0.02,0.01,0.01,0.02,0.02,0.03,0.01,0.01,0.02


## Weighted word analysis using Vader ##
Vader contains a list of 7500 features weighted by how positive or negative they are.
It uses these features to calculate stats on how positive, negative and neutral a passage is.
And combines these results to give a compound sentiment (higher = more positive) for the passage.  Human trained on twitter data and generally considered good for informal communication. 10 humans rated each feature in each tweet in context from -4 to +4.
* Calculates the sentiment in a sentence using word order analysis: "marginally good" will get a lower positive score than "extremely good"
* Computes a "compound" score based on heuristics (between -1 and +1)
* Includes sentiment of emoticons, punctuation, and other 'social media' lexicon elements

In [22]:
def vader_comparison(documents):
    """
    Analize the intensity of sentiments using Vader's list. 
    """
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    from nltk.tokenize import sent_tokenize
    analyzer = SentimentIntensityAnalyzer()
    freq_list = []
    for (review,airline_code) in zip(documents["review"], documents["airline_code"]):
        sentences = sent_tokenize(review)
        pos=compound=neu=neg=0
        for sentence in sentences:
            vs = analyzer.polarity_scores(sentence)
            pos+=vs['pos']/(len(sentences))
            compound+=vs['compound']/(len(sentences))
            neu+=vs['neu']/(len(sentences))
            neg+=vs['neg']/(len(sentences))
        freq_dict = {"Compound": round(compound,2),
                     "Negative": round(neg,2),
                     "Neutral": round(neu,2),
                     "Positive": round(pos,2),
                     "airline_code" : airline_code}
        freq_list.append(freq_dict)
    freq = pd.DataFrame.from_dict(freq_list)
    return freq[["airline_code", "Neutral", "Positive", "Negative", "Compound"]]

In [23]:
vader = vader_comparison(pressed_reviews)
vader

Unnamed: 0,airline_code,Neutral,Positive,Negative,Compound
0,air-berlin,0.81,0.1,0.08,0.04
1,british-airways,0.81,0.11,0.08,0.05
2,eurowings,0.81,0.08,0.11,-0.03
3,germanwings,0.8,0.11,0.07,0.07
4,klm-royal-dutch-airlines,0.79,0.14,0.07,0.12
5,niki,0.77,0.17,0.06,0.18
6,ryanair,0.82,0.09,0.09,0.02
7,vueling-airlines,0.82,0.06,0.11,-0.07


In [24]:
vader.mean()

Neutral     0.80375
Positive    0.10750
Negative    0.08375
Compound    0.04750
dtype: float64