# 0. Loading the data

In [1]:
import pandas as pd
df = pd.read_csv("peers.csv")[["airline_code","review","recommended"]]

In [2]:
df.head(3)

Unnamed: 0,airline_code,review,recommended
0,air-berlin,it has managed to avoid paying. Florence to Lo...,0.0
1,air-berlin,was one of my favourite airlines. Düsseldorf ...,0.0
2,air-berlin,very positive experience. Milan to New York r...,1.0


In [3]:
germanwings = df[df.airline_code=="germanwings"].review
germanwings.head(5)

3626    Seat was fine with enough legroom.  Dusseldorf...
3627    crew were smiling and good.  Berlin to Dusseld...
3628    only two agents available. Check in process at...
3629    good flight and friendly staff.  Amsterdam to ...
3630    never been treated as badly.  I have been a fr...
Name: review, dtype: object

In [4]:
df.groupby("airline_code", as_index=False)["review"].count()

Unnamed: 0,airline_code,review
0,air-berlin,488
1,british-airways,2863
2,eurowings,275
3,germanwings,146
4,klm-royal-dutch-airlines,1002
5,niki,42
6,ryanair,1610
7,vueling-airlines,967


In [5]:
df = df[df.airline_code!="nikki"]
df.head()

Unnamed: 0,airline_code,review,recommended
0,air-berlin,it has managed to avoid paying. Florence to Lo...,0.0
1,air-berlin,was one of my favourite airlines. Düsseldorf ...,0.0
2,air-berlin,very positive experience. Milan to New York r...,1.0
3,air-berlin,Definitely avoid if you can. Berlin to Prague...,0.0
4,air-berlin,"happy their service. Frankfurt to Berlin, our...",1.0


In [6]:
pressed_reviews = df.groupby("airline_code", as_index=False)["review"].agg({"review": lambda x: "%s" % ' '.join(x)[::]})
pressed_reviews.head()

Unnamed: 0,airline_code,review
0,air-berlin,it has managed to avoid paying. Florence to Lo...
1,british-airways,outstanding courtesy and service. I would like...
2,eurowings,"avoided like the plague. Rome to Vienna, I nee..."
3,germanwings,Seat was fine with enough legroom. Dusseldorf...
4,klm-royal-dutch-airlines,Stay away from KLM. Frankfurt to Saint Petersb...


In [7]:
pressed_germanwings = pressed_reviews[pressed_reviews.airline_code=="germanwings"].review.iloc[0]
pressed_germanwings[:300]

'Seat was fine with enough legroom.  Dusseldorf to Berlin. Eurowings flight operated by Germanwings EW9050. Flight had a slight delay of 15 minutes. Flight was smooth and good. Seat was fine with enough legroom. Food and drinks for puchase, crew was just average. crew were smiling and good.  Berlin t'

In [8]:
extra_stopwords = ['flight','seat','time', 'Airline','klm','passenger','passengers','flight','airport','airline']


## Topic Modeling ##
LDA: Latent Dirichlet Allocation Model
* Identifies potential topics using pruning techniques
* Computes conditional probabilities for topic word sets
* Identifies the most likely topics
* Does this over multiple passes probabilistically picking topics in each pass

# 2. Data Preprocessing
We will perform the following steps:

* **Tokenization**: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
* Words that have fewer than 3 characters are removed.
* All **stopwords** are removed.
* Words are **lemmatized** - words in third person are changed to first person and verbs in past and future tenses are changed into present (eg. went -> go).
* Words are **stemmed** - words are reduced to their root form (eg. dies, died, dead -> die).

In [9]:
def lemmatize_stemming(text):
    from nltk.stem import WordNetLemmatizer, SnowballStemmer
    stemmer = SnowballStemmer("english")
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    import gensim
    from gensim.utils import simple_preprocess
    from nltk.corpus import stopwords
    newstopwords = stopwords.words("english")
    try:
        newstopwords.extend(extra_stopwords)
    except:
        pass
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in newstopwords and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [10]:
def topic_modeling(documents,num_topics=3, passes = 4,workers=4, num_words=3): 
    processed_docs = documents.map(preprocess)
    import gensim.corpora
    dictionary = gensim.corpora.Dictionary(processed_docs)

    # Remove very rare and very common words
    #dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)
    #Create the Bag-of-words model for each document
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    ## 3.2: TF-IDF 
    from gensim import corpora, models
    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]

    lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                           num_topics=num_topics, 
                                           id2word = dictionary, 
                                           passes = passes, 
                                           workers=workers)

    for idx, topic in lda_model.print_topics(num_words=num_words):
        print("* Topic {}: {}".format(idx+1, topic))

In [11]:
topic_modeling(germanwings,num_topics=3)

* Topic 1: 0.017*"germanw" + 0.010*"servic" + 0.010*"lufthansa"
* Topic 2: 0.015*"crew" + 0.014*"cabin" + 0.012*"germanw"
* Topic 3: 0.023*"germanw" + 0.010*"staff" + 0.010*"delay"


In [12]:
for airline_code in df.airline_code.unique().tolist():
    print("\n** Reviews of ",airline_code)
    text = df[df.airline_code==airline_code].review
    topic_modeling(text,num_topics=2)


** Reviews of  air-berlin
* Topic 1: 0.027*"berlin" + 0.013*"servic" + 0.011*"busi"
* Topic 2: 0.029*"berlin" + 0.011*"servic" + 0.010*"seat"

** Reviews of  british-airways
* Topic 1: 0.013*"servic" + 0.012*"crew" + 0.011*"seat"
* Topic 2: 0.011*"servic" + 0.011*"airway" + 0.010*"london"

** Reviews of  eurowings
* Topic 1: 0.017*"eurow" + 0.010*"hour" + 0.009*"delay"
* Topic 2: 0.019*"eurow" + 0.014*"delay" + 0.013*"servic"

** Reviews of  germanwings
* Topic 1: 0.018*"germanw" + 0.011*"good" + 0.011*"check"
* Topic 2: 0.015*"germanw" + 0.011*"delay" + 0.009*"crew"

** Reviews of  klm-royal-dutch-airlines
* Topic 1: 0.017*"amsterdam" + 0.014*"servic" + 0.013*"good"
* Topic 2: 0.015*"amsterdam" + 0.014*"seat" + 0.011*"servic"

** Reviews of  niki
* Topic 1: 0.020*"drink" + 0.020*"free" + 0.019*"niki"
* Topic 2: 0.017*"check" + 0.013*"return" + 0.011*"vienna"

** Reviews of  ryanair
* Topic 1: 0.023*"ryanair" + 0.012*"board" + 0.011*"check"
* Topic 2: 0.025*"ryanair" + 0.015*"board" +

<h3>Simple sentiment analysis</h3>
Compute the proportion of positive and negative words in a text

In [13]:
def get_words(url):
    """
    Get sentiment coded words from Hu and Liu's sentiment analysis lexicon.
    """
    import requests
    words = requests.get(url).content.decode('latin-1')
    word_list = words.split('\n')
    index = 0
    while index < len(word_list):
        word = word_list[index]
        if ';' in word or not word:
            word_list.pop(index)
        else:
            index+=1
    return word_list

#Get lists of positive and negative words
p_url = 'http://ptrckprry.com/course/ssd/data/positive-words.txt'
n_url = 'http://ptrckprry.com/course/ssd/data/negative-words.txt'
positive_words = get_words(p_url)
negative_words = get_words(n_url)

In [14]:
positive_words[:5]

['a+', 'abound', 'abounds', 'abundance', 'abundant']

In [15]:
negative_words[:5]

['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable']

In [16]:
df.groupby("airline_code", as_index=False).review.count()

Unnamed: 0,airline_code,review
0,air-berlin,488
1,british-airways,2863
2,eurowings,275
3,germanwings,146
4,klm-royal-dutch-airlines,1002
5,niki,42
6,ryanair,1610
7,vueling-airlines,967


In [17]:
def pos_neg_freq(text):
    """
    Compute sentiment by looking at the proportion of positive and negative words in the text.
    """
    from nltk import word_tokenize
    pos = neg = 0
    for word in word_tokenize(text):
        if word in positive_words:
            pos+=1
        if word in negative_words:
            neg+=1
    return { "Positive": round(pos/len(word_tokenize(text))*100,1),
            "Negative": round(neg/len(word_tokenize(text))*100,1),
            "Net": round((pos-neg)/len(word_tokenize(text))*100,1)}

In [18]:
def pos_neg_freg_documents(documents):
    """
    Compute sentiment dor the data frame by using pos_neg_freq() funciton.
    """
    freq_list = []
    for (review,airline_code) in zip(documents["review"], documents["airline_code"]):
        freq_dict = {"airline_code" : airline_code}
        freq_dict.update(pos_neg_freq(review))
        freq_list.append(freq_dict)
    freq_list = freq_list[-1:] + freq_list[:-1]
    freq = pd.DataFrame.from_dict(freq_list)
    return freq[["airline_code","Positive","Negative","Net"]]
        

In [None]:
pos_neg = pos_neg_freg_documents(pressed_reviews)

In [None]:
pos_neg.mean()

## Simple sentiment analysis using NRC data
NRC data codifies words with emotions where 14,182 words are coded into 2 sentiments and 8 emotions. For example, the word abandonment is associated with anger, fear, sadness and has a negative sentiment.

In [20]:
"""
Rear the NRC sentiment data.
"""
nrc = "NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt"
count=0
emotion_dict=dict()
with open(nrc,'r') as f:
    all_lines = list()
    for line in f:
        if count < 46:
            count+=1
            continue
        line = line.strip().split('\t')
        if int(line[2]) == 1:
            if emotion_dict.get(line[0]):
                emotion_dict[line[0]].append(line[1])
            else:
                emotion_dict[line[0]] = [line[1]]
    

In [21]:
emotion_dict['abandoned']

['anger', 'fear', 'negative', 'sadness']

In [22]:
def emotion_analyzer(text,emotion_dict=emotion_dict):
    """
    Analize 8 emotions and 2 sentiments using NRC data.
    """
    #Set up the result dictionary
    emotions = {x for y in emotion_dict.values() for x in y}
    # print(type(emotions),emotions)
    emotion_count = dict()
    for emotion in emotions:
        emotion_count[emotion] = 0

    #Analyze the text and normalize by total number of words
    total_words = len(text.split())
    for word in text.split():
        if emotion_dict.get(word):
            for emotion in emotion_dict.get(word):
                emotion_count[emotion] += 1/len(text.split())
    #rounding the values in emotion_count
    return emotion_count

emotion_analyzer(pressed_reviews["review"][3])

{'anticipation': 0.02571191595244684,
 'anger': 0.006773569256289754,
 'fear': 0.008501520597179998,
 'joy': 0.017348631462538053,
 'trust': 0.029858999170583427,
 'surprise': 0.009469173348078536,
 'negative': 0.023223666021564886,
 'positive': 0.03939729057229758,
 'sadness': 0.012164777439867318,
 'disgust': 0.004561791539950241}

In [23]:
def emotion_analyzer_documents(documents):
    """
    Analize emotions and sentiments for data frame.
    """
    freq_list = []
    for (review,airline_code) in zip(documents["review"], documents["airline_code"]):
        freq_dict = {"airline_code" : airline_code}
        freq_dict.update(emotion_analyzer(review))
        freq_list.append(freq_dict)
    freq_list = freq_list[-1:] + freq_list[:-1]
    freq = pd.DataFrame.from_dict(freq_list).round(2)
    return freq

In [25]:
emotions = emotion_analyzer_documents(pressed_reviews)
emotions

KeyboardInterrupt: 

## Word Clouds
Let's See what sort of words reviews use.

In [None]:
seperated_documents = df.groupby(["airline_code","recommended"], as_index=False)["review"].agg({"review": lambda x: "%s" % ' '.join(x)})
seperated_documents.head(3)

In [None]:
seperated_documents[seperated_documents["airline_code"]=="germanwings"]

In [None]:
pos = seperated_documents.loc[(seperated_documents["airline_code"] == "germanwings") &
                    (seperated_documents["recommended"] == 1), "review"].values[0]
neg = seperated_documents.loc[(seperated_documents["airline_code"] == "germanwings") &
                    (seperated_documents["recommended"] == 0), "review"].values[0]

In [None]:
def wordcloud(text, background_color='white', max_words=50):
    import matplotlib.pyplot as plt
    %matplotlib inline
    plt.style.use(['dark_background'])
    from wordcloud import WordCloud
    from nltk.corpus import stopwords
    newstopwords = stopwords.words("english")
    try:
        newstopwords.extend(extra_stopwords)
    except:
        pass
    wc = WordCloud(background_color=background_color, max_words=max_words,
                   stopwords = newstopwords)
    # Generate and plot wordcloud
    plt.imshow(wc.generate(text))
    plt.axis('off')
    plt.show()

In [None]:
print("Positive reviews of Germanwings")
wordcloud(pos)
print("Negative reviews of Germanwings")
wordcloud(neg, background_color='black')

In [None]:
for (review,airline_code) in zip(pressed_reviews["review"], pressed_reviews["airline_code"]):
    print("Reviews of ",airline_code)
    wordcloud(review)


## Text summarization ##
Text summarization is useful because you can generate a short summary of a large piece of text automatically. These summaries can serve as an input into a topic analyzer to figure out what the main topic of the text is. A naive form of summarization is to identify the most frequent words in a piece of text and use the occurrence of these words in sentences to rate the importance of a sentence.

In [None]:
def summariser(text):
    """
    Naive summarizer: Identify the most frequent words in a piece of text 
    and use the occurrence of these words in sentences 
    to rate the importance of a sentence.
    """
    from nltk.tokenize import word_tokenize
    from nltk.tokenize import sent_tokenize
    from nltk.probability import FreqDist
    from nltk.corpus import stopwords
    from collections import OrderedDict
    summary_sentences = []
    candidate_sentences = {}
    candidate_sentence_counts = {}
    striptext = text.replace('\n\n', ' ')
    striptext = striptext.replace('\n', ' ')
    words = word_tokenize(striptext)
    from nltk.corpus import stopwords
    newstopwords = stopwords.words("english")
    try:
        newstopwords.extend(extra_stopwords)
    except:
        pass
    lowercase_words = [word.lower() for word in words
                      if word not in newstopwords and word.isalpha() ]
    word_frequencies = FreqDist(lowercase_words)
    most_frequent_words = FreqDist(lowercase_words).most_common(20)
    sentences = sent_tokenize(striptext)
    for sentence in sentences:
        candidate_sentences[sentence] = sentence.lower()
    for long, short in candidate_sentences.items():
        count = 0
        for freq_word, frequency_score in most_frequent_words:
            if freq_word in short:
                count += frequency_score
                candidate_sentence_counts[long] = count   
    sorted_sentences = OrderedDict(sorted(
                        candidate_sentence_counts.items(),
                        key = lambda x: x[1],
                        reverse = True)[:4])
    return sorted_sentences   

In [None]:
import gensim.summarization
print(gensim.summarization.summarize(pressed_germanwings, word_count=100))

## Summarizer based on lexical similarity: gensim ##
Gensim uses a network with sentences as nodes and 'lexical similarity' as weights on the arcs between nodes

In [None]:
print(summariser(pressed_germanwings))