In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display # Allows the use of display() for DataFrames

pd.set_option('display.max_columns', None)
plt.style.use(['dark_background'])
%matplotlib inline

In [2]:
reviews_raw = pd.read_csv("reviews_raw.csv")

In [3]:
reviews_raw.head(2)

Unnamed: 0,cabin_service,country,date_flown,date_published,ground_service,id,name,rating,recommended,review_count,route,seat_comfort,seat_type,text,text_header,traveller_type,value_for_money,airline_name,airline_code
0,2.0,United Kingdom,2019-06-01 00:00:00,2019-06-25,1.0,anchor666859,Gyan Fernando,\n1/10\n,0.0,20,Moroni to Anjouan,2.0,Economy Class,✅ Trip Verified | Moroni to Anjouan. It is a v...,"""Not a good airline""",Solo Leisure,2.0,AB Aviation,ab-aviation
1,1.0,United Kingdom,2019-06-01 00:00:00,2019-06-25,1.0,anchor666802,Gyan Fernando,\n1/10\n,0.0,20,Anjouan to Dzaoudzi,2.0,Economy Class,✅ Trip Verified | Anjouan to Dzaoudzi. A very...,"""flight was fortunately short""",Solo Leisure,2.0,AB Aviation,ab-aviation


In [4]:
"""Clean rating"""
def clean_rating(text):
    try:
        rating = int(text[1:][:-4])
    except:
        rating = pd.np.nan
    return rating

print(reviews_raw["rating"].head())
reviews = reviews_raw
reviews["rating"] = reviews_raw["rating"].apply(lambda x: clean_rating(x))
print(reviews["rating"].head())

0    \n1/10\n
1    \n1/10\n
2    \n8/10\n
3    \n1/10\n
4    \n1/10\n
Name: rating, dtype: object
0    1.0
1    1.0
2    8.0
3    1.0
4    1.0
Name: rating, dtype: float64


In [5]:
"""Clean header text"""
def clean_header(header):
    #remove qoute
    text = header.replace('"', '').replace('  ', '')
    if text== "":
        pass
    #remove 'customer review'
    elif text[-15:] == 'customer review':
        text = ""
    elif not text[-1] == '.':
        text += '.'
    return text

reviews.text_header = reviews_raw.text_header.apply(clean_header)

In [6]:
reviews_raw.text.iloc[3][:14]

'Not Verified |'

In [7]:
""" Clean review text"""

def clean_text(text):
    if text== "":
        pass
    #remove 'customer review'
    elif text[0:17] =='✅ Trip Verified |':
        text = text[17:]
    elif text[:20] == '✅ Verified Review | ':
        text = text[20:]
    elif text[:14] == 'Not Verified |':
        text = text[14:]
    elif not text[-1] == '.':
        text += '.'
    return text.replace('"', '').replace('  ', '')

reviews.text = reviews_raw.text.apply(clean_text)

In [8]:
"""Germanwings Flight 9525 accident"""
def days_after_crash(date_published):
    """
    Count days after the accident on 2015.03.25.
    """
    from datetime import datetime
    try:
        days = (datetime.strptime(date_published, "%Y-%m-%d") - datetime(2015, 3, 24)).days
        return days if days>0 else 0
    except:
        return pd.np.nan

In [9]:
reviews["days_after_crash"] = reviews_raw["date_published"].apply(days_after_crash)

print(reviews[["date_published","days_after_crash"]].head())

  date_published  days_after_crash
0     2019-06-25              1554
1     2019-06-25              1554
2     2018-10-12              1298
3     2018-10-05              1291
4     2018-07-29              1223


In [10]:
def add_header(doc):
    text_header = doc[0].replace('"', '').replace('  ', '')
    text = doc[1].replace('"', '').replace('  ', '')
    return text_header + " " + text

In [11]:
reviews["review"] = reviews_raw[["text_header", "text"]].apply(add_header, axis=1)

In [12]:
"""Find germanwings' peers"""
reviews["germanwings"] = reviews["airline_code"] =="germanwings"
germanwings_routes = set(reviews[reviews["germanwings"]]["route"].dropna()) 

peers_set = set(reviews[reviews["route"].isin(germanwings_routes)]["airline_code"])
print(peers_set)

peers = reviews[reviews["airline_code"].isin(peers_set)]

{'germanwings', 'vueling-airlines', 'niki', 'ryanair', 'eurowings', 'air-berlin', 'klm-royal-dutch-airlines', 'british-airways'}


In [13]:
reviews

Unnamed: 0,cabin_service,country,date_flown,date_published,ground_service,id,name,rating,recommended,review_count,route,seat_comfort,seat_type,text,text_header,traveller_type,value_for_money,airline_name,airline_code,days_after_crash,review,germanwings
0,2.0,United Kingdom,2019-06-01 00:00:00,2019-06-25,1.0,anchor666859,Gyan Fernando,1.0,0.0,20,Moroni to Anjouan,2.0,Economy Class,Moroni to Anjouan. It is a very small airline...,Not a good airline.,Solo Leisure,2.0,AB Aviation,ab-aviation,1554,Not a good airline. Moroni to Anjouan. It is ...,False
1,1.0,United Kingdom,2019-06-01 00:00:00,2019-06-25,1.0,anchor666802,Gyan Fernando,1.0,0.0,20,Anjouan to Dzaoudzi,2.0,Economy Class,Anjouan to Dzaoudzi. A very small airline and ...,flight was fortunately short.,Solo Leisure,2.0,AB Aviation,ab-aviation,1554,flight was fortunately short. Anjouan to Dzaou...,False
2,4.0,Germany,2018-10-01 00:00:00,2018-10-12,5.0,anchor612920,M Jager,8.0,1.0,32,Ljubljana to Munich,4.0,Economy Class,Ljubljana to Munich. The homebase airport of ...,the crew was nice.,Family Leisure,5.0,Adria Airways,adria-airways,1298,the crew was nice. Ljubljana to Munich. The h...,False
3,1.0,Germany,2018-10-01 00:00:00,2018-10-05,1.0,anchor611417,Giulia Rossi,1.0,0.0,0,Zurich to Ljubljana,2.0,Economy Class,Zurich to Ljubljana. Very poor customer servic...,Very bad experience overall.,Business,1.0,Adria Airways,adria-airways,1291,Very bad experience overall. Zurich to Ljublja...,False
4,1.0,United States,2018-07-01 00:00:00,2018-07-29,4.0,anchor595049,Galya Slavov,1.0,0.0,0,Vienna to Sofia,4.0,Economy Class,Vienna to Sofia. The flight was delayed by 2 h...,bad customer service.,Family Leisure,1.0,Adria Airways,adria-airways,1223,bad customer service. Vienna to Sofia. The fli...,False
5,3.0,France,2018-05-01 00:00:00,2018-07-19,3.0,anchor592506,Loic Jouan,2.0,0.0,0,Paris to Skopje via Ljubljana,3.0,Economy Class,We were traveling from Paris to Skopje on May ...,overall very poor.,Solo Leisure,2.0,Adria Airways,adria-airways,1213,overall very poor. We were traveling from Pari...,False
6,2.0,Slovenia,2018-06-01 00:00:00,2018-06-30,2.0,anchor587586,P Gamirj,2.0,0.0,0,Ljubljana to Munich,1.0,Economy Class,Ljubljana to Munich. Adria's checkin system is...,Would not fly again.,Business,1.0,Adria Airways,adria-airways,1194,Would not fly again. Ljubljana to Munich. Adri...,False
7,3.0,Czech Republic,2018-06-01 00:00:00,2018-06-24,1.0,anchor586201,B Haruz,3.0,0.0,0,Ljubljana to Prague,3.0,Economy Class,A very unpleasant experience for my family. We...,very unpleasant experience.,Couple Leisure,1.0,Adria Airways,adria-airways,1188,very unpleasant experience. A very unpleasant ...,False
8,5.0,Slovenia,2018-04-01 00:00:00,2018-05-04,5.0,anchor567813,Michel Zombra,10.0,1.0,0,Frankfurt to Ljubljana,5.0,Economy Class,Frankfurt to Ljubljana. Flight was very comfor...,Flight was very comfortable.,Business,5.0,Adria Airways,adria-airways,1137,Flight was very comfortable. Frankfurt to Ljub...,False
9,1.0,Germany,2018-03-01 00:00:00,2018-03-11,1.0,anchor554731,S Hanarosic,1.0,0.0,0,Ljubljana to Frankfurt,2.0,Economy Class,Ljubljana to Frankfurt. Flight delayed for mor...,delayed for more than 2 hours.,Solo Leisure,1.0,Adria Airways,adria-airways,1083,delayed for more than 2 hours. Ljubljana to Fr...,False


In [14]:
reviews_text = reviews.groupby("airline_code")["review"].agg({"review": lambda x: "%s" % ' '.join(x)})

is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.


In [18]:
#peers.to_csv("peers.csv", index=False)

In [19]:
#reviews_text.to_csv("reviews_text.csv")

In [20]:
#reviews.to_csv("reviews.csv", index=False)