In [None]:
import pandas as pd
import sys
from dateutil import parser
import dateutil
import matplotlib.dates as dates
import re
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pyplot import *
import matplotlib.dates as mdates
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Read the tweet export; the file is UTF-16 encoded and comma separated.
data = pd.read_csv(
    "../data/tweets_public_ES.csv",
    sep=",",
    encoding="utf-16",
)

In [None]:
# Keep only the first row for each distinct tweet text.
data = data.drop_duplicates(subset=['text'], keep='first')

In [None]:
# Parse every raw `tweet_created` value into a datetime and store it as `date`.
parse_timestamp = dateutil.parser.parse
data['date'] = [parse_timestamp(created_at) for created_at in data['tweet_created']]

## tweets' distribution

In [None]:
# Daily tweet counts per sentiment class, drawn as one line per class.
fig, ax = plt.subplots()

# Rows: calendar days; columns: one per `airline_sentiment` value.
daily_by_sentiment = (
    data
    .groupby([pd.Grouper(key='date', freq='D'), 'airline_sentiment'])
    .size()
    .unstack()
)
daily_by_sentiment.plot(
    figsize=(15, 7),
    color=['#FF860C', 'grey', '#0080FF'],
    linewidth=2,
    ax=ax,
)
ax.xaxis.set_major_locator(mdates.DayLocator(interval=3))  # one major tick every 3 days
ax.set_ylabel("number of tweets")
"""
savefig("sentiment_timeline_ES.png", papertype=None, format=None,
        transparent=False, bbox_inches='tight', pad_inches=0.1,
        frameon=None)
""";

In [None]:
from matplotlib.pyplot import *

# Daily tweet counts per sentiment class, with the frame removed, a legend
# in the upper-left corner and rotated date labels.
fig, ax = plt.subplots()

# Rows: calendar days; columns: one per `airline_sentiment` value.
sentiment_per_day = (
    data
    .groupby([pd.Grouper(key='date', freq='D'), 'airline_sentiment'])
    .size()
    .unstack()
)
sentiment_per_day.plot(
    figsize=(15, 7),
    color=['#FF860C', 'grey', '#0080FF'],
    linewidth=2,
    ax=ax,
)
ax.xaxis.set_major_locator(mdates.DayLocator(interval=3))  # one major tick every 3 days
ax.set_ylabel("number of tweets")

# Hide all four borders of the axes.
for side in ('left', 'top', 'right', 'bottom'):
    ax.spines[side].set_visible(False)

ax.legend(loc='upper left')
plt.xticks(rotation=45)
"""
savefig("sentiment_timeline_ES.png", papertype=None, format=None,
        transparent=False, bbox_inches='tight', pad_inches=0.1,
        frameon=None)
""";

In [None]:
# Work on a separate deep copy so later column changes on `df` leave `data` untouched.
df = data.copy(deep=True)

## text processing

In [None]:
def remove_whitespace(x):
    """
    Collapse every run of whitespace in a string into a single space and
    drop leading/trailing whitespace.

    x: a string. Any value that cannot be split and re-joined as text
       (e.g. None or a float NaN) is returned unchanged.

    Returns the normalised string, or `x` itself when it is not text.
    """
    try:
        # str.split() without a separator splits on any whitespace run and
        # drops empty pieces, so re-joining with one space normalises it.
        return " ".join(x.split())
    except (AttributeError, TypeError):
        # Best-effort helper: non-text input is passed through as-is.
        # Only the errors caused by non-text input are caught, so unrelated
        # failures (e.g. KeyboardInterrupt) are no longer swallowed.
        return x

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words('spanish')

# found these stopwords below handy to take out
new_stoplist = ['https', 'el', 'de', 'co', 'lo', 'que', 'la', 'en', 'con', 'por', 'los', 'un', 'del', 'n', 't']
stop.extend(new_stoplist)

# Set membership is O(1); the list would be scanned once per word.
stop_lookup = set(stop)

df["text"] = df["text"].str.lower()

# Raw string so that "\w" and "\d" reach the regex engine instead of being
# treated as (invalid) Python string escapes.
# The character class has no "/", so the match stops at the first slash:
# only the scheme and host part of a link is replaced.
url_pattern = re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+")
df["text"] = df["text"].apply(lambda x: url_pattern.sub("url", x)) # convert links to "url"

# NOTE(review): this ASCII-only class also blanks accented letters and "ñ",
# which splits Spanish words at those characters - confirm this is intended.
df["text_2"] = df["text"].apply(lambda x: re.sub("[^a-zA-Z@]", " ", x)) # remove all but alphabetical keeping "@"

df["text_2"] = df["text_2"].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_lookup)) # remove stopwords
df["text_2"] = df["text_2"].apply(remove_whitespace)  # remove extra whitespaces

In [None]:
# Index the frame by the parsed timestamp so rows can be selected by time.
df = df.reset_index().set_index('date')

# Negative tweets created on 14 Dec 2017.
# The upper bound is the (exclusive) start of the next day: the previous
# inclusive bound of 23:59:00 dropped tweets from the last 59 seconds.
created_at = df.index.get_level_values(0)
dec14_neg = df[(created_at >= '2017-12-14 00:00:00') &
               (created_at < '2017-12-15 00:00:00') &
               (df.airline_sentiment == "negative")]

In [None]:
# Extra tokens to leave out of the cloud, on top of wordcloud's STOPWORDS.
ignore = set(('https', 'el', 'de', 'co', 'lo', 'que', 'la', 'en', 'con', 'por', 'los', 'un', 'del', 'n'))
# NOTE(review): `wc` is not defined anywhere in the visible notebook; it is
# unpacked here as a mapping of extra WordCloud keyword arguments - confirm
# where it is set, otherwise this cell raises NameError on a fresh kernel.
fwc = {'stopwords': STOPWORDS | ignore, **wc}
# Join the tweet texts directly. Series.to_string() is a display rendering:
# it also writes the index labels into the text and is subject to pandas
# display options such as display.max_colwidth.
text = " ".join(dec14_neg['text_2'].str.lower())
wordcloud = WordCloud(**fwc, background_color='white',
                        #color_func = 'magma',
                        colormap = "inferno_r",).generate(text)
plt.imshow(wordcloud)
plt.axis("off")
"""
savefig("ES_2017_12_14_neg.png", papertype=None, format=None,
        transparent=False, bbox_inches='tight', pad_inches=0.1,
        frameon=None, dpi=500)
""";

In [None]:
# see most frequent words: one token per row, then the ten highest counts
dec14_tokens = dec14_neg["text_2"].str.split(expand=True).stack()
dec14_tokens.value_counts().head(10)

## airline sentiments

In [None]:
# Move the current index back into the columns, then index the frame by `date`.
df = df.reset_index()
df = df.set_index('date')

def create_columns_with_airlines(df):
    """
    Add one integer 0/1 column per airline to `df`, in place.

    A row gets 1 in an airline's column when the airline name occurs
    (case-insensitively, as a plain substring) in the row's "text" value,
    otherwise 0. Missing text counts as "not mentioned".

    df: DataFrame with a "text" column of strings. It is modified in place.

    Returns None.
    """
    # a list curated based on word frequencies and search terms for data download
    list_of_airlines = ["iberia", "ryanair", "jetblue", "spanair", "vueling", "norwegian", "aireuropa"]

    # Lower-case once; it does not depend on the airline.
    lowered_text = df["text"].str.lower()
    for airline in list_of_airlines:
        # Vectorised replacement for the row-by-row DataFrame.set_value()
        # loop (set_value was removed in pandas 1.0). Every airline column is
        # cast to int here; before, only the last one was converted because
        # the cast sat outside the loop.
        df[airline] = lowered_text.str.contains(airline, regex=False, na=False).astype(int)
# Add the per-airline mention columns to `df` (the function writes into the frame it is given).
create_columns_with_airlines(df)

In [None]:
# Per calendar day, sum each airline's mention column.
airline_columns = ['iberia', 'spanair', 'jetblue', 'vueling', 'ryanair', 'norwegian', 'aireuropa']
airlines_day = df.resample('D').apply({name: 'sum' for name in airline_columns})

In [None]:
# Reshape to long format: one row per (airline, date) pair with its daily count.
airlines_day_unstacked = (
    airlines_day
    .unstack()
    .reset_index()
    .rename(columns={'level_0': 'airline', 0: 'count'})
)

In [None]:
# remove zeros: keep only the rows in which no column equals 0
has_no_zero = (airlines_day_unstacked != 0).all(axis=1)
airlines_day_unstacked = airlines_day_unstacked[has_no_zero]

In [None]:
# Turn the dates into 'YYYY-MM-DD' labels.
# `assign` builds a new frame instead of writing into the filtered slice from
# the previous cell (which triggers SettingWithCopyWarning), and going through
# pd.to_datetime keeps the cell re-runnable once `date` already holds strings.
airlines_day_unstacked = airlines_day_unstacked.assign(
    date=pd.to_datetime(airlines_day_unstacked["date"]).dt.strftime('%Y-%m-%d')
)

In [None]:
import seaborn as sns
# Imported explicitly: the `mpl` alias used before is not imported anywhere
# in the visible notebook.
from matplotlib.colors import ListedColormap

# Rows: airlines; columns: days; cells: number of mentions.
# Keyword arguments are required here: DataFrame.pivot no longer accepts
# index/columns/values positionally (pandas 2.0).
pivot = airlines_day_unstacked.pivot(index="airline", columns="date", values="count")
pivot = pivot.fillna(0) # to remove Nones
sns.set(rc={'figure.figsize':(15,5)})
cmap1 = ListedColormap(sns.color_palette("pink_r", 100))

ax = sns.heatmap(pivot, cmap = cmap1)
#ax = sns.heatmap(pivot)
#ax.set_ylabel("airline mentions")
ax.set_ylabel('')
ax.set_xlabel('')
plt.yticks(fontsize=14)
"""
plt.savefig("airlines_by_day_ES.png", papertype=None, format=None,
        transparent=False, bbox_inches='tight', pad_inches=0.1,
        frameon=None, dpi = 600)
""";

In [None]:
#select only that day
# Filter on the parsed `date` index (the same approach as the `dec14_neg`
# selection) instead of comparing the raw `tweet_created` strings, which only
# orders correctly by coincidence of the text format. The bounds cover the
# whole day: start inclusive, start of the next day exclusive (the previous
# mask excluded tweets stamped exactly 00:00:00).
mask = (df.index >= '2017-12-14 00:00:00') & (df.index < '2017-12-15 00:00:00')
data_2017_12_14 = df.loc[mask]

In [None]:
# Rows of that day whose `text_2` contains "iberia" (case-insensitive; missing text counts as no match).
mentions_iberia = data_2017_12_14["text_2"].str.contains("iberia", case=False, na=False)
data_2017_12_14_iberia = data_2017_12_14.loc[mentions_iberia]

In [None]:
# Show full cell contents instead of truncating long strings.
# None is the "no limit" value (pandas >= 1.0); the old -1 sentinel was
# deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
data_2017_12_14_iberia["text_2"].head()

In [None]:
# `text_2` of the Iberia rows of that day that are labelled "negative".
is_negative = data_2017_12_14_iberia.airline_sentiment == "negative"
data_2017_12_14_iberia_negative = data_2017_12_14_iberia.loc[is_negative, "text_2"]

In [None]:
# Preview the first rows of the negative Iberia texts selected above.
data_2017_12_14_iberia_negative.head()

In [None]:
# Most frequent tokens in the negative Iberia texts (head() shows the first five).
iberia_neg_tokens = (
    data_2017_12_14_iberia_negative
    .reset_index()
    .text_2
    .str.split(expand=True)
    .stack()
)
iberia_neg_tokens.value_counts().head()

In [None]:
def generate_wordcloud(text): # optionally add: stopwords=STOPWORDS and change the arg below
    """
    Draw a word cloud of `text` with matplotlib and show it.

    text: a single string holding the whole corpus.

    The cloud uses a white background and the "inferno_r" colormap, and
    skips wordcloud's STOPWORDS plus the extra tokens listed below.
    Extra WordCloud keyword arguments are read from the module-level
    mapping `wc`.
    NOTE(review): `wc` is not defined in the visible notebook - confirm
    where it is set.
    """
    extra_stopwords = {'https', 'el', 'de', 'co', 'lo', 'que', 'la', 'en', 'con', 'por',
                       'los', 'un', 'del', 'iberia', 'vuelo'}
    # Entries of `wc` come last, so they win over the 'stopwords' key.
    cloud_kwargs = {'stopwords': STOPWORDS | extra_stopwords, **wc}
    cloud = WordCloud(
        **cloud_kwargs,
        background_color='white',
        colormap="inferno_r",
    )
    rendered = cloud.generate(text)
    plt.imshow(rendered)
    plt.axis("off")
    # EN_2015_02_23_neg_american_tweets_without_strings_american_flights
    """
    savefig("ES_2017_12_14_neg_iberia.png", papertype=None, format=None,
        transparent=False, bbox_inches='tight', pad_inches=0.1,
        frameon=None, dpi=500)
    """

    plt.show()

# Join the texts directly. Series.to_string() is a display rendering: it also
# writes the index labels into the corpus and is subject to pandas display
# options such as display.max_colwidth.
generate_wordcloud(" ".join(data_2017_12_14_iberia_negative.str.lower().str.strip()))

In [None]:
# Why do tweets contain "felicidades"? Show the first matching `text` values.
has_felicidades = data_2017_12_14['text'].str.contains("felicidades", case=False)
data_2017_12_14.loc[has_felicidades, "text"].head()

In [None]:
# Why do tweets contain "instantaneamente"? Show the first matching `text` values.
has_instantaneamente = data_2017_12_14['text'].str.contains("instantaneamente", case=False)
data_2017_12_14.loc[has_instantaneamente, "text"].head()

## Iberia or not

In [None]:
def iberia_o_no(df):
    """
    Add an "iberia_o_no" column to `df`, in place.

    The column holds "iberia" for rows whose "text_3" value contains
    "iberia" (case-insensitively, as a plain substring) and "not_iberia"
    for every other row. Missing text counts as "not_iberia".

    df: DataFrame with a "text_3" column of strings. It is modified in place.
        NOTE(review): no visible cell creates "text_3" (only "text" and
        "text_2") - confirm where it comes from.

    Returns None.
    """
    # Vectorised replacement for the row-by-row DataFrame.set_value() loop
    # (set_value was removed in pandas 1.0).
    mentions_iberia = df["text_3"].str.lower().str.contains("iberia", regex=False, na=False)
    df["iberia_o_no"] = mentions_iberia.map({True: "iberia", False: "not_iberia"})
# Label every row of `df` as "iberia" / "not_iberia" (the function writes into the frame it is given).
iberia_o_no(df)

In [None]:
# Split the frame on the "iberia_o_no" label.
labelled_iberia = df.iberia_o_no == "iberia"
df_iberia = df.loc[labelled_iberia]
df_NOT_iberia = df.loc[~labelled_iberia]

In [None]:
# Word frequencies in the Iberia rows and in all other rows, as two-column
# frames (word, count).
# rename_axis + reset_index(name=...) name both columns explicitly. The labels
# produced by a bare value_counts().reset_index() differ between pandas
# versions (pandas 2.0 names the counts column "count" instead of 0), which
# made the previous rename(columns={'index': ..., 0: ...}) miss the counts.
iberia_values = (
    df_iberia.text_3.str.split(expand=True).stack().value_counts()
    .rename_axis('iberia_word')
    .reset_index(name='iberia_value')
)

NOT_iberia_values = (
    df_NOT_iberia.text_3.str.split(expand=True).stack().value_counts()
    .rename_axis('NOT_iberia_word')
    .reset_index(name='NOT_iberia_value')
)

In [None]:
# not iberia
# Left-join the Iberia counts onto the words of the non-Iberia rows (words
# without an Iberia count get NaN), then min-max scale each count column.
merged_counts = pd.merge(
    NOT_iberia_values,
    iberia_values,
    how='left',
    left_on='NOT_iberia_word',
    right_on='iberia_word',
)
other_counts = merged_counts["NOT_iberia_value"]
iberia_counts = merged_counts["iberia_value"]

# Final layout: the word plus the two scaled columns under their plot names.
NOT_iberia_merged = pd.DataFrame({
    'word': merged_counts["NOT_iberia_word"],
    'all_other': (other_counts - other_counts.min()) / (other_counts.max() - other_counts.min()),
    'Iberia': (iberia_counts - iberia_counts.min()) / (iberia_counts.max() - iberia_counts.min()),
})

In [None]:
import matplotlib

# Scatter of the first ten words of `NOT_iberia_merged`:
# scaled frequency in all other airlines (x) against Iberia (y).
x = NOT_iberia_merged["all_other"].head(10)
y = NOT_iberia_merged["Iberia"].head(10)
label = NOT_iberia_merged["word"].head(10)

fig, ax = plt.subplots()

xy_line = (0, 1)
ax.scatter(x, y, c="red", alpha=0.5)
ax.set_xlabel("all other airlines")
ax.set_ylabel("Iberia")

# Write each word next to its point.
for word, x_pos, y_pos in zip(label, x, y):
    ax.annotate(word, (x_pos, y_pos))

# A single sequence is plotted against its positions 0 and 1,
# i.e. the diagonal from (0, 0) to (1, 1).
ax.plot(xy_line, c="grey")

# Hide all four borders of the axes.
for side in ('left', 'top', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
"""
plt.savefig("non_Iberia_words_scatter_top10.png", 
        transparent=True, bbox_inches='tight', pad_inches=0.1,
        frameon=None, format=None, dpi=700)
"""
plt.show();

In [None]:
# Iberia
# Left-join the non-Iberia counts onto the words of the Iberia rows (words
# without a non-Iberia count get NaN), then min-max scale each count column.
joined_counts = pd.merge(
    iberia_values,
    NOT_iberia_values,
    how="left",
    left_on="iberia_word",
    right_on="NOT_iberia_word",
)
iberia_raw = joined_counts["iberia_value"]
other_raw = joined_counts["NOT_iberia_value"]

# Final layout: the word plus the two scaled columns.
iberia_merged = pd.DataFrame({
    "iberia_word": joined_counts["iberia_word"],
    "iberia_value_norm": (iberia_raw - iberia_raw.min()) / (iberia_raw.max() - iberia_raw.min()),
    "NOT_iberia_value_norm": (other_raw - other_raw.min()) / (other_raw.max() - other_raw.min()),
})

In [None]:
# Scatter of the first ten words of `iberia_merged`:
# scaled frequency in Iberia (x) against all other airlines (y).
x = iberia_merged["iberia_value_norm"].head(10)
y = iberia_merged["NOT_iberia_value_norm"].head(10)
label = iberia_merged["iberia_word"].head(10)

fig, ax = plt.subplots()

xy_line = (0, 1)
ax.scatter(x, y, c="red", alpha=0.5)
ax.set_xlabel("Iberia")
ax.set_ylabel("all other airlines")

# Write each word next to its point.
for word, x_pos, y_pos in zip(label, x, y):
    ax.annotate(word, (x_pos, y_pos))

# A single sequence is plotted against its positions 0 and 1,
# i.e. the diagonal from (0, 0) to (1, 1).
ax.plot(xy_line, c="grey")

# Hide all four borders of the axes.
for side in ('left', 'top', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
"""
plt.savefig("Iberia_words_scatter_top10.png", 
        transparent=True, bbox_inches='tight', pad_inches=0.1,
        frameon=None, format=None, dpi=700)
"""
plt.show();

## top words by airline

In [None]:
def number_ocurrences(x):
    """
    Print, for each tracked airline, how many rows of `x` mention it.

    x: DataFrame with a "text_3" column of strings. A row counts for an
       airline when its text contains the airline name (case-insensitive);
       missing text counts as no match.

    Prints one "<airline>: <count>" line per airline and returns None.
    """
    airlines = ["aireuropa", "iberia", "jetblue", "norwegian", "ryanair", "spanair", "vueling"]
    for airline in airlines:
        # Count on the frame that was passed in; the body used to ignore `x`
        # and read the global `df` instead.
        mentions = x["text_3"].str.contains(airline, case=False, na=False)
        print(airline + ": " + str(int(mentions.sum())))
# Print the number of matching rows for each airline.
number_ocurrences(df)

In [None]:
# Ten most frequent tokens in the rows whose `text_3` contains "iberia" (case-insensitive).
iberia = df.loc[df["text_3"].str.contains("iberia", case=False)]
(
    iberia["text_3"]
    .str.split(expand=True)
    .stack()
    .value_counts()
    .head(10)
    .reset_index()
)

In [None]:
# Ten most frequent tokens in the rows whose `text_3` contains "ryanair" (case-insensitive).
ryanair = df.loc[df["text_3"].str.contains("ryanair", case=False)]
(
    ryanair["text_3"]
    .str.split(expand=True)
    .stack()
    .value_counts()
    .head(10)
    .reset_index()
)

In [None]:
# Ten most frequent tokens in the rows whose `text_3` contains "vueling" (case-insensitive).
vueling = df.loc[df["text_3"].str.contains("vueling", case=False)]
(
    vueling["text_3"]
    .str.split(expand=True)
    .stack()
    .value_counts()
    .head(10)
    .reset_index()
)

In [None]:
# Ten most frequent tokens in the rows whose `text_3` contains "spanair" (case-insensitive).
spanair = df.loc[df["text_3"].str.contains("spanair", case=False)]
(
    spanair["text_3"]
    .str.split(expand=True)
    .stack()
    .value_counts()
    .head(10)
    .reset_index()
)