In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pyplot import *
import matplotlib as mpl
import matplotlib.dates as mdates
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Load the tweet export; the path is relative to the notebook's directory.
data = pd.read_csv("../data/tweets_public.csv", sep=",", encoding="utf-8")

## text processing
**list of airline names**
americanair, delta, jetblue, southwestair, united, usairways, virginamerica

In [None]:
def remove_whitespace(x):
    """
    Collapse the whitespace of a string.

    Leading and trailing whitespace is stripped and every internal run of
    whitespace (spaces, tabs, newlines) becomes a single space.

    x: the value to clean. Non-string values (e.g. None or NaN) are returned
       unchanged, so the function can be applied to a column with gaps.
    returns: the cleaned string, or x itself when x is not a string.
    """
    # Explicit type check instead of the former bare `except: pass`, which
    # silently swallowed every exception, not only "x is not a string".
    if not isinstance(x, str):
        return x
    return " ".join(x.split())

In [None]:
import re  # was missing: the re.sub calls below raised NameError

from nltk.corpus import stopwords

# NOTE(review): this cell modifies `df`, which is only created further down
# (`df = data.copy()`); on a fresh kernel that cell has to be executed first.
stop = stopwords.words('english')
stop_set = set(stop)  # constant-time lookups for the stopword filter below

# Raw strings, so `\w` and `\d` reach the regex engine instead of being parsed
# as (invalid) string escape sequences.
url_pattern = re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+")
non_alpha_pattern = re.compile(r"[^a-zA-Z@]")

df["text"] = df["text"].apply(lambda x: x.lower()) # get lower
df["text"] = df["text"].apply(lambda x: url_pattern.sub("url", x)) # replace links by the token "url"
df["text_2"] = df["text"].apply(lambda x: non_alpha_pattern.sub(" ", x)) # remove all but alphabetical keeping "@"

df["text_3"] = df["text_2"].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_set)) # remove stopwords
df["text_2"] = df["text_2"].apply(remove_whitespace)  # remove extra whitespaces

In [None]:
# Parse the creation timestamps so they can be grouped and resampled by day.
data["tweet_created"] = pd.to_datetime(data["tweet_created"])

## sentiment 

In [None]:
# Daily number of tweets per sentiment class, drawn as one line per class.
# plt.subplots is used explicitly instead of the star-imported `subplots`.
fig, ax = plt.subplots(figsize=(15, 7))
daily_sentiment = (
    data.groupby([pd.Grouper(key='tweet_created', freq='D'), 'airline_sentiment'])
    .size()
    .unstack()
)
daily_sentiment.plot(color=['#FF860C', 'grey', '#0080FF'], linewidth=2, ax=ax)
ax.xaxis.set_major_locator(mdates.DayLocator(interval=3))
ax.set_ylabel("number of tweets")
for side in ('left', 'top', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
ax.legend(loc='upper left')
ax.grid(False)
ax.set_facecolor('#f3f3f3ff')

# To export the figure, uncomment (the former string-literal version passed
# `papertype`/`frameon`, which current matplotlib no longer accepts, and was
# followed by a lone `;` line, which is a syntax error):
# fig.savefig("sentiment_timeline_EN.png", transparent=True,
#             bbox_inches='tight', pad_inches=0.1)
plt.show()

In [None]:
# Independent copy of `data`; the cells below add columns to `df`.
df = data.copy()

## airline mention frequency

In [None]:
def create_columns_with_airlines(df):
    """
    Add one 0/1 indicator column per airline handle to `df`, in place.

    For every handle in the list below, a column of that name is set to 1
    where the lower-cased tweet text contains the handle as a substring and
    to 0 otherwise.

    df: DataFrame with a "text" column of strings; it is modified in place.
    returns: None
    """
    list_of_airlines = ["americanair", "delta", "jetblue", "southwestair", "united", "usairways", "virginamerica"]
    # Lower-case the text once instead of once per airline and row.
    lowered_text = df["text"].str.lower()
    for airline in list_of_airlines:
        # Vectorised substring test. It replaces the row-by-row loop built on
        # DataFrame.set_value, which was removed in pandas 1.0. Missing text
        # counts as "not mentioned" (na=False).
        df[airline] = lowered_text.str.contains(airline, regex=False, na=False).astype(int)
# Adds the seven airline indicator columns to `df` in place.
create_columns_with_airlines(df)

In [None]:
airline_columns = ['americanair', 'delta', 'jetblue', 'southwestair',
                   'united', 'usairways', 'virginamerica']

# Index by creation time so the frame can be resampled into daily buckets.
df = df.reset_index().set_index('tweet_created')
# Per day, sum each airline's indicator column.
airlines_day = df.resample('D').apply({column: 'sum' for column in airline_columns})

In [None]:
# Long format: one row per (airline, day) pair with its summed indicator.
airlines_day_unstacked = (
    airlines_day.unstack()
    .reset_index()
    .rename(columns={'level_0': 'airline', 0: 'count'})
)

In [None]:
# remove zeros
# Only the "count" column is tested; the former version compared every column
# (airline names and timestamps included) against 0. `.copy()` gives an
# independent frame, so assigning a column to it later does not raise
# SettingWithCopyWarning.
airlines_day_unstacked = airlines_day_unstacked.loc[airlines_day_unstacked["count"] != 0].copy()

In [None]:
# Render each day as a plain 'YYYY-MM-DD' string.
airlines_day_unstacked["tweet_created"] = airlines_day_unstacked["tweet_created"].dt.strftime('%Y-%m-%d')

In [None]:
# http://jose-coto.com/styling-with-seaborn
import seaborn as sns

# Keyword arguments: DataFrame.pivot no longer accepts index/columns/values
# positionally in pandas 2.0+.
pivot = airlines_day_unstacked.pivot(index="airline", columns="tweet_created", values="count")

# A single sns.set call: every call re-applies seaborn's default style first,
# so with three separate calls the facecolors passed to the first one were
# reset by the following ones.
sns.set(rc={'axes.facecolor': '#f3f3f3ff', 'figure.facecolor': '#f3f3f3ff',
            'figure.figsize': (25, 10),
            'axes.labelsize': 14, 'legend.fontsize': 14, 'axes.titlesize': 14,
            'xtick.labelsize': 14, 'ytick.labelsize': 14})
fig, ax = plt.subplots(figsize=(25, 10))

cmap2 = sns.cubehelix_palette(light=1, as_cmap=True)  # alternative palette, not used below
cmap1 = mpl.colors.ListedColormap(sns.color_palette("pink_r", 100))
sns.heatmap(pivot, cmap=cmap1, ax=ax)
ax.set_ylabel('')
ax.set_xlabel('')
ax.tick_params(axis='y', labelsize=20)

# To export the figure, uncomment (the former string-literal version passed
# `papertype`/`frameon`, which current matplotlib no longer accepts, and was
# followed by a lone `;` line, which is a syntax error):
# fig.savefig("airlines_by_day_EN.png", transparent=True,
#             bbox_inches='tight', pad_inches=0.1)
plt.show()

## airline content

In [None]:
#select only that day
# Half-open interval [2015-02-23, 2015-02-24): the former `> 00:00:00` bound
# dropped a tweet stamped exactly at midnight, and `<= 23:59:59` dropped any
# timestamp with fractional seconds in the last second of the day.
day_start, day_end = "2015-02-23", "2015-02-24"
mask = (data['tweet_created'] >= day_start) & (data['tweet_created'] < day_end)
data_2015_02_23 = data.loc[mask]

In [None]:
#select American
is_american_row = data_2015_02_23["airline"] == "American"
data_2015_02_23_american = data_2015_02_23[is_american_row]

In [None]:
# Display tweet texts in full instead of truncating them.
try:
    # Current pandas rejects -1 here; None is its "no limit" value.
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # pandas < 1.0 only accepts integers here; -1 was its "no limit" value.
    pd.set_option('display.max_colwidth', -1)
data_2015_02_23_american_tweets = data_2015_02_23_american["text"]

In [None]:
wc = {'width': 600, 'height': 300, 'random_state': 0}
# Join with a space: with an empty separator the last word of each tweet was
# fused with the first word of the next one.
wordcloud = WordCloud(**wc).generate(' '.join(data_2015_02_23_american_tweets))
plt.imshow(wordcloud)
plt.axis("off");

In [None]:
# Turn the text Series into a DataFrame with columns "index" and "text".
# It is built from the source slice rather than from its own previous value,
# so re-running the cell no longer fails with "cannot insert index, already
# exists".
data_2015_02_23_american_tweets = data_2015_02_23_american["text"].reset_index()

In [None]:
# Five most frequent whitespace-separated tokens of the tweet texts.
raw_tokens = data_2015_02_23_american_tweets["text"].str.split(expand=True).stack()
raw_tokens.value_counts().head()

In [None]:
# NOTE(review): in the visible cells this frame is built from the "text"
# column only, so "text_2" must come from state not shown here - confirm.
# Join with a space: with an empty separator the last word of each tweet was
# fused with the first word of the next one.
wordcloud = WordCloud(**wc).generate(' '.join(data_2015_02_23_american_tweets['text_2']))
plt.imshow(wordcloud)
plt.axis("off");

In [None]:
# Five most frequent tokens of the "text_2" column.
# NOTE(review): "text_2" is not added to this frame in the visible cells - confirm.
clean_tokens = data_2015_02_23_american_tweets["text_2"].str.split(expand=True).stack()
clean_tokens.value_counts().head()

In [None]:
# Concatenate the tweets themselves. `Series.to_string()` renders the printed
# representation, i.e. it also puts the row labels and alignment padding into
# the text that is fed to the word cloud.
text = " ".join(data_2015_02_23_american_tweets["text_2"].dropna().astype(str))
text2 = text.replace("americanair", "").replace("flight", "")

def generate_wordcloud(text, save_path=None): # optionally add: stopwords=STOPWORDS and change the arg below
    """
    Render a word cloud of `text` on the current matplotlib figure and show it.

    text: the string whose words are drawn.
    save_path: optional file name; when given, the figure is written there
               before it is shown. Replaces the savefig call that used to sit
               in a no-op string literal inside the function.
    returns: None
    """
    # Further WordCloud options that were tried: relative_scaling=0.75,
    # stopwords={'to', 'of'} (set or space-separated string), color_func.
    wordcloud = WordCloud(
        background_color='#f3f3f3ff',
        colormap="inferno_r",
        width=800, height=400,
    ).generate(text)
    plt.imshow(wordcloud)
    plt.axis("off")

    if save_path is not None:
        # Must happen before plt.show(), which releases the inline figure.
        plt.savefig(save_path, transparent=True, bbox_inches='tight',
                    pad_inches=0.1, dpi=500)

    plt.show()

# Pass save_path="EN_2015_02_23_american_tweets_without_strings_american_flights.png" to export.
generate_wordcloud(text2)

In [None]:
# Texts of that day's American tweets whose sentiment is "negative".
is_negative = data_2015_02_23_american["airline_sentiment"] == "negative"
data_2015_02_23_american_neg = data_2015_02_23_american.loc[is_negative, "text"]

In [None]:
# Concatenate the tweets themselves. `Series.to_string()` renders the printed
# representation, i.e. it also puts the row labels and alignment padding into
# the text that is fed to the word cloud.
text3 = " ".join(data_2015_02_23_american_neg.dropna().astype(str)).lower()
text4 = text3.replace("americanair", "").replace("flight", "")

# `generate_wordcloud` is defined once, in the earlier word-cloud cell; the
# second definition that lived here passed the same WordCloud arguments and
# only shadowed it.
# Former export name for this figure:
# "EN_2015_02_23_neg_american_tweets_without_strings_american_flights.png"
generate_wordcloud(text4)

## Is there a correlation between airline and sentiment?

In [None]:
def convert_airline_sentiment(df):
    """
    Add an integer column "airline_sentiment_coded" to `df`, in place.

    Coding: "negative" -> 0, "neutral" -> 1, every other value (including
    missing ones) -> 2.

    df: DataFrame with an "airline_sentiment" column; it is modified in place.
    returns: None
    """
    codes = {"negative": 0, "neutral": 1}
    # Vectorised lookup. It replaces the row-by-row loop built on
    # DataFrame.set_value, which was removed in pandas 1.0. Values that are
    # not in `codes` come back as NaN from map() and fall into the
    # catch-all class 2.
    df["airline_sentiment_coded"] = df["airline_sentiment"].map(codes).fillna(2).astype(int)

# Adds the "airline_sentiment_coded" column to `df` in place.
convert_airline_sentiment(df)

In [None]:
# Number of tweets per sentiment code (0 = negative, 1 = neutral, 2 = anything else).
df["airline_sentiment_coded"].value_counts()

In [None]:
# Correlation of each airline indicator column with the coded sentiment,
# printed as "<label>: <value>".
labelled_columns = [
    ("american", "americanair"),
    ("delta", "delta"),
    ("jetblue", "jetblue"),
    ("southwestair", "southwestair"),
    ("united", "united"),
    ("usairways", "usairways"),
    ("virginamerica", "virginamerica"),
]
for printed_name, column in labelled_columns:
    print(printed_name + ": " + str(df[column].corr(df['airline_sentiment_coded'])))

In [None]:
# https://towardsdatascience.com/simple-and-multiple-linear-regression-in-python-c928425168f9
import statsmodels.api as sm # import statsmodels

predictor_columns = ["americanair", "delta", "jetblue", "southwestair",
                     "united", "usairways", "virginamerica"]
y = df["airline_sentiment_coded"]
X = df[predictor_columns]

# NOTE(review): no constant column is added to X, so this fit presumably has
# no intercept - confirm that this is intended.
model = sm.OLS(y, X).fit()
predictions = model.predict(X)

model.summary()

In [None]:
from sklearn import linear_model

# Single-predictor fit; the full list of airline columns was tried before:
#X = df[["americanair", "delta", "jetblue", "southwestair", "united", "usairways", "virginamerica"]]
y = df["airline_sentiment_coded"]
X = df[["southwestair"]]

lm = linear_model.LinearRegression()
model = lm.fit(X, y)
predictions = lm.predict(X)

# Score of the fit on the data it was trained on.
lm.score(X, y)

## How does word use for one airline compare to the others?

## Americanair

In [None]:
# Split the tweets into American and all other airlines.
is_american = df["airline"] == "American"
df_americanair = df.loc[is_american]
df_NOT_americanair = df.loc[~is_american]

In [None]:
# Word frequencies in the stopword-free text, for American and for the rest.
# The columns are named through rename_axis / reset_index(name=...) instead of
# renaming the default labels 'index' and 0: since pandas 2.0 value_counts()
# names its result "count", so the former rename no longer matched and the
# "*_value" columns were missing in the following cells.
americanair_values = (
    df_americanair["text_3"].str.split(expand=True).stack().value_counts()
    .rename_axis('americanair_word')
    .reset_index(name='americanair_value')
)
NOT_americanair_values = (
    df_NOT_americanair["text_3"].str.split(expand=True).stack().value_counts()
    .rename_axis('NOT_americanair_word')
    .reset_index(name='NOT_americanair_value')
)

In [None]:
# Left join: keep every American word and attach its count among the other
# airlines (NaN when the word does not occur there).
americanair_merged = americanair_values.merge(
    NOT_americanair_values,
    how='left',
    left_on='americanair_word',
    right_on='NOT_americanair_word',
).drop(columns="NOT_americanair_word")

In [None]:
# Min-max scale both count columns to [0, 1] so they share one axis range.
own_counts = americanair_merged["americanair_value"]
other_counts = americanair_merged["NOT_americanair_value"]

americanair_merged["americanair_value_norm"] = \
    (own_counts - own_counts.min()) / (own_counts.max() - own_counts.min())

americanair_merged["NOT_americanair_value_norm"] = \
    (other_counts - other_counts.min()) / (other_counts.max() - other_counts.min())

del americanair_merged["americanair_value"]
del americanair_merged["NOT_americanair_value"]
# Fixed: the call used to read `columns=\{...`, which is a syntax error
# (a backslash may only be followed by a line break).
americanair_merged.rename(columns={'americanair_value_norm': 'Americanair',
                                   'NOT_americanair_value_norm': 'all_others'}, inplace=True)

In [None]:
%matplotlib inline
americanair_merged_top200 = americanair_merged.sort_values(["Americanair"], ascending = False).head(200)

In [None]:
# Americanair focused
import matplotlib

# Top 10 words: normalised frequency for American (x) vs. all others (y).
top10 = americanair_merged_top200.head(10)
x = top10["Americanair"]
y = top10["all_others"]
label = top10["americanair_word"]

fig, ax = plt.subplots()
ax.set_facecolor('#f3f3f3ff')
xy_line = (0, 1)
ax.scatter(x, y, c = "red", alpha = 0.5)
ax.set_xlabel("Americanair")
ax.set_ylabel("all other airlines")

# Iterate over the values themselves. The former `x[i]` / `y[i]` looked rows
# up by index *label*, which only matches the position while the sorted frame
# happens to keep the labels 0..9 in front.
for txt, x_value, y_value in zip(label, x, y):
    ax.annotate(txt, (x_value, y_value))

# Diagonal from (0, 0) to (1, 1): words on it are equally frequent in both groups.
ax.plot(xy_line, xy_line, c = "grey")
for side in ('left', 'top', 'right', 'bottom'):
    ax.spines[side].set_visible(False)

# To export the figure, uncomment:
# fig.savefig("americanair_words_scatter_top10_@.png", transparent=True,
#             bbox_inches='tight', pad_inches=0.1, dpi=700)
plt.show();

## United

In [None]:
df_united = df.loc[df.airline == "United"]
df_NOT_united = df.loc[df.airline != "United"]

united_values = df_united.text_3.str.split(expand=True).stack().value_counts()\
    .reset_index().rename(columns={'index': 'united_word', 0:'united_value'})

NOT_united_values = df_NOT_united.text_3.str.split(expand=True).stack().value_counts()\
    .reset_index().rename(columns={'index': 'NOT_united_word', 0:'NOT_united_value'})
    
united_merged = pd.merge(united_values, NOT_united_values, \
                             left_on='united_word', right_on='NOT_united_word', how = 'left')

del united_merged["NOT_united_word"]

united_merged["united_value_norm"] = \
    (united_merged["united_value"] - united_merged["united_value"].min()) / \
    (united_merged["united_value"].max()-united_merged["united_value"].min())

united_merged["NOT_united_value_norm"] = \
    (united_merged["NOT_united_value"] - united_merged["NOT_united_value"].min()) / \
    (united_merged["NOT_united_value"].max()-united_merged["NOT_united_value"].min())
    
del united_merged["united_value"]
del united_merged["NOT_united_value"]

united_merged.rename(columns={'united_value_norm': 'united', 'NOT_united_value_norm': 'all_others'}, inplace=True)

In [None]:
# The 200 rows with the highest normalised frequency for United ...
united_merged_top200 = united_merged.sort_values(["united"], ascending = False).head(200)
# ... and for all other airlines. head() without an argument returned only
# 5 rows, which contradicted the variable name.
united_all_others_merged_top200 = united_merged.sort_values(["all_others"], ascending = False).head(200)

In [None]:
# united focused
# Top 10 words: normalised frequency for United (x) vs. all others (y).
top10 = united_merged_top200.head(10)
x = top10["united"]
y = top10["all_others"]
label = top10["united_word"]

fig, ax = plt.subplots()
ax.set_facecolor('#f3f3f3ff')

xy_line = (0, 1)
ax.scatter(x, y, c = "red", alpha = 0.5)
ax.set_xlabel("United airlines")
ax.set_ylabel("all other airlines")

# Iterate over the values themselves. The former `x[i]` / `y[i]` looked rows
# up by index *label*, which only matches the position while the sorted frame
# happens to keep the labels 0..9 in front.
for txt, x_value, y_value in zip(label, x, y):
    ax.annotate(txt, (x_value, y_value))

# Diagonal from (0, 0) to (1, 1): words on it are equally frequent in both groups.
ax.plot(xy_line, xy_line, c = "grey")

for side in ('left', 'top', 'right', 'bottom'):
    ax.spines[side].set_visible(False)

# To export the figure, uncomment:
# fig.savefig("united_words_scatter_top10_@.png", transparent=True,
#             bbox_inches='tight', pad_inches=0.1, dpi=700)
plt.show();