In [None]:
import glob
import pandas as pd
import os
import matplotlib.pyplot as plt
import datetime
import re
from textblob import TextBlob


In [None]:
# Every cleaned per-fire tweet CSV, sorted lexicographically by path so the
# file order is stable from run to run.
fire_csv_list = sorted(glob.iglob('../../tweets/megafires/clean_csvs/*.csv'))

In [None]:
# One row per fire CSV.  File names look like "<year>-<fire-name>.csv":
#   origin_csv -> the file's basename (used later as the merge key)
#   name       -> everything after the first '-' with the extension removed
#   year       -> everything before the first '-' (kept as a string)
base_df = pd.DataFrame(None, index=range(len(fire_csv_list)), columns=[])
base_df['origin_csv'] = list(map(os.path.basename, fire_csv_list))
base_df['name'] = [
    os.path.basename(os.path.splitext(csv_path)[0]).partition('-')[2]
    for csv_path in fire_csv_list
]
base_df['year'] = [
    os.path.basename(csv_path).partition('-')[0]
    for csv_path in fire_csv_list
]

In [None]:
def read_df_timestamps(csv_path):
    """Load a tweet CSV and parse its ``Timestamp`` column into datetimes.

    Rows whose timestamp is not of the form ``YYYY-MM-DD HH:MM:SS UTC``
    (including missing values) are dropped instead of raising.

    Parameters
    ----------
    csv_path : str
        Path to a CSV file containing a ``Timestamp`` column.

    Returns
    -------
    pandas.DataFrame
        Only the rows with a parseable timestamp, keeping their original
        index labels; ``Timestamp`` holds ``datetime64`` values.
    """
    df = pd.read_csv(csv_path)
    # Cast to str first so missing values (NaN) simply fail the suffix check
    # rather than breaking the ``.str`` accessor on a non-string column.
    valid_ts = df['Timestamp'].astype(str).str.endswith('UTC')
    # Take an explicit copy: assigning a column on a filtered frame is what
    # triggers pandas' SettingWithCopyWarning.
    df = df.loc[valid_ts].copy()
    df['Timestamp'] = pd.to_datetime(
        df['Timestamp'], format='%Y-%m-%d %H:%M:%S UTC', errors='coerce')
    # A value can end in "UTC" and still be malformed; coerce turned those
    # into NaT, so drop them here as invalid timestamps too.
    return df.dropna(subset=['Timestamp'])

def basic_plot(df, target_col, topn=50):
    """Bar-plot the ``topn`` largest values of ``target_col``, one bar per fire.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``name`` column (used as bar labels) and ``target_col``.
    target_col : str
        Column to rank by (descending) and plot; also used as the y-axis label.
    topn : int or None, default 50
        Maximum number of bars to draw.  ``None`` plots every row.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(60, 20), facecolor='white')
    plt.tick_params(axis='both', which='major')
    ranked = df.sort_values(target_col, ascending=False)
    if topn is not None:
        # head() keeps exactly ``topn`` rows (the previous 0:(topn+1) slice
        # kept one too many) and is a no-op when the frame is shorter.
        ranked = ranked.head(topn)
    plt.bar(ranked['name'], ranked[target_col])
    plt.xticks(fontsize=45, rotation=90)
    plt.yticks(fontsize=40)
    plt.ylabel(target_col, fontsize=60)

    plt.show()

# Smoke (and other special terms) mentions

In [None]:
def count_smoke_tweets(csv_path):
    """Print how many tweets in a CSV mention "smoke", then print those tweets.

    Parameters
    ----------
    csv_path : str
        Path to a tweet CSV readable by ``read_df_timestamps``; it must have
        a ``Text`` column.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    df = read_df_timestamps(csv_path)
    # na=False: a tweet with missing text counts as "no match".  Without it
    # the mask contains NaN and the boolean indexing below raises.
    smoke_mask = df.Text.str.contains(r'(?i)smoke', na=False)
    print('Total Tweets: {}'.format(df.shape[0]))
    print('Total Smoke Tweets: {}'.format(smoke_mask.sum()))
    print(df.Text[smoke_mask])
    
def basic_term_counts(df):
    """Count tweets that mention smoke, haze or evacuation terms.

    Matching is case-insensitive and anchored at the start of a word:
    ``smoke...``, ``hazy``/``hazi...`` and ``evac...``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Text`` column of tweet strings.  Missing text is
        treated as not mentioning any term.

    Returns
    -------
    dict
        ``total_tweets`` plus, for each term in (``smoke``, ``haze``,
        ``evac``), ``<term>_tweets`` (int count) and ``<term>_tweet_frac``
        (count / total; ``nan`` when the frame has no rows).
    """
    # The haze alternation is a non-capturing group: a capturing group makes
    # pandas emit a "has match groups" UserWarning from str.contains.
    term_regex_dict = {'smoke': r'(?i)\bsmoke',
                       'haze': r'(?i)\bhaz(?:y|i)',
                       'evac': r'(?i)\bevac'}
    total = df.shape[0]
    out_dict = {'total_tweets': total}
    for term, pattern in term_regex_dict.items():
        n_matches = int(df['Text'].str.contains(pattern, na=False).sum())
        out_dict['{}_tweets'.format(term)] = n_matches
        # Guard the empty frame explicitly instead of dividing by zero.
        out_dict['{}_tweet_frac'.format(term)] = (
            n_matches / total if total else float('nan'))

    return out_dict

def get_term_counts(csv_list):
    """Build a table of term counts with one row per tweet CSV.

    Each path is loaded with ``read_df_timestamps`` and summarised with
    ``basic_term_counts``; the file's basename is stored under
    ``origin_csv`` so the rows can be joined to other per-fire tables.

    Parameters
    ----------
    csv_list : iterable of str
        Paths to tweet CSV files.

    Returns
    -------
    pandas.DataFrame
        One row per input path.
    """
    records = [
        {**basic_term_counts(read_df_timestamps(csv_path)),
         'origin_csv': os.path.basename(csv_path)}
        for csv_path in csv_list
    ]
    return pd.DataFrame(records)

def plot_smoke_tweets_time(csv_path):
    """Plot daily tweet volume and daily smoke-tweet volume for one fire.

    Total tweets per calendar day are drawn in blue against the left y-axis;
    tweets whose text contains "smoke" (case-insensitive) are drawn in red
    against a second y-axis on the right.

    Parameters
    ----------
    csv_path : str
        Path to a tweet CSV readable by ``read_df_timestamps``; it must have
        a ``Text`` column.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    df = read_df_timestamps(csv_path)
    fig, ax1 = plt.subplots(figsize=(20, 10), facecolor='white')
    tweet_counts = df['Timestamp'].groupby(df['Timestamp'].dt.date).count()
    # Axes.plot accepts date values on x directly; Axes.plot_date is
    # deprecated in recent Matplotlib releases.
    ax1.plot(tweet_counts.index.values, tweet_counts.values, 'b-')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Total Tweets', color='b')
    ax1.tick_params('y', colors='b')

    ax2 = ax1.twinx()
    # na=False: tweets with missing text are not smoke tweets (a NaN in the
    # mask would make the boolean indexing below raise).
    smoke_mask = df.Text.str.contains(r'(?i)smoke', na=False)
    smoke_df = df[smoke_mask]
    smoke_tweet_counts = smoke_df['Timestamp'].groupby(
        smoke_df['Timestamp'].dt.date
        ).count()
    ax2.plot(smoke_tweet_counts.index.values, smoke_tweet_counts.values, 'r-')
    ax2.set_ylabel('Smoke Tweets', color='r')
    ax2.tick_params('y', colors='r')
    plt.show()

In [None]:
# Per-fire term counts, joined onto the fire attributes by source file name.
terms_df = get_term_counts(fire_csv_list)
atts_df = pd.merge(base_df, terms_df, on='origin_csv')

In [None]:
# Fires ranked by the fraction of their tweets that mention smoke.
basic_plot(atts_df, target_col='smoke_tweet_frac', topn=10)

In [None]:
# Daily tweet volume vs. smoke mentions for the 2012 Waldo Canyon fire.
# Uses the same relative data directory as the CSV glob earlier in the
# notebook instead of a machine-specific absolute path.
plot_smoke_tweets_time('../../tweets/megafires/clean_csvs/2012-waldo-canyon-fire-co.csv')

# NLP Cleanup
With help from: https://www.earthdatascience.org/courses/earth-analytics-python/get-data-using-apis/calculate-tweet-word-frequencies-sentiments-in-python/

In [None]:
# Matches either a whole ``scheme://...`` URL (up to the next whitespace) or
# a single character that is not an ASCII letter, digit or whitespace.
# Written as a raw string so the backslashes reach the regex engine unchanged.
_URL_OR_SYMBOL_RE = re.compile(r"[^0-9A-Za-z\s]|\w+://\S+")


def remove_url(txt):
    """Strip URLs and non-alphanumeric characters from a text string.

    Besides URLs, every character that is not an ASCII letter, digit or
    whitespace is removed (punctuation, ``#``/``@``, emoji, accented
    letters).  Runs of whitespace, including newlines, are collapsed to
    single spaces and leading/trailing whitespace is dropped.

    Parameters
    ----------
    txt : string
        A text string that you want to parse and remove urls.  Non-string
        input (e.g. ``None`` or ``NaN`` from a missing tweet) is treated as
        empty text.

    Returns
    -------
    The cleaned string; ``""`` for non-string input.
    """
    if not isinstance(txt, str):
        return ""
    # Newlines are kept as whitespace here (and normalised by split/join) so
    # that words on adjacent lines are not glued together.
    return " ".join(_URL_OR_SYMBOL_RE.sub("", txt).split())

def clean_tweet_text(df, lower_case=True):
    """Add a ``clean_text`` column holding a cleaned copy of ``Text``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Text`` column.  The frame is modified in place (the
        ``clean_text`` column is added or overwritten).
    lower_case : bool, default True
        Lowercase the text before cleaning.  Pass ``False`` to keep the
        original casing.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    # Missing tweets become empty strings so the cleaner always gets a str.
    text = df['Text'].fillna('')
    if lower_case:
        text = text.str.lower()
    df['clean_text'] = text.apply(remove_url)
    return df

In [None]:
# Preview the cleaned text for one fire.  head() keeps the cell output short,
# and the path uses the same relative data directory as the CSV glob earlier
# in the notebook instead of a machine-specific absolute path.
clean_tweet_text(
    read_df_timestamps('../../tweets/megafires/clean_csvs/2010-four-mile-canyon-fire-co.csv')
).head()

# Sentiment

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def add_sentiment(df):
    """Score each cleaned tweet with VADER and TextBlob.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``clean_text`` column of strings.  The frame is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with two added columns: ``vader_sentiment`` (VADER's
        ``compound`` score) and ``textblob_sentiment`` (TextBlob's
        ``polarity``).
    """
    analyzer = SentimentIntensityAnalyzer()

    def _vader_compound(text):
        return analyzer.polarity_scores(text)['compound']

    def _textblob_polarity(text):
        return TextBlob(text).sentiment.polarity

    df['vader_sentiment'] = list(map(_vader_compound, df['clean_text']))
    df['textblob_sentiment'] = list(map(_textblob_polarity, df['clean_text']))
    return df

def get_mean_sentiment(csv_list):
    """Compute the mean tweet sentiment for each CSV.

    Every file is loaded with ``read_df_timestamps``, cleaned with
    ``clean_tweet_text`` and scored with ``add_sentiment``.

    Parameters
    ----------
    csv_list : iterable of str
        Paths to tweet CSV files.

    Returns
    -------
    pandas.DataFrame
        One row per input path with ``sent_mean_vader``,
        ``sent_mean_textblob`` and ``origin_csv`` (the file's basename).
    """
    def _summarise(csv_path):
        scored = add_sentiment(clean_tweet_text(read_df_timestamps(csv_path)))
        return {
            'sent_mean_vader': scored['vader_sentiment'].mean(),
            'sent_mean_textblob': scored['textblob_sentiment'].mean(),
            'origin_csv': os.path.basename(csv_path),
        }

    return pd.DataFrame([_summarise(csv_path) for csv_path in csv_list])

In [None]:
# Sentiment is computed on a small sample only: the first ten CSVs plus the
# 2013 Yarnell Hill fire.
short_list = fire_csv_list[:10] + [
    '../../tweets/megafires/clean_csvs/2013-yarnell-hill-az.csv'
]
full_sent_df = get_mean_sentiment(short_list)

In [None]:
# Join mean sentiment onto the fire attributes, then rank fires by mean VADER score.
atts_sent_df = pd.merge(base_df, full_sent_df, on='origin_csv')
basic_plot(atts_sent_df, target_col='sent_mean_vader')