In [None]:
# basic use libraries
import re
import numpy as np
import datetime
import pandas as pd

# for dashboard
import panel as pn
import param

# for scraping the web
from bs4 import BeautifulSoup
import requests
from twitterscraper import query_tweets
import twitterscraper

# for visualizations
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
from wordcloud import WordCloud

# file management
import csv
import json
import subprocess
import shutil

#for initial time-series modeling
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose as sd

In [None]:
# Facebook Prophet libraries
from fbprophet import Prophet
from fbprophet.plot import plot_plotly, plot_cross_validation_metric
from fbprophet.diagnostics import cross_validation, performance_metrics
import plotly.offline as py
pd.plotting.register_matplotlib_converters() #necessary to maintain pd.plotting functionality

# for NLP
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from textblob import TextBlob 

# Functions

In [None]:
def make_dates_list(year=2018):
    """
    Build the calendar of a given year in two string formats.

    Returns a tuple of two equally long lists covering January 1st through
    December 31st of `year`:
      * dates as 'YYYY-MM-DD'
      * the same dates as 'YYYYMMDD' (used for generating file names)
    """
    days = pd.date_range(start=f'1/1/{year}', end=f'12/31/{year}')
    iso_dates = list(days.strftime('%Y-%m-%d'))
    compact_dates = list(days.strftime('%Y%m%d'))
    return iso_dates, compact_dates

In [None]:
# Define functions to clean tweets and get tweet sentiment

# portions of the code below comes from :
# https://towardsdatascience.com/extracting-twitter-data-pre-processing-and-sentiment-analysis-using-python-3-0-7192bd8b47cf
def replace_emoticons(tweet):
    """
    Replace happy and sad emoticons in `tweet` with the words 'HAPPY' and 'SAD'.

    Relies on the module-level collections `emoticons_happy` and
    `emoticons_sad` (assumed to be iterables of emoticon strings such as
    ':)' -- they are defined outside this cell; TODO confirm).

    Returns the tweet text with every emoticon replaced by ' HAPPY ' or ' SAD '.
    """
    def _alternation(emoticons):
        # Match whole emoticons, not individual characters: a character class
        # built from the joined emoticons would also hit every lone ':' or 'D'.
        # Longest first so that e.g. ':-))' is not consumed as ':-)' + ')'.
        ordered = sorted((e for e in emoticons if e), key=lambda e: (-len(e), e))
        return '|'.join(re.escape(e) for e in ordered)

    for emoticons, word in ((emoticons_happy, ' HAPPY '), (emoticons_sad, ' SAD ')):
        pattern = _alternation(emoticons)
        if pattern:  # an empty pattern would match at every position
            # re.sub returns a new string; the result must be kept.
            tweet = re.sub(pattern, word, tweet)
    return tweet

def clean_tweet(tweet):
    '''
    Utility function to clean tweet text by removing links, usernames, and
    special characters using simple regex statements.

    Emoticons are first converted to the words HAPPY / SAD (see
    `replace_emoticons`), then every @mention, URL and remaining character
    that is not a letter, digit, space or tab is replaced by a space, and
    runs of whitespace are collapsed to single spaces.
    '''
    tweet = replace_emoticons(tweet)
    # One raw-string pattern on a single line: a backslash-newline inside the
    # literal would splice the source indentation into the regex and stop the
    # special-character alternative from matching.
    junk = r"(@[A-Za-z0-9]+)|(\w+:\/\/\S+)|([^0-9A-Za-z \t])"
    return ' '.join(re.sub(junk, " ", tweet).split())

def double_clean_tweet(tweet):
    """
    Second cleaning pass, applied after `clean_tweet`.

    Replaces, in order, each colon, each occurrence of the literal sequence
    in the second pattern below, and each run of non-ASCII characters with a
    single space; then strips whatever the module-level `emoji_pattern`
    matches (defined outside this cell).  Emojis are dropped rather than
    mapped to HAPPY/SAD; doing that would require sorting the Unicode emoji
    charts into "happy" and "sad" sets for `replace_emoticons()`.
    """
    for pattern in (r':', r'‚Ä¶', r'[^\x00-\x7F]+'):
        tweet = re.sub(pattern, ' ', tweet)
    return emoji_pattern.sub(r'', tweet)


# Sentiment analysis code below adapted from:
# https://www.geeksforgeeks.org/twitter-sentiment-analysis-using-python/
def get_tweet_sentiment(tweet):
    '''
    Classify the sentiment of a tweet with TextBlob.

    Returns a tuple (sentiment, polarity, subjectivity) where `sentiment` is
    'positive' when polarity > 0.1, 'negative' when polarity < -0.1 and
    'neutral' otherwise; polarity and subjectivity are the values reported
    by TextBlob's `sentiment` attribute.
    '''
    blob = TextBlob(tweet)
    polarity = blob.sentiment.polarity
    subjectivity = blob.sentiment.subjectivity

    # Guard clauses: anything inside the +/-0.1 band is treated as neutral.
    if polarity > 0.1:
        return 'positive', polarity, subjectivity
    if polarity < -0.1:
        return 'negative', polarity, subjectivity
    return 'neutral', polarity, subjectivity

In [None]:
# define a function to create a .CSV file that compiles the relevant 
# info from the JSONs, preprocesses the tweets, and performs sentiment analysis
def json_to_csv_tweets(output_filename='output.csv', year=2018):
    """
    Takes in JSON files of scraped tweets from the `./data/` folder,
    cleans the tweets, performs sentiment analysis, and then outputs
    the results to the provided destination CSV filename.

    One file per calendar day of `year` is expected, named
    `./data/tYYYYMMDD.json`, each holding a list of objects with a "text"
    key.  Every tweet becomes one CSV row:
    timestamp (YYYYMMDD), cleaned text, sentiment, polarity, subjectivity, 1.

    Raises whatever `open` / `json.load` raise if a day's file is missing
    or malformed.
    """
    _, dates_stripped_year = make_dates_list(year)

    # Context managers guarantee both handles are flushed and closed, even
    # if a day's JSON file is missing or a tweet fails to process.
    with open(output_filename, 'w', newline='', encoding='utf-8') as out_file:
        csvwriter = csv.writer(out_file)
        csvwriter.writerow(["timestamp", "text", "sentiment", "polarity", "subjectivity", "tally"])

        for day in dates_stripped_year:
            json_path = f'./data/t{day}.json'
            with open(json_path, encoding='utf-8') as in_file:
                data = json.load(in_file)
            for tweet in data:
                # clean_tweet() already runs replace_emoticons(), so it is
                # not called separately here.
                tw = clean_tweet(tweet["text"])
                tw = double_clean_tweet(tw)
                sentiment, polarity, subjectivity = get_tweet_sentiment(tw)
                csvwriter.writerow([day, tw, sentiment, polarity, subjectivity, 1])
            # Light progress report: only dates whose YYYYMMDD number is a
            # multiple of 20.
            if int(day) % 20 == 0:
                print(f"Finished working with:   ./data/t{day}.json")
    print("JOB IS COMPLETELY FINISHED.  HOORAY!!")

In [None]:
# Define functions to scrape and clean financial data for a given year
def fetch_data(symbol, object_type, stocks_apikey):
    """
    Request the daily history of `symbol` from the Alpha Vantage query API.

    symbol         (string) stock, currency or cryptocurrency symbol
    object_type    (string) 'currency' -> FX_DAILY against USD,
                            'cryptocurrency' -> DIGITAL_CURRENCY_DAILY in USD,
                            anything else -> TIME_SERIES_DAILY
    stocks_apikey  (string) API key sent with the request

    Prints the HTTP status code (and a confirmation on success) and returns
    the `requests` response object without inspecting its body.
    """
    # Every branch uses the `stocks_apikey` argument; the function must not
    # depend on a notebook-level global holding the key.
    if object_type == 'currency':
        credentials = {'function':'FX_DAILY',
                       'from_symbol':symbol,
                       'to_symbol':'USD',
                       'outputsize':'full',
                       'apikey':stocks_apikey}
    elif object_type == 'cryptocurrency':
        credentials = {'function':'DIGITAL_CURRENCY_DAILY',
                       'symbol':symbol,
                       'market':'USD',
                       'apikey':stocks_apikey}
    else:
        credentials = {'function':'TIME_SERIES_DAILY',
                       'symbol':symbol,
                       'outputsize':'full',
                       'apikey':stocks_apikey}

    r = requests.get('https://www.alphavantage.co/query', params=credentials)
    # checking to make sure request was successful
    print(r.status_code)
    if r.status_code == requests.codes.ok:
        print("Request Successful")
    return r


def clean_financials(r, object_type):
    """
    Turn an Alpha Vantage daily-series response into a tidy DataFrame.

    r            response-like object whose `.json()` returns the API payload
    object_type  (string) 'stock', 'index', 'currency' or 'cryptocurrency'

    Returns a DataFrame indexed by `date` with float columns open, high, low,
    close (plus volume for everything except 'currency') and the derived
    columns close_24, high_24, low_24, range, range_24 and change_24.
    Rows keep the order of the payload.

    Raises ValueError for an unknown `object_type` and KeyError when the
    payload does not contain the expected time-series key.
    """
    series_keys = {
        'currency': "Time Series FX (Daily)",
        'cryptocurrency': "Time Series (Digital Currency Daily)",
        'stock': "Time Series (Daily)",
        'index': "Time Series (Daily)",
    }
    if object_type not in series_keys:
        raise ValueError(
            f"Invalid object_type {object_type!r}; expected one of {sorted(series_keys)}")

    # The payload maps date -> {field: value}; transpose so each row is a day.
    df = pd.DataFrame(r.json()[series_keys[object_type]]).T.reset_index()

    if object_type == 'cryptocurrency':
        # Drop the duplicated "b" price columns and the market cap column.
        df = df.drop(columns=["1b. open (USD)","2b. high (USD)","3b. low (USD)",
                              "4b. close (USD)","6. market cap (USD)"])

    value_cols = ['open','high','low','close']
    if object_type != 'currency':
        value_cols = value_cols + ['volume']
    df.columns = ['date'] + value_cols
    df['date'] = pd.to_datetime(df['date'])
    df[value_cols] = df[value_cols].astype(float)

    # create a new column to account for after-hours trading
    # this uses the next day's open value as the prior day's close value:
    # each row takes the `open` of the row above it, and the first row (which
    # has no row above) keeps its own close.
    # NOTE(review): this is only "the next day" if the payload lists the
    # newest date first -- confirm against the API response ordering.
    close_24 = df['open'].shift(1)
    if len(df):
        close_24.iloc[0] = df['close'].iloc[0]
    df['close_24'] = close_24

    # now we must account for when afterhours trading exceeds high/low values
    df['high_24'] = df[['high', 'close_24']].values.max(1)
    df['low_24'] = df[['low', 'close_24']].values.min(1)
    # and add a few more columns that should be useful
    df['range'] = df['high'] - df['low']
    df['range_24'] = df['high_24'] - df['low_24']
    df['change_24'] = df['close_24'] - df['open']
    # setting date column as index to facilitate timeseries manipulation
    return df.set_index('date')
    

def get_financial_data(symbol, object_type, stocks_apikey, year=2018, verbose=True):
    """
    Inputs:
    symbol         (string) Stock or Currency symbol
    object_type    (string) must be one of these:
                      'stock'
                      'index'
                      'currency'
                      'cryptocurrency'
    stocks_apikey  (string) your API key for 
                      https://www.alphavantage.co
    year           (int) the year you wish to examine
    verbose        if True, displays .info() and .head() of data
    =========================================
    Returns a DataFrame of daily financial information containing
    at least opening, closing, high, and low values.  Also draws a line
    plot of the `close_24` column for the selected year.

    For an invalid `object_type` a message is printed and None is returned.
    """
    valid_types = ['stock','index','currency','cryptocurrency']
    if object_type not in valid_types:
        print("""
        Invalid entry for 'object_type', must be one of these strings:
                'stock'
                'index'
                'currency'
                'cryptocurrency'
        """)
        return None

    r = fetch_data(symbol, object_type, stocks_apikey)
    df = clean_financials(r, object_type)    #cleaning data
    # Select the year with a boolean mask on the index: unlike a partial
    # string slice this does not depend on the index being sorted.
    year_df = df[df.index.year == int(year)]
    if verbose:
        # .info() prints its report and returns None, so it must not be
        # handed to display() (that would render a stray "None").
        year_df.info()
        display(year_df.head())
    plt.figure(figsize=(15,5))
    plt.plot(year_df['close_24'])
    plt.title(f"{symbol} Daily Performance for {year}", fontsize=16)
    plt.ylabel("Price (in USD)");
    return year_df

In [None]:
# Define a function to decompose a time series, in order to detect
# trends and seasonality, and allow for examining the residuals
def df_decompose(df):
    """
    Decompose a time series into trend, seasonality and residuals and plot
    the original series together with the three components (four stacked
    panels).

    df   time series accepted by statsmodels' `seasonal_decompose`

    Returns the residual component with missing values dropped.
    """
    # Run the decomposition once and reuse its three components.
    decomposition = sd(df)
    trend = decomposition.trend
    seasonal = decomposition.seasonal
    residual = decomposition.resid

    # Plot gathered statistics
    fig, axes = plt.subplots(4, 1, figsize=(12,8))
    panels = [(df, 'Original'), (trend, 'Trend'),
              (seasonal, 'Seasonality'), (residual, 'Residuals')]
    for ax, (series, label) in zip(axes, panels):
        ax.plot(series, label=label, color="blue")
        ax.legend(loc='best')

    # Title the figure with the series name only; formatting the object
    # itself would paste its whole printed representation into the title.
    series_name = df.name if isinstance(df, pd.Series) else None
    fig.suptitle("Decomposition" if series_name is None
                 else f"Decomposition for {series_name}")
    fig.tight_layout(rect=[0, 0, 1, 0.96])  # leave room for the suptitle

    return residual.dropna()

In [None]:
# let's plot sentiment trends
def plot_sentiments(csv_filename, year=2018, positive_only=False):
    """
    Plot the daily number of tweets per sentiment for one tweets CSV.

    csv_filename   (str) CSV with at least `timestamp` (YYYYMMDD),
                   `sentiment` and `tally` columns
    year           (int) year shown in the plot title
    positive_only  (bool) if True, draw only the 'positive' line and label it
                   with the file name, so that repeated calls for different
                   files can be told apart in one legend

    Draws on the current matplotlib axes; returns nothing.
    """
    tweets_df = pd.read_csv(csv_filename)
    tweets_df['timestamp'] = pd.to_datetime(tweets_df['timestamp'], format='%Y%m%d')
    grouped = tweets_df.groupby(['timestamp', 'sentiment'])['tally'].sum().reset_index()

    sentiments = ['positive'] if positive_only else grouped.sentiment.unique()
    for sentiment in sentiments:
        temp_df = grouped[grouped.sentiment == sentiment].set_index('timestamp')
        if temp_df.empty:
            continue  # nothing to draw for this sentiment in this file
        label = f"{sentiment} ({csv_filename})" if positive_only else sentiment
        temp_df['tally'].plot(figsize=(15,8), label=label)
    plt.ylabel("Number of Tweets by Sentiment\n(~1,000/day total)", fontsize=16)
    plt.title(f"Daily Sentiment at midnight (UTC) in {year}", fontsize=20)
    plt.legend();

In [None]:
# Define functions to automate the process of inspecting lunar trends

def get_lunar_phases(year='2018'):
    """
    Given a year, returns a dataframe of lunar phases, dates, and times (UTC).

    Scrapes the USNO phases page for `year` and reads its table cells in
    pairs: the first cell of each pair is taken as the phase name, the first
    12 characters of the second as the date and its last 5 characters as the
    time.

    Returns a DataFrame with columns `phase`, `date` (datetime) and `time`.
    Raises `requests.HTTPError` if the page request fails.
    """
    url = f"https://aa.usno.navy.mil/cgi-bin/aa_phases.pl?year={year}&nump=65&format=t"
    res_page = requests.get(url)
    # Fail loudly instead of parsing an error page into an empty table.
    res_page.raise_for_status()
    soup = BeautifulSoup(res_page.content, 'html.parser')
    cell_texts = [cell.text for cell in soup.find_all("td")]

    # Build all rows first and create the DataFrame once, rather than
    # growing it one cell at a time under float row labels.
    records = []
    for start in range(0, len(cell_texts), 2):
        record = {'phase': cell_texts[start]}
        if start + 1 < len(cell_texts):
            when = cell_texts[start + 1]
            record['date'] = when[:12]   # need to grab just the beginning of the string
            record['time'] = when[-5:]   # need to grab just the ending of the string
        records.append(record)

    output = pd.DataFrame(records, columns=['phase','date','time'])
    output['date'] = pd.to_datetime(output['date'])
    return output


def lunar_phase_separator(phases_df, lower_window=0, upper_window=1):
    """
    Converts DataFrame of moon phases into FBProphet-friendly format.
    ---------------------
    Inputs:
    phases_df       DataFrame containing lunar phase dates for a given year
                        (the output of the `get_lunar_phases()` function)
    lower_window    (<=0) number of days prior to moon phase to include in 'holiday'
    upper_window    (>=0) number of days after to moon phase to include in 'holiday'
    ---------------------
    Returns:        DataFrame with columns `ds`, `holiday`, `lower_window`
                    and `upper_window`, for use in the 'holiday' parameter.
                    Rows are grouped by phase (full, last quarter, new, first
                    quarter) and each group keeps its own 0-based index.
    """
    per_phase = []
    for phase_name in ('Full Moon', 'Last Quarter', 'New Moon', 'First Quarter'):
        is_phase = phases_df['phase'] == phase_name
        holiday_df = phases_df.loc[is_phase, ['date']].reset_index(drop=True)
        holiday_df.columns = ['ds']
        # Holiday label is the phase name lower-cased with spaces removed,
        # e.g. 'Full Moon' -> 'fullmoon'.
        holiday_df['holiday'] = phase_name.lower().replace(" ", "")
        holiday_df['lower_window'] = lower_window
        holiday_df['upper_window'] = upper_window
        per_phase.append(holiday_df)
    return pd.concat(per_phase)

In [None]:
# Define functions to automate the process of inspecting lunar trends
def prep_data_for_FBP(data, column_name):
    """
    Build the two-column frame FBProphet expects from `data`.

    After resetting the index, `data` must expose a `date` column; that
    column and `column_name` are kept, sorted by date, re-indexed from 0 and
    renamed to `ds` and `y`.  The input frame is not modified.
    """
    ordered = (
        data.reset_index()
            .loc[:, ['date', column_name]]
            .sort_values(by=['date'])
            .reset_index(drop=True)
    )
    ordered.columns = ['ds', 'y']
    return ordered


def cross_val_FBP(model, metric='rmse', show_metric_scores=False):
    """
    Cross-validate a fitted FBProphet model and plot the chosen metric.

    model               fitted Prophet model
    metric              (str) metric name passed to `plot_cross_validation_metric`
    show_metric_scores  if truthy, also display the head of the performance
                        metrics table

    Uses a 90-day initial window, a 15-day period and a 30-day horizon.
    Returns nothing.
    """
    # cross validating using time horizons within the dataset
    df_cv = cross_validation(model, initial='90 days', period='15 days', horizon='30 days')
    if show_metric_scores:
        # The metrics table is only needed for display, so only build it then.
        display(performance_metrics(df_cv).head())
    # plotting performance metrics
    plot_cross_validation_metric(df_cv, metric)


def get_weekends(year='2018'):
    """
    List the weekend days in the 470 days starting on January 1st of `year`.

    Returns a DataFrame with columns `date` and `day_of_week`, where
    `day_of_week` is 5 for Saturday and 6 for Sunday, re-indexed from 0.
    """
    span = pd.date_range(start=f'1/1/{year}', periods=470)
    calendar = pd.DataFrame({'date': span,
                             'day_of_week': span.weekday.astype('int64')})
    return calendar[calendar.day_of_week >= 5].reset_index(drop=True)


def lunar_stock_trend(df, column_name, phases_df, year='2018', lower_window=0, 
                      upper_window=1, trades_on_weekends=False,
                      cross_val=True, metric='rmse',
                      show_metric_scores=False):
    """
    Fit an FBProphet model of one financial column with the lunar phases
    supplied as 'holidays', forecast 60 further days and plot the results.

    df, column_name     financial DataFrame and the column to model
    phases_df           lunar phases for the year (see `get_lunar_phases`)
    year                year used to build the list of weekend dates
    lower_window,
    upper_window        days before/after each phase counted as 'holiday'
    trades_on_weekends  set to True if the financial data contains values
                        for weekends; otherwise weekend dates are removed
                        from both the holidays and the forecast dates
    cross_val, metric,
    show_metric_scores  optional cross-validation of the model, forwarded to
                        `cross_val_FBP`

    ----------------------------------------------------

    Returns a list containing the model object, the 'future' dataframe
    used to make predictions, the forecast output DataFrame, a graph
    of the forecast, and a graph of forecast components.
    """
    # Evaluated once; kept as `== False` so non-bool arguments are treated
    # exactly as the two inline comparisons would treat them.
    drop_weekends = (trades_on_weekends == False)

    holidays = lunar_phase_separator(phases_df, lower_window, upper_window)
    if drop_weekends:
        weekend_dates = get_weekends(year).date
        holidays = holidays[~holidays['ds'].isin(weekend_dates)]

    training = prep_data_for_FBP(df, column_name)
    model = Prophet(holidays=holidays)
    model.fit(training)

    future = model.make_future_dataframe(periods=60, freq='D')
    if drop_weekends:
        future = future[~future['ds'].isin(weekend_dates)]
    forecast = model.predict(future)

    # Show the first forecast rows on which any lunar phase has an effect:
    # four phases times the number of days covered by each holiday window.
    rows_to_show = 4 * (1 + abs(lower_window) + abs(upper_window))
    lunar_effect = (forecast['fullmoon'] + forecast['lastquarter'] +
                    forecast['newmoon'] + forecast['firstquarter'])
    affected = forecast[lunar_effect.abs() > 0]
    display(affected[['ds', 'fullmoon', 'lastquarter', 'newmoon',
                      'firstquarter']][:rows_to_show])

    forecast_fig = model.plot(forecast)
    forecast_fig.set_size_inches(15, 5)
    components_fig = model.plot_components(forecast)
    components_fig.set_size_inches(15, 10)

    if cross_val:
        cross_val_FBP(model, metric, show_metric_scores)

    return [model, future, forecast, forecast_fig, components_fig]

In [None]:
def tweet_fbprophet(filename='tweets_nowords_2018.csv', holidays=None):
    """
    Model the daily count of positive tweets in one CSV with FBProphet,
    using lunar phases as 'holidays'.

    filename   (str) tweets CSV with `timestamp` (YYYYMMDD), `sentiment`
               and `tally` columns
    holidays   FBProphet holidays DataFrame; when None, the notebook-level
               `phases` frame is used

    Displays the first forecast rows affected by a lunar phase, plots the
    forecast and its components, then cross-validates (90-day initial
    window, 15-day period, 30-day horizon) and plots the RMSE.
    Returns nothing.
    """
    if holidays is None:
        holidays = phases  # module-level frame built in the driver cells

    print(filename)
    tweets_df = pd.read_csv(filename)
    # Parse this file's own timestamps (not those of some other frame that
    # happens to live in the notebook namespace).
    tweets_df['timestamp'] = pd.to_datetime(tweets_df['timestamp'], format='%Y%m%d')
    grouped = tweets_df.groupby(['timestamp', 'sentiment'])['tally'].sum().reset_index()

    #prepare grouped sentiment data for FBProphet processing
    #here, positive sentiment only
    grp_pos = grouped[grouped.sentiment == 'positive'].drop('sentiment', axis=1).reset_index(drop=True)
    grp_pos.columns = ['ds','y']

    m = Prophet(holidays=holidays)
    m.fit(grp_pos)
    future = m.make_future_dataframe(periods=60, freq='D')
    forecast = m.predict(future)

    # A bare expression inside a function shows nothing, so the table has
    # to be displayed explicitly.
    lunar_effect = (forecast['fullmoon'] + forecast['lastquarter'] +
                    forecast['newmoon'] + forecast['firstquarter'])
    display(forecast[lunar_effect.abs() > 0][['ds', 'fullmoon', 'lastquarter',
                                              'newmoon', 'firstquarter']][:10])

    fig1 = m.plot(forecast)
    fig1.set_size_inches(15, 5)
    fig2 = m.plot_components(forecast)
    fig2.set_size_inches(15, 10)

    # cross validating using time horizons within the dataset
    df_cv = cross_validation(m, initial='90 days', period='15 days', horizon='30 days')
    # plotting performance metrics
    plot_cross_validation_metric(df_cv, metric='rmse')

In [None]:
# function to get most frequent words
def get_top_n_words(corpus, n=None):
    """
    Count word frequencies in `corpus`, most frequent first.

    corpus   iterable of text documents
    n        (int or None) keep only the `n` most frequent words;
             None (default) returns every word

    English stop words plus a few Twitter-specific tokens are ignored.
    Returns a list of (word, count) tuples sorted by descending count.
    """
    stopwords = set(ENGLISH_STOP_WORDS)
    stopwords.update(['twitter','com','pic','ve','ll','just','like','don','really','00'])
    # CountVectorizer documents `stop_words` as a list, so pass one
    # (sorted only to make the argument deterministic).
    vec = CountVectorizer(stop_words=sorted(stopwords))
    bag_of_words = vec.fit_transform(corpus)  # learn vocabulary and count in one pass
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]


# function for plotting the most frequent words
def plot_phase_words(corpus, keywords, phase, n=None):
    """
    Bar chart of the `n` most frequent words in `corpus.text`.

    Bar heights are percentages of the combined count of the plotted words
    (not of all words in the corpus).  `phase` is used in the title and
    `keywords` is shown in the legend as the search phrase.
    """
    top_words = pd.DataFrame(get_top_n_words(corpus.text)[:n], columns=['text', 'count'])
    share_pct = (top_words['count'] / top_words['count'].sum()) * 100

    # plotting
    fig, ax = plt.subplots(figsize=(15,5))
    plt.xticks(rotation=30, fontsize=14)
    bars = ax.bar(top_words.text, share_pct)
    ax.set_title(f"Top {n} Words for {phase} at Midnight UTC", fontsize=16)

    # Annotate each bar with its percentage, anchored at the bar's right edge.
    for bar in bars:
        pct = bar.get_height()
        ax.text(bar.get_x() + bar.get_width(), 1.005*pct,
                f'{np.round(pct, 2)}%', ha='right', va='bottom')

    # Invisible artist whose only purpose is to put the keywords in the legend.
    plt.plot([], [], ' ', label=f"Search Keywords:\n{keywords}")
    plt.legend(fontsize=14)
    plt.show();


# function to parse a CSV file and then plot all results
def plot_CSV_words(filename, keywords, phases_df, n=None):
    """
    Inputs:
    filename       (str) CSV filename generated from Twitter scraping function
    keywords       (str) the query keywords used in Twitter scraping function
    phases_df      (df) dataframe of lunar phases for a given year
    n              (int) number of most common words to return
    ==================================================
    Returns:
    Nothing; draws five bar charts of the 'n' most common words, one for
    each lunar phase and one for all the days that are not a phase date.
    """
    tweets = (
        pd.read_csv(filename)
          .dropna()
          .assign(timestamp=lambda frame: pd.to_datetime(frame.timestamp, format='%Y%m%d'))
          .set_index('timestamp')
    )
    tweet_days = tweets.index.floor('D')
    every_phase_day = list(phases_df.date.astype(str))

    for phase in ['Full Moon','Last Quarter','New Moon','First Quarter','No Phase']:
        if phase == 'No Phase':
            # everything that falls on none of the listed phase dates
            selected = ~tweet_days.isin(every_phase_day)
        else:
            phase_days = list(phases_df[phases_df.phase == phase].date.astype(str))
            selected = tweet_days.isin(phase_days)
        plot_phase_words(tweets.loc[selected], keywords, phase, n=n)

# Code that has to be executed

This section reads the source files and builds the DataFrames used by the rest of the notebook.

In [None]:
# Scrape the 2018 lunar phase table (phase, date, time).
phases_2018_df = get_lunar_phases(year='2018')
# Reshape it into the FBProphet 'holidays' layout: each phase covers its own
# day plus one day after.  `phases` is a notebook-level name.
phases = lunar_phase_separator(phases_2018_df, lower_window=0, upper_window=1)

In [None]:
# Load the happy/sad query tweets and parse the YYYYMMDD timestamps.
tw_df = pd.read_csv('tweets_happysad_2018.csv')
tw_df.timestamp = pd.to_datetime(tw_df.timestamp, format='%Y%m%d')
# Total `tally` per day and sentiment, flattened back to ordinary columns.
grouped = pd.DataFrame(tw_df.groupby(['timestamp', 'sentiment'])['tally'].sum()).reset_index()

In [None]:
# for sentiment in grouped.sentiment.unique():
#     print(sentiment)
#     df_decompose(t2018_log[sentiment])

In [None]:
# Plot the daily sentiment tallies of the happy/sad query tweets;
# 2018 is passed as the second positional argument.
plot_sentiments('tweets_happysad_2018.csv', 2018)

In [None]:
import os

# The Alpha Vantage key is read from the environment so that it is never
# stored in the notebook.  Set ALPHAVANTAGE_API_KEY before starting Jupyter.
stocks_API_key = os.environ.get('ALPHAVANTAGE_API_KEY', '')
if not stocks_API_key:
    print("WARNING: ALPHAVANTAGE_API_KEY is not set; Alpha Vantage requests will fail.")

In [None]:
# Download and clean Microsoft's daily stock data, keeping only 2018.
MSFT_2018 = get_financial_data('MSFT', 'stock', stocks_API_key, year=2018)

In [None]:
# Model MSFT's `change_24` column with lunar phases as FBProphet holidays.
# Each phase covers the day before, the day of and the day after; weekend
# dates are excluded because trades_on_weekends is False.
lunar_stock_trend(df=MSFT_2018,
                  column_name='change_24',
                  phases_df=phases_2018_df,
                  year='2018',
                  lower_window=-1,
                  upper_window=1,
                  trades_on_weekends=False,
                  cross_val=True,
                  metric='rmse', 
                  show_metric_scores=False)

In [None]:
# One CSV of scraped tweets per Twitter query.  `tweet_csv_files` and
# `queries` are parallel lists: entry i of one belongs to entry i of the other.
tweet_csv_files = ['tweets_lovehate_2018.csv',
                   'tweets_happysad_2018.csv',
                   'tweets_music_2018.csv',
                   'tweets_money_2018.csv',
                   'tweets_nowords_2018.csv',
                   'tweets_politics_2018.csv',
                   'tweets_coding_2018.csv']

# Search phrase behind each CSV above (used only as a label on the plots).
queries = ['love OR peace OR hate OR war',
           'happy OR sad OR life OR death',
           'music OR tunes OR dance',
           'stocks OR money OR taxes',
           '(no keywords entered)',
           'politics OR government OR Trump',
           'data science OR coding OR programming'] 

# For every query, chart the 20 most frequent words per lunar phase.
for filename, key_words in zip(tweet_csv_files, queries):
    plot_CSV_words(filename=filename,
                   keywords=key_words,
                   phases_df=phases_2018_df,
                   n=20)

In [None]:
# Explore daily positive-sentiment counts for every query CSV.  The calls
# run back-to-back with no plt.show() in between.
# NOTE(review): presumably so that all queries end up on one shared figure -- confirm.
for search in tweet_csv_files:
    plot_sentiments(search, 2018, positive_only=True)

In [None]:
# Show one sentiment graph per query CSV; plt.show() after each call closes
# off the current figure so the next file starts a new one.
for search in tweet_csv_files:
    plot_sentiments(csv_filename=search, year=2018, positive_only=False)
    plt.show();

In [None]:
# Run each CSV file through FBProphet (positive-sentiment model with lunar
# phases as holidays, followed by cross-validation).
for search in tweet_csv_files:
    tweet_fbprophet(search)

# Dash section

Useful references for Dash:
   * Intro to Dash (blog) https://medium.com/plotly/introducing-dash-5ecf7191b503
   * Interactive Dashboards w Dash (blog) https://alysivji.github.io/reactive-dashboards-with-dash.html
   * Dash GitHub https://github.com/plotly/dash
   * Dash documentation https://dash.plot.ly/?_ga=2.22784251.1143889031.1570652152-637402008.1568664543

In [None]:
# Importing .CSV data for use in Dash
quers = ['lovehate','happysad','music','money','nowords','politics', 'coding']
symbols = ['MSFT','EUR','BTC','SP500','SBAC']


def _load_dated_csv(csv_path):
    """Read `csv_path` and parse its `date` column into datetimes."""
    frame = pd.read_csv(csv_path)
    frame['date'] = pd.to_datetime(frame['date'], infer_datetime_format=True)
    return frame


# One DataFrame per financial symbol, read from '<symbol>.csv'.  The dict
# replaces five copy-pasted load/parse pairs; the individual names are kept
# for the code that refers to them.
financial_dfs = {symbol: _load_dated_csv(f'{symbol}.csv') for symbol in symbols}
MSFT_df = financial_dfs['MSFT']
EUR_df = financial_dfs['EUR']
BTC_df = financial_dfs['BTC']
SP500_df = financial_dfs['SP500']
SBAC_df = financial_dfs['SBAC']

In [4]:
# !pip install dash==1.4.0

In [5]:
# !pip install dash-daq==0.2.1

In [None]:
##########################################################
### This cell must be running in order to test whether ###
###  the code below which has been saved as `app.py`   ###
###   is properly functioning when output as HTML.     ###
##########################################################
# Runs app.py in a subprocess via the shell.
# NOTE(review): app.py presumably ends by starting the Dash server, in which
# case this cell keeps running until it is interrupted -- confirm.
!python app.py

In [None]:
# -*- coding: utf-8 -*-
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import pandas as pd
import plotly.graph_objs as go
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

###Imports for FBProphet plot.ly functionality
import numpy as np
from fbprophet.diagnostics import performance_metrics
import matplotlib.pyplot as plt
from matplotlib.dates import (
    MonthLocator,
    num2date,
    AutoDateLocator,
    AutoDateFormatter)
from matplotlib.ticker import FuncFormatter
from pandas.plotting import deregister_matplotlib_converters
deregister_matplotlib_converters()
from plotly import tools as plotly_tools




###############################
##### Necessary functions #####
###############################

# Code adapted from: https://github.com/facebook/prophet/blob/master/python/fbprophet/plot.py
def plot_holidays_component_plotly(m, fcst, figsize=(900, 300)):
    """Plot 'holidays' component of the forecast using Plotly.
    ----- Parameters -----
    m: Prophet model.
    fcst: pd.DataFrame output of m.predict.
    figsize: The plot's size (in px).
    ------- Returns a Plotly Figure. -------

    Only forecast rows whose 'holidays' value is non-zero are drawn; the
    x-axis range still spans the full forecast (plus a 5% margin each side).
    The hover text of each point lists the holidays active on that date.
    """
    # Compute the axis range before filtering, so it covers the whole forecast.
    range_margin = (fcst['ds'].max() - fcst['ds'].min()) * 0.05
    range_x = [fcst['ds'].min() - range_margin, fcst['ds'].max() + range_margin]
    fcst = fcst[fcst['holidays'] != 0].copy()

    # Combine holidays into one hover text
    holiday_features, _, _ = m.make_holiday_features(fcst['ds'], m.holidays)
    holiday_features.columns = holiday_features.columns.str.replace('_delim_', '', regex=False)
    holiday_features.columns = holiday_features.columns.str.replace('+0', '', regex=False)
    text = pd.Series(data='', index=holiday_features.index)
    # DataFrame.items() is the supported spelling; iteritems() no longer
    # exists in pandas 2.x.
    for holiday_feature, idxs in holiday_features.items():
        text[idxs.astype(bool) & (text != '')] += '<br>'  # Add newline if additional holiday
        text[idxs.astype(bool)] += holiday_feature

    traces = [go.Scatter(
        name='holidays',
        x=fcst['ds'],
        y=fcst['holidays'],
        mode='lines',
        line=go.scatter.Line(color='#0072B2', width=2),
        text=text,
    )]
    xaxis = go.layout.XAxis(
        title='Date',
        type='date',
        rangeslider={'visible': True},   ###this line may be a problem
        range=range_x)
    yaxis = go.layout.YAxis(rangemode='tozero',
                            title='Lunar Phase Correlation',
                            zerolinecolor='#AAA')
    layout = go.Layout(
        width=figsize[0],
        height=figsize[1],
        showlegend=False,
        xaxis=xaxis,
        yaxis=yaxis
    )
    return go.Figure(data=traces, layout=layout)







#############################################
#### BELOW IS THE CODE FOR THE DASHBOARD ####
#############################################

# The Dash application object; the layout and callbacks below attach to it.
app = dash.Dash(__name__)

# "About" text rendered in the left column via dcc.Markdown.
# Markdown link targets must not be wrapped in quotes: with quotes the quote
# characters become part of the URL and the links break.
markdown_paragraph = '''
### About this Project

#### By Matthew E. Parker

Data Science Bootcamp Capstone Project for Flatiron School. 
For more information about this project, please read my 
[Medium article](https://medium.com/@matthewparker_1059/modeling-lunar-cycles-in-tweets-and-financial-markets-using-facebook-prophet-d6ec0e9e20f). 
If you wish to look at the code for yourself, please refer to the project's
[GitHub repository](https://github.com/magnawhale/capstone_project).

For a fascinating (but slow-loading) sample exploration of Tweets, 
download and then open my 
[Scattertext html file](https://github.com/magnawhale/capstone_project/blob/master/Scattertext_nowords_example.html).
Just continue loading if you get any error messages about responsiveness when opening this file.
'''


# English stop words plus a few Twitter-specific tokens.
stopwords = set(ENGLISH_STOP_WORDS)
stopwords.update(['twitter','com','pic','ve','ll','just','like','don','really','00'])

# Tweets with their sentiment; filtered by its `query` column in update_graph.
tw_df = pd.read_csv('tw_sent.csv')
tw_df.date = pd.to_datetime(tw_df.date, infer_datetime_format=True)

# Lunar phase dates.
moon_df = pd.read_csv('phases.csv')
moon_df.date = pd.to_datetime(moon_df.date, infer_datetime_format=True)

# Word counts, filtered by `query` and `phase` in update_graph (tab 2).
tw_word_freqs_df = pd.read_csv('tw_word_freqs.csv')
tw_word_freqs_df['count'] = tw_word_freqs_df['count'].astype(int)

# Options offered by the two dropdown menus.
queries = ['(no keywords entered)',
           'love OR peace OR hate OR war',
           'happy OR sad OR life OR death',
           'music OR tunes OR dance',
           'stocks OR money OR taxes',
           'politics OR government OR Trump',
           'data science OR coding OR programming']
moons = ['No Phase', 'Full Moon','Last Quarter','New Moon','First Quarter']


############################
#### Application layout ####
# Page structure: a centered header above a two-column area.
#   left  (20%): query dropdown, moon-phase dropdown, "about" text
#   right (75%): tab selector, the graph, and the word-count slider
app.layout = html.Div(children=[    ### whole page
    html.H1(                        ### page header
        children='Lunar Cycles & Human Behavior',
        style={'textAlign': 'center'}
    ),

    ### two column area
    html.Div([

        ####################################
        # The area with the dropdown menus #
        html.Div([
            html.Div([
                html.P('Twitter Search Phrases:'),
                dcc.Dropdown(
                    id='query-dropdown',
                    options=[{'label': i, 'value': i} for i in queries],
                    value='(no keywords entered)'  # default initial value
                ),
                # white-on-white text used as a vertical spacer
                html.P(' d', style={'color':'#FFFFFF'}),
                html.P('Moon Phase:'),
                dcc.Dropdown(
                    id='moon-dropdown',
                    options=[{'label': i, 'value': i} for i in moons],
                    value='No Phase'   # default initial value
                ),
                html.P(' d', style={'color':'#FFFFFF'}),
                dcc.Markdown(markdown_paragraph)
            ]),
        ],
            # setting the layout of the dropdown DIV area
            style={'width': '20%',
                   'height': '49%',
                   'display': 'inline-block'}
        ),

        html.Div([], style={'width': '5%', 'display': 'inline-block'}),  ## just a spacer

        #############################
        # The area with the display #
        html.Div([
            ### TABS ###
            dcc.Tabs(id='tabs', value='tab-1', children=[
                dcc.Tab(id='tab1', label='Daily Sentiment', value='tab-1'),
                dcc.Tab(id='tab2', label='Word Frequencies', value='tab-2'),
                dcc.Tab(id='tab3', label='Facebook Prophet Seasonality', value='tab-3'),
            ]),

            ## displayed below tabs ##
            html.Div(id='tabs-content', children=[
                dcc.Graph(id='tw-graph'),

                dcc.Slider(
                    id='freq-slider',
                    min=1,
                    max=1000,
                    step=1,
                    value=50
                ),
                html.Div(id='slider-output-container')
            ]),
        ],
            style={'width': '75%', 'display': 'inline-block'}  ###setting the graph/tabs area to right of options
        )
    ])
])


####################################################
##### Callbacks section for linking everything #####
####################################################


@app.callback(
    Output('slider-output-container', 'children'),
    [Input('freq-slider', 'value'),
     Input('query-dropdown', 'value')])
def update_output(selected_value, selected_query):
    """Caption under the slider: how many words are shown, and for which query."""
    caption = f'Displaying the {selected_value} most frequent words from the query: "{selected_query}"'
    return caption


@app.callback(
    Output('tw-graph', 'figure'),
    [Input('query-dropdown', 'value'),
     Input('moon-dropdown', 'value'),
     Input('tabs', 'value'),
     Input('freq-slider', 'value')])
def update_graph(selected_query, selected_moon, selected_tab, selected_n):
    """
    Build the figure shown in 'tw-graph' for the selected tab.

    tab-1  daily tally per sentiment for `selected_query` (one line each)
    tab-2  bar chart of the first `selected_n` word counts for
           `selected_query` and `selected_moon`
    tab-3  FBProphet 'holidays' (lunar phase) component for the positive
           tweets of 'tweets_happysad_2018.csv' -- this tab always uses that
           file, regardless of `selected_query`

    Returns a figure (dict or Plotly Figure); None for any other tab value.
    """
    if selected_tab == 'tab-1':
        # .dropna() returns a new frame; the module-level tw_df is untouched.
        tweets_df = tw_df[tw_df['query'] == selected_query].dropna()
        grouped = pd.DataFrame(tweets_df.groupby(['date', 'sentiment'])['tally'].sum()).reset_index()
        traces = []
        for sentiment in grouped.sentiment.unique():
            temp_df = grouped[grouped.sentiment == sentiment]
            traces.append(go.Scatter(
                                x=temp_df.date,
                                y=temp_df['tally'],
                                name=sentiment,
                                text=temp_df['sentiment'],
                                mode='lines',
                                opacity=0.8))
        figure = {'data': traces,
            'layout': go.Layout(colorway=["#5E0DAC", '#FF4F00', '#375CB1', '#FF7400', '#FFF400', '#FF0056'],
                                #height=600,
                                title=f"Daily Sentiment at Midnight (UTC) for : '{selected_query}'",
                                xaxis={"title":"Date",
                                       'rangeslider': {'visible': True},
                                       'type': 'date'},
                                yaxis={"title":"Sentiment Quantity (~1,000/day total)"})}
        return figure

    elif selected_tab == 'tab-2':
        df2 = tw_word_freqs_df[(tw_word_freqs_df['query'] == selected_query) &
                               (tw_word_freqs_df['phase'] == selected_moon)][:selected_n]
        ### plotting
        trace = [go.Bar(x=df2['text'], y=df2['count'], name='', )]
        figure = {'data': trace,
            'layout': go.Layout(title=f"Top {selected_n} Words for {selected_moon} at Midnight UTC",
                hovermode="closest",
                xaxis={
                    'title': f"Search Keywords: {selected_query}", 
                    'titlefont': {'color': 'black', 'size': 14},
                    'tickfont': {'size': 11, 'color': 'black'}},
                yaxis={'tickfont': {'color': 'black'}}
            )
        }
        return figure

    elif selected_tab == 'tab-3':
        # Imported here because this cell's import list does not bring
        # Prophet into scope.
        from fbprophet import Prophet

        #prepare data for FBProphet
        # A distinct local name is essential: assigning to `tw_df` inside this
        # function would make `tw_df` local everywhere in it and break the
        # tab-1 lookup of the module-level frame.
        prophet_src = pd.read_csv('tweets_happysad_2018.csv')
        prophet_src['timestamp'] = pd.to_datetime(prophet_src['timestamp'], format='%Y%m%d')
        grouped = pd.DataFrame(prophet_src.groupby(['timestamp', 'sentiment'])['tally'].sum()).reset_index()
        grp_pos = grouped[grouped.sentiment == 'positive'].drop('sentiment', axis=1).reset_index(drop=True)
        grp_pos.columns = ['ds','y']

        #make model and forecast
        # NOTE(review): `phases` is not defined in this cell; it must exist at
        # module level as an FBProphet holidays frame -- confirm that app.py
        # defines it (or derive it from moon_df).
        m = Prophet(holidays=phases)
        m.fit(grp_pos)
        future = m.make_future_dataframe(periods=60)
        forecast = m.predict(future)

        #plot holidays (lunar seasonalities)
        figure = plot_holidays_component_plotly(m, forecast, figsize=(900, 300))   ### remove figsize?
        return figure



# Start the Dash server only when this code is executed directly (not when
# it is imported).  debug=True is passed to run_server.
# NOTE(review): presumably debug mode is what makes the page update
# automatically when the code changes -- confirm in the Dash docs.
if __name__ == '__main__':
    app.run_server(debug=True)