In [1]:
import pandas as pd
import re
import pysentiment2 as ps
import datetime as dt
import numpy as np

In [2]:
def get_search_term(terms):
    """Join ``terms`` into a single ``|``-separated search string.

    Every term is converted with ``str``. A separator is only emitted once
    the accumulated string is non-empty, so empty pieces at the very start
    of ``terms`` contribute nothing. Terms are not escaped.

    Returns:
        The joined string, or ``""`` when ``terms`` is empty.
    """
    pieces = [str(term) for term in terms]
    # Position of the first non-empty piece; everything before it is dropped.
    first_nonempty = next(
        (pos for pos, piece in enumerate(pieces) if piece),
        len(pieces),
    )
    return "|".join(pieces[first_nonempty:])

def get_subset_df(terms, df):
    """Return the rows of ``df`` whose ``full_text`` matches any of ``terms``.

    The terms are combined by ``get_search_term`` and matched
    case-insensitively with ``Series.str.contains`` (which treats the
    pattern as a regular expression by default).

    Args:
        terms: Iterable of search terms.
        df: DataFrame with a ``full_text`` column.

    Returns:
        The matching rows, with rows repeating an earlier ``full_text``
        removed (first occurrence kept).
    """
    pattern = get_search_term(terms)
    # na=False: rows with missing (or non-string) text count as non-matches.
    # Without it the mask contains NaN and pandas refuses to index with it.
    matches = df['full_text'].str.contains(pattern, case=False, na=False)
    return df[matches].drop_duplicates(subset='full_text')
        
def get_num_stories_df(df):
    """Cumulative number of rows per calendar day.

    Rows of ``df`` are counted per distinct ``date`` value, the counts are
    accumulated in date order, and the result is expanded to a daily index
    running from the first to the last date. Days with no rows carry the
    previous day's cumulative total forward.

    Args:
        df: DataFrame with a datetime-like ``date`` column.

    Returns:
        A one-column DataFrame indexed by day. The column is named ``date``
        (the name callers have historically received) and holds the integer
        cumulative counts. An empty input yields an empty frame.
    """
    cumulative = df['date'].value_counts().sort_index().cumsum()
    if cumulative.empty:
        return cumulative.to_frame(name='date')

    every_day = pd.date_range(cumulative.index[0], cumulative.index[-1])
    # reindex() introduces NaN on days without stories; forward-fill them.
    # The first day always has a count, so no NaN survives and the values
    # can be cast back to integers.
    filled = cumulative.reindex(every_day).ffill().astype('int64')
    # Name the column explicitly: the label pandas gives a value_counts()
    # result is not the same across pandas versions.
    return filled.to_frame(name='date')


def get_prop_df(df1, df2):
    """Daily ratio of cumulative story counts of ``df1`` to those of ``df2``.

    Both inputs are passed through ``get_num_stories_df``; the ``date``
    column of each result (the cumulative count) is divided day by day.
    Days on which the ratio is undefined take the last defined ratio, and
    days before any ratio is defined are set to 0.

    Args:
        df1: DataFrame providing the numerator counts (needs a ``date`` column).
        df2: DataFrame providing the denominator counts (needs a ``date`` column).

    Returns:
        A one-column DataFrame named ``date`` holding the ratio, indexed by
        every day between the first and last day covered by either input.
    """
    numerator = get_num_stories_df(df1)['date']
    denominator = get_num_stories_df(df2)['date']

    # Division aligns on the union of both daily indexes; days missing from
    # either side come out as NaN and inherit the previous ratio.
    ratio = (numerator / denominator).ffill()
    if ratio.empty:
        return ratio.to_frame(name='date')

    # The two date ranges may be disjoint, so make the index contiguous
    # before the final forward-fill.
    every_day = pd.date_range(ratio.index[0], ratio.index[-1])
    ratio = ratio.reindex(every_day).ffill().fillna(0)
    return ratio.to_frame(name='date')

def get_num_stories_df_noncum(df):
    """Count rows per distinct ``date`` value (non-cumulative).

    Returns a one-column DataFrame indexed by date and sorted by that
    index. The column label is whatever pandas assigns to the
    ``value_counts`` result; it is not set here.
    """
    date_index = df.set_index('date').index
    per_date_counts = date_index.value_counts()
    counts_frame = per_date_counts.to_frame()
    return counts_frame.sort_index()

In [3]:
def get_rolling_av_df(days_back, df):
    """Build a daily series of trailing-window sentiment averages.

    For every day from the smallest to the largest value in ``df['date']``
    (stepping one day at a time), the score is whatever
    ``get_sent_score_batch(days_back, day, df)`` returns for that day.

    Returns:
        A DataFrame indexed by ``date`` with a single ``sent_score`` column.
    """
    first_day = df['date'].min()
    last_day = df['date'].max()
    one_day = dt.timedelta(1)

    # Enumerate every day in the closed range [first_day, last_day].
    days = []
    current = first_day
    while current <= last_day:
        days.append(current)
        current = current + one_day

    scores = [get_sent_score_batch(days_back, day, df) for day in days]

    rolling = pd.DataFrame().assign(date=days, sent_score=scores)
    return rolling.set_index('date')
             
def get_sent_score_batch(days_back, today, df):
    """Mean ``sent_score`` of rows dated within the trailing window.

    The window is closed on both ends: it runs from ``today`` minus
    ``days_back`` days up to and including ``today``. The result is NaN
    when no row falls inside the window.
    """
    window_start = today - dt.timedelta(days_back)
    not_too_old = df['date'] >= window_start
    not_in_future = df['date'] <= today
    in_window = df[not_too_old & not_in_future]
    return in_window['sent_score'].mean()

In [1]:
# Helpers for building the regression dataset.
def get_reg_data(sent_df, ndq_df, freq_):
    """Aggregate sentiment and price data into one row per period.

    ``sent_df`` is right-merged onto ``ndq_df`` (``date`` against ``Date``),
    the merged rows are bucketed by ``Date`` at frequency ``freq_``, and
    each non-empty bucket becomes one output row.

    Args:
        sent_df: Sentiment rows. The merged frame must expose ``date``,
            ``sent_score`` and ``flag`` columns.
        ndq_df: Price rows. The merged frame must expose ``Date`` plus
            ``Return`` (used when ``freq_ == 'D'``) or ``Close`` (used for
            every other frequency).
        freq_: A pandas offset alias passed to ``pd.Grouper``.

    Returns:
        DataFrame with columns
        ``date`` (last ``Date`` in the period),
        ``sent`` (mean ``sent_score``),
        ``returns`` (for ``'D'`` the first ``Return`` of the period,
        otherwise the percentage change from the first to the last
        ``Close`` of the period),
        ``sent_overval`` (mean ``sent_score`` where ``flag == 1``) and
        ``sent_non_overval`` (mean ``sent_score`` where ``flag == 0``).

    Raises:
        KeyError: If a required column is missing from the merged frame.
    """
    master_df = pd.merge(
        sent_df, ndq_df, left_on='date', right_on='Date', how='right'
    ).sort_values('Date')

    columns = ['date', 'sent', 'returns', 'sent_overval', 'sent_non_overval']
    records = []

    # NOTE: volume, return-volatility and sentiment-disagreement features
    # were once drafted for this function but are not produced.
    for _, period in master_df.groupby(pd.Grouper(key='Date', freq=freq_)):
        # A frequency grouper may yield empty buckets (e.g. days with no
        # price row); those periods produce no output row.
        if period.empty:
            continue

        if freq_ != 'D':
            first_close = period['Close'].iloc[0]
            if first_close == 0:
                # The return is undefined for a zero starting price; skip
                # the period rather than emit inf.
                continue
            period_return = (period['Close'].iloc[-1] / first_close - 1) * 100
        else:
            period_return = period['Return'].iloc[0]

        scores = period['sent_score']
        # Build the whole row at once so the output columns can never end
        # up with different lengths. mean() already ignores NaN.
        records.append({
            'date': period['Date'].iloc[-1],
            'sent': scores.mean(),
            'returns': period_return,
            'sent_overval': scores[period['flag'] == 1].mean(),
            'sent_non_overval': scores[period['flag'] == 0].mean(),
        })

    return pd.DataFrame(records, columns=columns)


def get_flags(fdata_twitter):
    """Add a 0/1 ``flag`` column marking rows that match the overvaluation terms.

    A row is flagged 1 when its ``full_text`` equals the ``full_text`` of
    any row selected by ``get_subset_df`` for the term list below, and 0
    otherwise.

    Args:
        fdata_twitter: DataFrame with a ``full_text`` column.

    Returns:
        The same DataFrame object that was passed in. It is modified in
        place: a ``flag`` column is added (or overwritten).
    """
    overval_terms = ['bubble', 'overval', "too high", 'crash', 'collapse', 'mania', 'burst',
                     'sky-high', 'lost its senses', 'strange', 'bizarre', 'psychology', 'implode',
                     'black hole', 'unwarrant', 'irrational', 'tulip', 'euphori',
                     'short sell', 'bandwagon']

    overval_df = get_subset_df(overval_terms, fdata_twitter)

    # Vectorised membership test replaces a per-row lookup in a Python list.
    is_overval = fdata_twitter['full_text'].isin(overval_df['full_text'])

    # Assign positionally (plain array), as the original list assignment did.
    fdata_twitter['flag'] = is_overval.astype('int64').to_numpy()
    return fdata_twitter

def get_pct_change(df, col):
    """Fractional change of ``df[col]`` from each row to the next.

    Args:
        df: DataFrame containing ``col``.
        col: Name of a numeric column.

    Returns:
        A list with one entry per row of ``df``. Entry ``i`` is
        ``(value[i] - value[i-1]) / value[i-1]``. The first entry is NaN
        because the first row has no predecessor.
    """
    series = df[col]
    # shift(1) pairs every row with the row before it and leaves NaN for the
    # first row, instead of wrapping around to the last row as a negative
    # positional index would.
    previous = series.shift(1)
    return ((series - previous) / previous).tolist()