In [164]:
import os
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import pykalman as pk
from textblob import TextBlob

In [165]:
#path = "./Data/SENSEX/<SYMBOL>.csv"
def error_percentage(value1, value2):
    print "the error in percentage is" , (abs(value1 - value2) / value1) * 100

In [166]:
def symbol_to_path(symbol, base_dir = "./Data/SENSEX"):
    """Returns
     CSV file path for given symbol """
    return os.path.join(base_dir, "{}.csv".format(str(symbol)))


In [167]:
def get_data_frame(symbols, dates):

    df = pd.DataFrame(index = dates)

    #sensex file acts as base;  this is the index file for sensex
    if 'SENSEX' not in symbols:
        symbols.insert(0, 'SENSEX')

    for symbol in  symbols:
        df_temp = pd.read_csv(symbol_to_path(symbol),
                               index_col = "Date", parse_dates = True,
                               usecols = ['Date', 'Adj Close'],na_values = ['nan'])

        df_temp = df_temp.rename(columns={'Adj Close' : symbol})
        if symbol != 'SENSEX':
            df = df.join(df_temp)
        else:
            df = df.join(df_temp, how = 'inner')
        """
            This is important as SENSEX csv file is base for all stocks
             we have an entry for this whenever SENSEX trades,
             but for any compamy listed on sensex, it may not trade for all
             days sensex has worked,, so we take a left join
        """
        #using fillna() to fill missing values
        #use ffill first to avoid peeping in future
        #bfill is used if data is missing from beginning
        df.fillna(method = "ffill", inplace = "TRUE")
        df.fillna(method = "bfill", inplace = "TRUE")

    return df

In [168]:
def normalize_data(df):
    #normalize using first row of data
    #observe use of row slicing
    return df/df.ix[0,:]

In [169]:
def plot_data(df, theTitle = "Stock Prices"):
    """plot stock prices """
    df = normalize_data(df)
    ax = df.plot(title = theTitle)
    ax.set_xlabel("Date")
    ax.set_ylabel("Price")
    #df[['ITC', 'TCS']].plot(title = theTitle)
    plt.show()

In [170]:
def compute_stats(df):
    print "the standard deviation is \n", df.std()
    print "the mean is\n", df.mean()

In [171]:
def compute_bollinger_bands(df, length=30, numsd=2):
    """ returns average, upper band, and lower band
        there are three steps , find rolling mean, then rolling std deviation
        then find upper and lower bands
    """
    rm = pd.stats.moments.rolling_mean(df,length)
    rmstd = pd.stats.moments.rolling_std(df,length)
    upperband = rm + (rmstd*numsd)
    lowerband = rm - (rmstd*numsd)
    #plotting these stats
    ax  = df['TCS'].plot(title = "Bollinger Bands", label = "TCS")
    rm.plot(label = "Rolling mean", ax = ax)
    upperband.plot(label='upper band', ax = ax)
    lowerband.plot(label = 'lower band ', ax = ax)
    plt.show()

In [172]:
def compute_daily_returns(df):
    daily = df.copy()
    daily[1:] = (df[1:] / df[:-1].values) - 1
    #hey using values only for second is necessary bcoz
    # else it will do index wise stuff, so our goal os shifting
    #by 1 and dividing is destroyed,
    daily.ix[0, :] = 0
    print "the cumulative returns \n", daily.sum()

In [173]:
def get_sentimentstemp(df):
    in_sample = df['2014-05-13']
    #here the data frame has 3 columns SENSEX, TCS, ITC
    #lets first predict for SENSEX
    print "the values on date 2014-05-13 is " , in_sample['SENSEX']

    # hey now i need to make a vector to store the polarity
    # hey i may need to store in a matrix,, 
    
    dates = ["2014-05-13.txt", "2014-05-14.txt"]
    sentiments = []
    for date in dates:
        lines = [line.rstrip('\n') for line in open('./Data/TwitterScraps/' + date)]
        #lines is a vector of these lines
        #sentiMatrix = TextBlob(lines[:]) hey textblob accepts a string not a list
        sumSentiment = 0
        for l in lines:
            sumSentiment += TextBlob(l).sentiment.polarity
        
        sentiments.append(sumSentiment/float(len(lines)))

    #print "hey the sentiments are" , sentiments
    temp = df['2014-05-13']['SENSEX']
    print "hey temp is " , temp
    return sentiments[0:1]

In [174]:
def get_sentiments():
    start_date = "2014-05-07"
    end_date = "2014-05-16"
    # hey we will make a pandas data frame;; indexed by dates like we have created earlier
    # this data frame will store sentiments for different dates
    
    dates = pd.date_range(start_date, end_date)
    sentiments = pd.DataFrame(index = dates)
    d1 = dt.datetime(2014, 05, 07, 0, 0)
    d2 = dt.datetime(2014, 05, 16, 0, 0)

    delta = d2 - d1
    
    dateList = []
    for i in range(delta.days + 1):
        dateObject = d1 + dt.timedelta(days=i)
        dateList.append(dateObject.strftime('%Y-%m-%d'))
        
    for date in dateList:
        lines = [line.rstrip('\n') for line in open('./Data/TwitterScraps/' + date + '.txt')]
        #lines is a vector of these lines
        #sentiMatrix = TextBlob(lines[:]) hey textblob accepts a string not a list
        sumSentiment = 0
        for l in lines:
            sumSentiment += TextBlob(l).sentiment.polarity
        if date != start_date:
            sentiments[date] = ( (sumSentiment/float(len(lines))) + sentiments[date - datetime.timedelta(days=1)])
        else:
            sentiments[date] = (sumSentiment/float(len(lines)))    
        return sentiments
    

In [175]:
def kalman_filtertemp(df, sentiMatrix):
    #creating a kalman filter object,,  
    
    #hey problem ,, df dimension not equal to sentiMatrix as twitter sentiment analysis daily,, but 
    #stock market does not open everyday, so we need to do something,, 
    
   
    
    obs_mat = np.vstack([df['2014-05-13']['SENSEX'], sentiMatrix[0:1]]).T[:, np.newaxis]
    #obs_mat = np.vstack([df['SENSEX'], sentiMatrix]).T[:, np.newaxis]
    #obs_mat = np.vstack([df['SENSEX'], sentiMatrix]).T[:, np.newaxis]
    
    #obs_mat = np.vstack([sentiMatrix, np.ones(1)]).T[:, np.newaxis]
    
    
    delta = 1e-5
    trans_cov = delta / (1 - delta) * np.eye(2)

    kf = pk.KalmanFilter(n_dim_obs=1, n_dim_state=2,
                  initial_state_mean=np.zeros(2),
                  initial_state_covariance=np.ones((2, 2)),
                  transition_matrices=np.eye(2),
                  observation_matrices=obs_mat,
                  observation_covariance=1.0,
                  transition_covariance=trans_cov)
    
    
    state_means, state_covs = kf.filter([df['2014-05-13']['SENSEX']])
    
    value1 = df['2014-05-14']['SENSEX']
    print "the price on day " , value1

    myvalue = df['2014-05-13']['SENSEX']
    value2 = myvalue*state_means[0][0]
    print "the price on day 2014-05-14 as predicted by noise and price of " , value2
     #print "state_covs is ", state_covs

In [176]:
def kalman_filter(df, sentiMatrix):
    #creating a kalman filter object,,  
    
    #hey problem ,, df dimension not equal to sentiMatrix as twitter sentiment analysis daily,, but 
    #stock market does not open everyday, so we need to do something,, 
    
    start_date = "2014-05-07"
    end_date = "2014-05-16"

    # i will first retrieve all indexes from pandas dataframe
    
    dates = pd.date_range(start_date, end_date)
    adj_close_df = pd.DataFrame(index = dates)
    adj_close_df = adj_close_df.join(df['SENSEX'], how = "inner")
   
    # this step is necessary as the dates are in reverse order in df which is supplied as arg
    adj_close_df = adj_close_df.sort_index(ascending=True, axis=0)
    print "The new adjusted close  data frame is ", adj_close_df

    index_List = adj_close_df.index.tolist()
    sentiment_df = pd.DataFrame(index = index_List)
    sentiment_df = sentiment_df.join(sentiMatrix)
    
    obs_mat = np.vstack([adj_close_df['SENSEX'], sentiment_df]).T[:, np.newaxis]
    #obs_mat = np.vstack([sentiMatrix, np.ones(1)]).T[:, np.newaxis]
    
    delta = 1e-5
    trans_cov = delta / (1 - delta) * np.eye(2)
    
    # it is the fastest way,, better than len() or count()
    num_of_obs = adj_close_df.shape[0]
    print num_obs
    kf = pk.KalmanFilter(n_dim_obs=num_obs, n_dim_state=2,
                  initial_state_mean=np.zeros(8),
                  initial_state_covariance=np.ones((8, 8)),
                  transition_matrices=np.eye(8),
                  observation_matrices=obs_mat,
                  observation_covariance=1.0,
                  transition_covariance=trans_cov)
    
    
    state_means, state_covs = kf.filter([df['SENSEX']])

In [177]:
def test_run():

    start_date = '2014-01-01'
    end_date = '2016-10-07'
    dates = pd.date_range(start_date, end_date)
    symbols = ['SENSEX']
    df = get_data_frame(symbols, dates)
    #plot_data(df)
    #compute_bollinger_bands(df, 20, 2)
    #compute_daily_returns(df)
    
    #intially sentiMatrix has only one value,, 
    sentiMatrix = get_sentiments()
    
    #print sentiMatrix.shape
    
    
    
    #hey problem is i have sentiment analysis of all days
    # the stock market does not open everyday;; lets implement an inner join,,
    
    kalman_filter(df, sentiMatrix)
    

In [178]:
if __name__ == "__main__":
    test_run()

The new adjusted close  data frame is                    SENSEX
2014-05-07  22323.900391
2014-05-08  22344.039062
2014-05-09  22994.230469
2014-05-12  23551.000000
2014-05-13  23871.230469
2014-05-14  23815.119141
2014-05-15  23905.599609
2014-05-16  24121.740234
The sentiment data frame is              2014-05-07
2014-05-07   -0.013472
2014-05-08   -0.013472
2014-05-09   -0.013472
2014-05-12   -0.013472
2014-05-13   -0.013472
2014-05-14   -0.013472
2014-05-15   -0.013472
2014-05-16   -0.013472
the shape of adjusted close dataframe is  (8, 1)
The shape of sentiment data frame is  (8, 1)


ValueError: all the input array dimensions except for the concatenation axis must match exactly