# Walkthrough of sentiment_analysis_prep

In [None]:
# Initial Imports
import sklearn
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt

In [None]:
# Load Data
# Both inputs are read from hard-coded absolute paths on the C: drive, so this cell only runs
# on a machine where those files exist. The commented-out read_csv lines point at alternative
# input files and are kept for reference.
#stock_returns_daily_df = pd.read_csv('C:/stock_returns_daily.csv')                         # Load in stock return data
stock_returns_daily_df = pd.read_csv('C:/EDGAR/example_shares_output2.csv')                 # Load in stock return data
#print(stock_returns_daily_df)

#sentiment_factors_df = pd.read_csv('C:/sentiment_factors.csv')                             # Load in sentiment word count data
sentiment_factors_df = pd.read_csv('C:/EDGAR/example_sentiment_analysis.csv')               # Load in sentiment word count data
#print(sentiment_factors_df)

### Pre-processing?
### Feature Engineering

In [None]:
# stock_returns_daily
def stock_returns_prep(df):
    """Return a trimmed copy of the daily stock-return frame, ready to merge.

    The 'high', 'low', 'price' and 'Symbol' columns are removed and the
    'date' column is renamed to 'Date' so that it shares a key name with
    the prepared sentiment frame. The input frame is not modified.

    Raises:
        KeyError: if any of the columns to drop is missing from ``df``.
    """
    # Open questions carried over from the notebook:
    #   - Do the dates need to be formatted?
    #   - Potentially keep 'Symbol' so rows can later be matched per ticker.
    #   - Add classification columns? Up, Down (& stagnant)
    unused_columns = ['high', 'low', 'price', 'Symbol']
    trimmed = df.drop(columns=unused_columns)
    return trimmed.rename(columns={"date": "Date"})

In [None]:
# sentiment_factors
def sentiment_factors_prep(df):
    """Turn raw per-report sentiment word counts into normalised features.

    Expects ``df`` to contain the columns 'Symbol', 'ReportType',
    'FilingDate' and one count column per sentiment category ('Negative',
    'Positive', 'Uncertainty', 'Litigious', 'Constraining', 'Superfluous',
    'Interesting', 'Modal'). The input frame is not modified.

    The returned frame keeps every other input column and, in this order, adds:
      - 'Date': the renamed 'FilingDate' column (merge key).
      - 'word_sum': total categorised words in the report, i.e. the sum of
        the eight category counts.
      - 'perc_<Category>': each category count as a fraction of 'word_sum'.
      - 'sentiment_score': (Positive - Negative) / (Positive + Negative),
        rounded to 2 decimals.
      - 'sentiment1': (Positive - Negative) / word_sum, rounded to 2 decimals.
      - 'sentiment2': Positive / (Negative + 1), rounded to 2 decimals.
    The raw category count columns, 'Symbol' and 'ReportType' are dropped.

    Rows whose denominator is zero (no categorised words, or no positive
    and negative words) get NaN in the affected columns instead of raising.

    Raises:
        KeyError: if any of the expected columns is missing from ``df``.
    """
    # Open questions carried over from the notebook:
    #   - Do the dates need to be formatted?
    #   - Potentially keep 'Symbol' so rows can later be matched per ticker.
    #   - Normalise by the report's total word count instead of 'word_sum'?
    #   - Min-max normalise each category over the whole dataset instead of
    #     per report?
    categories = ['Negative', 'Positive', 'Uncertainty', 'Litigious',
                  'Constraining', 'Superfluous', 'Interesting', 'Modal']

    new_df = df.drop(['Symbol', 'ReportType'], axis=1)             # Remove unnecessary columns
    new_df = new_df.rename(columns={"FilingDate": "Date"})         # Change date column name for later merge

    # Sum only the category columns: summing the whole frame would also try
    # to add the string 'Date' column (TypeError on pandas >= 2.0) and would
    # fold any unrelated numeric column into the total.
    new_df['word_sum'] = new_df[categories].sum(axis=1)

    # Zero totals are masked to NaN so that the divisions below yield NaN
    # rather than raising or producing inf.
    total_words = new_df['word_sum'].where(new_df['word_sum'] != 0)

    # Normalise word counts as a fraction of the categorised-word total.
    for category in categories:
        new_df['perc_' + category] = new_df[category] / total_words

    positive = new_df['Positive']
    negative = new_df['Negative']
    polarity = positive - negative
    pos_neg_total = positive + negative
    pos_neg_total = pos_neg_total.where(pos_neg_total != 0)

    # Sentiment score from positive and negative counts only.
    new_df['sentiment_score'] = (polarity / pos_neg_total).round(2)
    # Positive/negative difference normalised by the categorised-word total.
    new_df['sentiment1'] = (polarity / total_words).round(2)
    # Semi-normalised ratio; the +1 keeps the denominator non-zero.
    new_df['sentiment2'] = (positive / (negative + 1)).round(2)

    # Drop unnecessary columns: the raw word category counts.
    return new_df.drop(columns=categories)

In [None]:
# View resulting DataFrames
# Run both prep functions on the frames loaded above and print the prepared sentiment frame
# for inspection; the commented-out prints show the prepared stock frame instead.

sample_stock_returns_daily_df = stock_returns_prep(stock_returns_daily_df)
sample_sentiment_factors_df = sentiment_factors_prep(sentiment_factors_df)
#print(sample_stock_returns_daily_df)
#print(sample_stock_returns_daily_df.columns)
print(sample_sentiment_factors_df)

In [None]:
# Combining dataframes using merge
# Left join: every stock-return row is kept, and sentiment columns are filled in only on
# dates that have a matching filing (NaN elsewhere).
# Open question: merge on date and ticker symbol? (on = ['Date', 'ticker'] for larger set...)
combined_df = sample_stock_returns_daily_df.merge(
    sample_sentiment_factors_df,
    how='left',
    on='Date',
)

print(combined_df)
print(combined_df.columns)

In [None]:
# Potential Features & Targets
# Candidate feature-column groupings, from the widest set down to a single column.
# NOTE(review): sentiment_factors_prep drops the raw count columns ('Negative', 'Positive', ...)
# before the merge, so the raw-count names listed here are presumably absent from combined_df --
# confirm before indexing combined_df with selected_features.
all_features = ['Negative', 'Positive', 'Uncertainty', 'Litigious', 'Constraining', 'Superfluous', 'Interesting', 'Modal',
                'perc_Negative', 'perc_Positive', 'perc_Uncertainty', 'perc_Litigious', 'perc_Constraining', 'perc_Superfluous', 'perc_Interesting', 'perc_Modal',
                 'sentiment_score', 'sentiment1','sentiment2']
all_sentiment_features = ['Negative', 'Positive', 'Uncertainty', 'Litigious', 'Constraining', 'Superfluous', 'Interesting', 'Modal']
neg_pos_features = ['Negative', 'Positive']
neg_feature = ['Negative']

# Feature set and target column currently in use; the trailing comment lists alternative targets.
selected_features = all_sentiment_features
target = ['1daily return'] # ['2daily return'], ['3daily return'], ['5daily return'], ['10daily return'], ['volume']

# Train/test split placeholders (df_train / df_test are not defined in this notebook section).
# X_train = df_train[selected_features]                     # NB -- we use upper case 'X' because it is a matrix (math term for df)
# y_train = df_train[target]                                # NB -- we use lower case 'y' because it is a vector (math term for series)

# X_test = df_test[selected_features]                       # NB -- we use upper case 'X' because it is a matrix (math term for df)
# y_test = df_test[target]                                  # NB -- we use lower case 'y' because it is a vector (math term for series)


In [None]:
# TODO: write a function that takes
#   - the Yahoo stock-return data
#   - the sentiment word counts
# and returns the merged DataFrame