In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import nltk                                         #Natural language processing tool-kit

from nltk.corpus import stopwords                   #Stopwords corpus
from nltk.stem import PorterStemmer                 # Stemmer
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import CountVectorizer          #For Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer          #For TF-IDF
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec  

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

import datetime
import string
import xgboost as xgb
import spacy
from spacy import displacy

from textblob import TextBlob 
import warnings
warnings.filterwarnings('ignore')
import pickle
import re
from datetime import timedelta
import pyLDAvis
import pyLDAvis.sklearn

In [2]:
# Load sentiment.csv into `df`; later cells read its `body`, `date` and `ticker` columns.
df = pd.read_csv("sentiment.csv")

### Vectorizers

In [3]:
def vectorize_data(text, ngram=(1,1)):
    """Fit a bag-of-words ``CountVectorizer`` on ``text``.

    Parameters
    ----------
    text
        Raw documents handed to ``fit_transform``.
    ngram : tuple of (int, int), default (1, 1)
        Forwarded unchanged as ``ngram_range``.

    Returns
    -------
    tuple
        ``(vectorizer, matrix)``: the fitted vectorizer and the result of
        ``fit_transform(text)``.
    """
    settings = dict(
        analyzer='word',
        min_df=10,                         # ignore terms below this frequency threshold
        stop_words='english',              # drop English stop words
        lowercase=True,                    # fold everything to lower case
        token_pattern='[a-zA-Z0-9]{3,}',   # a token is 3 or more alphanumeric characters
        max_features=50000,                # cap on the vocabulary size
        ngram_range=ngram,
    )
    bow = CountVectorizer(**settings)
    counts = bow.fit_transform(text)
    return bow, counts

def tfidf_word(text, ngram=(1,1)):
    """Fit a word-level ``TfidfVectorizer`` on ``text``.

    Parameters
    ----------
    text
        Raw documents handed to ``fit_transform``.
    ngram : tuple of (int, int), default (1, 1)
        Forwarded unchanged as ``ngram_range``.

    Returns
    -------
    tuple
        ``(vectorizer, matrix)``: the fitted vectorizer and the float32
        result of ``fit_transform(text)``.
    """
    tfidf = TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        stop_words='english',
        ngram_range=ngram,
        max_features=20000,   # cap on the vocabulary size
        dtype=np.float32,
    )
    weights = tfidf.fit_transform(text)
    return tfidf, weights

In [4]:
# Each result is a (fitted vectorizer, document-term matrix) tuple:
# index [0] is the vectorizer, index [1] is the matrix.
# Raw-count features on the `body` column: unigrams, then bigrams only.
cv_unigram = vectorize_data(df["body"], ngram=(1,1))
cv_bigram = vectorize_data(df["body"], ngram=(2,2))

# TF-IDF features on the same column: unigrams, then bigrams only.
tfidf_unigram = tfidf_word(df["body"], ngram=(1,1))
tfidf_bigram = tfidf_word(df["body"], ngram=(2,2))

## Data Processing

In [5]:
def data_split(data_vec, data_labels):
    """Split features and labels into 70% train, 15% validation and 15% test.

    Parameters
    ----------
    data_vec
        Feature rows, in any form accepted by ``train_test_split``.
    data_labels
        Targets aligned row-for-row with ``data_vec``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, X_valid, y_valid)``; note that
        the validation pair comes last.
    """
    # Step 1: keep 70% for training and hold out the remaining 30%.
    X_train, X_mid, y_train, y_mid = train_test_split(
        data_vec, data_labels, train_size=0.7, random_state=1234)

    # Step 2: halve the 30% hold-out into validation (15%) and test (15%).
    # The split must run on X_mid / y_mid; splitting the full data again
    # would make validation and test overlap the training rows.
    X_valid, X_test, y_valid, y_test = train_test_split(
        X_mid, y_mid, train_size=0.5, random_state=1234)

    return X_train, X_test, y_train, y_test, X_valid, y_valid

In [6]:

# Convert each document-term matrix (element [1] of the tuples returned by the
# vectorizer helpers) to a dense array and wrap it in a DataFrame.
unigram_cv = pd.DataFrame(cv_unigram[1].toarray())
bigram_cv = pd.DataFrame(cv_bigram[1].toarray())
unigram_tf = pd.DataFrame(tfidf_unigram[1].toarray())
bigram_tf = pd.DataFrame(tfidf_bigram[1].toarray())

In [7]:
def _feature_names(vectorizer):
    """Return the fitted vocabulary of ``vectorizer`` as a list, in column order.

    Prefers ``get_feature_names_out`` and falls back to the older
    ``get_feature_names`` so the cell runs on scikit-learn releases that
    provide only one of the two methods.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()


# Label every feature column with the term (or bigram) it counts.
unigram_cv.columns = _feature_names(cv_unigram[0])
bigram_cv.columns = _feature_names(cv_bigram[0])
unigram_tf.columns = _feature_names(tfidf_unigram[0])
bigram_tf.columns = _feature_names(tfidf_bigram[0])

In [8]:
# Display the full bigram vocabulary learned by the count vectorizer.
# NOTE(review): this prints every feature; consider slicing (e.g. [:20]) to keep the output short.
cv_bigram[0].get_feature_names()

['000 000',
 '000 acr',
 '000 associ',
 '000 barrel',
 '000 boe',
 '000 contract',
 '000 custom',
 '000 day',
 '000 employe',
 '000 foot',
 '000 job',
 '000 live',
 '000 locat',
 '000 lot',
 '000 megawatt',
 '000 member',
 '000 metric',
 '000 net',
 '000 new',
 '000 ounc',
 '000 patient',
 '000 peopl',
 '000 plu',
 '000 pound',
 '000 quarter',
 '000 room',
 '000 share',
 '000 squar',
 '000 squarefoot',
 '000 store',
 '000 ton',
 '000 total',
 '000 unit',
 '000 wafer',
 '000 year',
 '000foot later',
 '001 002',
 '001 share',
 '002 003',
 '002 quarter',
 '002 share',
 '003 004',
 '003 share',
 '004 005',
 '004 share',
 '005 share',
 '006 share',
 '007 share',
 '008 share',
 '009 share',
 '010 share',
 '011 share',
 '012 share',
 '013 share',
 '014 share',
 '015 share',
 '016 share',
 '017 share',
 '018 share',
 '019 share',
 '020 share',
 '021 share',
 '025 share',
 '030 share',
 '035 share',
 '040 share',
 '050 share',
 '060 share',
 '100 billion',
 '100 million',
 '101 billion',
 '101 

### Merging

In [None]:
# Load data.csv and rename its key columns to lower-case `ticker` / `date`.
companies = pd.read_csv("data.csv").rename(columns={'Ticker': 'ticker', 'Date': 'date'})
# Parse each value and keep only the calendar date (a datetime.date object).
companies["date"] = companies["date"].apply(lambda raw: pd.to_datetime(raw).date())

In [None]:
# Parse the raw timestamp, then split it into time-of-day and calendar date.
df["datetime"] = df["date"].apply(lambda x: pd.to_datetime(x))
df['time'] = df['datetime'].apply(lambda x: x.time())
df['date'] = df['datetime'].apply(lambda x: x.date())

# Rows stamped strictly after 13:00 are moved to the next calendar day.
# Rows at or before 13:00 keep their own date, so no second assignment is needed.
cutoff = pd.to_datetime("13:00").time()
df.loc[df["time"] > cutoff, 'date'] = df["date"] + timedelta(days=1)

# Left join: every row of df is kept; company columns are NaN where no
# (ticker, date) match exists.
final = df.merge(companies, how='left', on=['ticker', 'date'])


In [None]:
# Drop every merged row that has a missing value in any column, then preview the result.
final = final.dropna()
final.head()

In [None]:
# Write the cleaned, merged frame to disk.
final.to_csv('merged_sentime.csv')

In [None]:
# Load datadict.csv; it is joined onto `final` by `ticker` in the following cell.
leadlag = pd.read_csv('datadict.csv')

In [None]:
# Left-join the `leadlag` columns onto every row of `final` that shares its ticker.
final2 = final.merge(leadlag, how='left', on='ticker')

In [None]:
# Inspect the column names available after both merges.
final2.columns

In [None]:
# The six return-difference columns that have to be converted to float.
rdiff_cols = ['Rdiff_lag1', 'Rdiff_lag2', 'Rdiff_lag5',
              'Rdiff_lead1', 'Rdiff_lead2', 'Rdiff_lead5']

feature_Set1 = final2[['sich', 'total_assets'] + rdiff_cols + ['polarity', 'Returns']]

# Drop rows whose Rdiff_lag1 holds the literal "#NAME?" (presumably a spreadsheet
# error marker - TODO confirm). The explicit .copy() makes the result an
# independent frame so the column assignments below are not chained assignments
# on a slice of `final2`.
feature_Set1 = feature_Set1[feature_Set1.Rdiff_lag1 != "#NAME?"].copy()

# Convert every return-difference column to float, one column at a time.
for col in rdiff_cols:
    feature_Set1[col] = feature_Set1[col].astype(float)

feature_Set1 = feature_Set1.dropna()
feature_Set1.head()

In [None]:
# Write the first feature set to disk.
feature_Set1.to_csv('feature_Set1.csv')

In [None]:
# Split the nine feature columns (X) and the `Returns` target (y) into
# train / test / validation parts. data_split returns the validation pair last.
# NOTE(review): the Rdiff_lead* columns look forward-looking - confirm they are
# legitimately known at prediction time before using them as features.
X_train, X_test, y_train, y_test, X_valid, y_valid = data_split(feature_Set1[['sich', 'total_assets', 'Rdiff_lag1', 'Rdiff_lag2',
       'Rdiff_lag5', 'Rdiff_lead1', 'Rdiff_lead2', 'Rdiff_lead5', 'polarity']], feature_Set1['Returns'])

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

SVR

In [None]:
# Support-vector regression with default settings; the cell output is the
# mean absolute error on the test split.
clf = svm.SVR().fit(X_train, y_train)
y_pred = clf.predict(X_test)
mean_absolute_error(y_test, y_pred)

In [None]:
# Ordinary linear regression with default settings; the cell output is the
# mean absolute error on the test split. Reuses (overwrites) `clf` and `y_pred`.
clf = LinearRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)
mean_absolute_error(y_test, y_pred)

In [None]:
# Gradient-boosted trees with up to 1000 boosting rounds and early stopping.
clf = xgb.XGBRegressor(n_estimators=1000)
# Early stopping is driven by the LAST eval_set entry. That entry is the
# validation split rather than the test split, so the test rows stay unseen
# during model selection and the MAE reported below is not optimistically biased.
clf.fit(X_train, y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    early_stopping_rounds=100,  # stop after 100 consecutive rounds without improvement
    verbose=False)
xgb.plot_importance(clf, height=0.9)
# Final score: mean absolute error on the held-out test split.
y_pred = clf.predict(X_test)
mean_absolute_error(y_test, y_pred)