In [0]:
import numpy as np
import pandas as pd

from tqdm import tqdm
pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import mean_squared_error

In [0]:
# Load the raw dataset and print its schema (column dtypes and non-null counts).
# NOTE(review): the path is a hard-coded Google Drive mount location (presumably Colab);
# the notebook will not run elsewhere without editing it.
df = pd.read_csv('/content/drive/My Drive/alldata.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103624 entries, 0 to 103623
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    103624 non-null  int64  
 1   corpus_index  103624 non-null  int64  
 2   date          103624 non-null  object 
 3   direction     103624 non-null  object 
 4   ex_return     101996 non-null  float64
 5   ticker        103624 non-null  object 
 6   whole         103624 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 5.5+ MB


In [0]:
# preprocessing: drop rows with missing values, parse `date` as datetime,
# encode `direction` as dir = +1 ('up') / -1 (anything else), then renumber the rows

df = (
    df.dropna()
      .assign(
          date=lambda frame: pd.to_datetime(frame.date),
          dir=lambda frame: frame['direction'].apply(lambda label: 1 if label == 'up' else -1),
      )
      .reset_index()  # the previous index is kept as a new `index` column
)
df.head()

Unnamed: 0.1,index,Unnamed: 0,corpus_index,date,direction,ex_return,ticker,whole,dir
0,0,62740,16,2001-01-25,down,-5.80473,LLY,on that managed short term symptoms are then b...,-1
1,1,62737,13,2001-01-25,down,-5.80473,LLY,is from Steve Tie's line from Merrill Lynch. P...,-1
2,2,62730,6,2001-01-25,down,-5.80473,LLY,"6%. Excluding the effect of exchange rates, an...",-1
3,3,62739,15,2001-01-25,down,-5.80473,LLY,year when we will be losing execution of the m...,-1
4,4,62729,5,2001-01-25,down,-5.80473,LLY,last year totalling $419 million. The decline ...,-1


In [0]:
# Show the first row of every (date, ticker) group.
df.groupby(['date', 'ticker']).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,index,Unnamed: 0,corpus_index,direction,ex_return,whole,dir
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2001-01-25,LLY,0,62740,16,down,-5.804730,on that managed short term symptoms are then b...,-1
2001-03-19,PAYX,17,80229,9,up,2.379739,"a short period of time, but over time, you're ...",1
2001-04-03,BBY,32,13919,22,up,3.539244,a modest selection of consumer electronics. So...,1
2001-04-04,THC,55,100259,16,down,-0.898221,"but we keep coming back to the demographics, a...",-1
2001-04-11,HOG,76,50069,13,up,0.650465,quarter and that's just a time issue that went...,1
...,...,...,...,...,...,...,...,...
2005-12-22,GIS,103502,45911,13,up,1.705193,"General Mills, Inc. - EVP, CFO [50] Thank ...",1
2005-12-22,PAYX,103532,80689,2,down,-2.484627,workers' compensation insurance risk. And were...,-1
2005-12-22,SLR,103509,93443,4,up,0.215506,"of 2%, and representing 18% of total revenue. ...",1
2005-12-22,TMO,103523,101980,0,down,-4.708141,"Operator [1] Good morning, ladies and gentl...",-1


In [0]:
# train / validation / test split by row position:
# first 70% -> train, next 15% -> validation, everything after that -> test

n_rows = df.shape[0]
val_size = int(n_rows * 0.15)
test_size = int(n_rows * 0.15)
train_size = int(n_rows * 0.7)

# Columns carried into each split, in the order the names below are unpacked.
split_columns = ['ticker', 'date', 'corpus_index', 'whole', 'ex_return', 'dir']

train_rows = slice(None, train_size)
val_rows = slice(train_size, train_size + val_size)
test_rows = slice(train_size + val_size, None)

train_ticker, train_date, train_corp, train_texts, train_ret, train_dir = (
    df[column][train_rows] for column in split_columns
)
val_ticker, val_date, val_corp, val_texts, val_ret, val_dir = (
    df[column][val_rows] for column in split_columns
)
test_ticker, test_date, test_corp, test_texts, test_ret, test_dir = (
    df[column][test_rows] for column in split_columns
)

In [0]:
# preprocess data: vectorize it


from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

def tfidf(data, max_features):
    """Fit a TF-IDF vectorizer with English stop words removed.

    Parameters
    ----------
    data : iterable of str
        Documents used to learn the vocabulary and IDF weights.
    max_features : int or None
        Passed to ``TfidfVectorizer(max_features=...)``.

    Returns
    -------
    The fitted ``TfidfVectorizer``; call ``.transform`` on it to vectorize text.
    """
    import nltk

    try:
        stop_words = stopwords.words('english')
    except LookupError:
        # NLTK signals a missing corpus with LookupError. Catching only that
        # (instead of a bare `except:`) lets real failures and
        # KeyboardInterrupt propagate rather than triggering a download.
        nltk.download('stopwords')
        stop_words = stopwords.words('english')

    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=max_features)
    return vectorizer.fit(data)


In [0]:
def accuracy_score(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Label sequences of identical shape. Inputs are converted with
        ``np.asarray`` and compared by position, so a pandas Series whose
        index does not start at 0 is handled correctly.

    Returns
    -------
    float
        Number of matching positions divided by the number of labels.

    Raises
    ------
    ValueError
        If the inputs differ in shape or are empty.
    """
    truth = np.asarray(y_true)
    guess = np.asarray(y_pred)

    if truth.shape != guess.shape:
        raise ValueError(
            'y_true and y_pred must have the same shape, got %s and %s'
            % (truth.shape, guess.shape)
        )
    if truth.size == 0:
        raise ValueError('accuracy is undefined for empty input')

    return int(np.count_nonzero(truth == guess)) / truth.size

def f1_score(y_true, y_pred):
    """Return the F1 score of the positive class for labels coded +1 / -1.

    Parameters
    ----------
    y_true, y_pred : array-like
        Label sequences of identical shape. ``1`` is the positive class and
        ``-1`` the negative class; positions holding any other value are
        not counted. Inputs are compared by position.

    Returns
    -------
    float
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        there are no true positives (the ratio would otherwise divide by zero).

    Raises
    ------
    ValueError
        If the inputs differ in shape.
    """
    truth = np.asarray(y_true)
    guess = np.asarray(y_pred)

    if truth.shape != guess.shape:
        raise ValueError(
            'y_true and y_pred must have the same shape, got %s and %s'
            % (truth.shape, guess.shape)
        )

    tp = int(np.count_nonzero((truth == 1) & (guess == 1)))
    fp = int(np.count_nonzero((truth == -1) & (guess == 1)))
    fn = int(np.count_nonzero((truth == 1) & (guess == -1)))

    # With tp == 0 precision and/or recall are 0/0 or their sum is 0;
    # report 0.0 instead of raising ZeroDivisionError.
    if tp == 0:
        return 0.0

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return (2 * precision * recall) / (precision + recall)


def get_accuracy(y_pred, df, set):
  """Score direction predictions after aggregating them per (ticker, date).

  Row-level predictions are summed within each (ticker, date) group; a
  positive sum becomes +1, anything else (including a tie at 0) becomes -1.
  The group's true label is the first `dir` value in the group.

  Parameters
  ----------
  y_pred : array-like
      One prediction per row of the chosen split (assumed to be +1 / -1
      labels - TODO confirm with callers).
  df : pandas.DataFrame
      Full data frame with `ticker`, `date` and `dir` columns.
  set : str
      One of 'train', 'val', 'test'. The split boundaries come from the
      notebook-level `train_size` and `val_size`.

  Returns
  -------
  (accuracy, f1) computed on the aggregated labels.
  """
  data_set = {'train' : df[:train_size], 'val' : df[train_size:train_size+val_size], 'test' : df[train_size+val_size:]}
  # Work on a copy so the `pred` column is never written into the caller's frame.
  df1 = data_set[set].copy()

  df1['pred'] = y_pred
  grouped = df1.groupby(['ticker', 'date'])
  # Select the column before aggregating: summing the whole frame would also
  # "sum" (concatenate) the text columns just to throw the result away.
  agg_pred = grouped['pred'].sum().apply(lambda x: 1 if x > 0 else -1).values.tolist()
  true_pred = grouped['dir'].first().values.tolist()
  acc = accuracy_score(true_pred,agg_pred)
  f1 = f1_score(true_pred, agg_pred)
  return acc, f1


def get_r2_mse(y_pred, df, set):
  """Score return predictions after averaging them per (ticker, date).

  Row-level predictions are averaged within each (ticker, date) group and
  compared with the group's first `ex_return` value.

  Parameters
  ----------
  y_pred : array-like
      One predicted return per row of the chosen split.
  df : pandas.DataFrame
      Full data frame with `ticker`, `date` and `ex_return` columns.
  set : str
      One of 'train', 'val', 'test'. The split boundaries come from the
      notebook-level `train_size` and `val_size`.

  Returns
  -------
  (r2, mse) on the aggregated values, computed with the notebook's
  `r2_score` and scikit-learn's `mean_squared_error`.
  """
  data_set = {'train' : df[:train_size], 'val' : df[train_size:train_size+val_size], 'test' : df[train_size+val_size:]}
  # Work on a copy so the `pred` column is never written into the caller's frame.
  df1 = data_set[set].copy()

  df1['pred'] = y_pred
  grouped = df1.groupby(['ticker', 'date'])
  # Select the column before aggregating: a frame-wide mean() fails on the
  # string columns under pandas >= 2.0 (numeric_only defaults to False).
  agg_pred = grouped['pred'].mean().values.tolist()
  true_pred = grouped['ex_return'].first().values.tolist()

  return r2_score(true_pred, agg_pred), mean_squared_error(true_pred, agg_pred)



In [0]:
# classification: TF-IDF features + Complement Naive Bayes on the up/down label

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.naive_bayes import ComplementNB

def surprise_me(train_texts, val_texts, test_texts, df, max_features, alpha):
    """Fit a Complement Naive Bayes direction classifier and report its scores.

    The TF-IDF vocabulary is learned on `train_texts` only and then applied
    to all three splits. Scores are computed per row (no per-(ticker, date)
    aggregation).

    Parameters
    ----------
    train_texts, val_texts, test_texts : iterable of str
        Documents for each split.
    df : pandas.DataFrame
        Not used by this function; kept for call compatibility.
    max_features : int
        Vocabulary size passed to `tfidf`.
    alpha : float
        Smoothing parameter passed to `ComplementNB`.

    Returns
    -------
    pandas.DataFrame
        Rows 'train', 'val', 'test'; columns 'acc' and 'f1'.

    Notes
    -----
    The targets are read from the notebook-level `train_dir`, `val_dir` and
    `test_dir`, which must be aligned with the text arguments.
    """
    vectorizer = tfidf(train_texts, max_features = max_features)
    X_train = vectorizer.transform(train_texts)
    X_val = vectorizer.transform(val_texts)
    X_test = vectorizer.transform(test_texts)

    y_train = np.array(train_dir)
    y_val = np.array(val_dir)
    y_test = np.array(test_dir)

    model = ComplementNB(alpha = alpha)
    model.fit(X_train, y_train)

    # Make prediction using the trained model
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)

    # The metric helpers are declared as (y_true, y_pred): pass labels first.
    train_acc = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    val_acc = accuracy_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    report = pd.DataFrame({'acc': [train_acc,val_acc,test_acc], 'f1':[train_f1, val_f1,test_f1]})
    report.index = ['train', 'val', 'test']

    print('surprise mf!')
    print('classification with Naive Bayes')
    return report

In [0]:
# Baseline classifier run: max_features=500, alpha=1.
# NOTE(review): the saved output below is a KeyboardInterrupt, i.e. this run was stopped before finishing.
surprise_me(train_texts, val_texts, test_texts, df, 500, 1)

KeyboardInterrupt: ignored

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline



def get_scores(train_texts, val_texts, test_texts, df, max_features_list):
    """Grid-search ComplementNB over vocabulary size and smoothing alpha.

    Parameters
    ----------
    train_texts, val_texts : iterable of str
        Training and validation documents.
    test_texts, df
        Not used by this function; kept for call compatibility.
    max_features_list : iterable of int
        TF-IDF vocabulary sizes to try.

    Returns
    -------
    list of [max_features, alpha, val_accuracy, val_f1]
        One entry per grid point, ordered by max_features (outer loop) then
        alpha (inner loop).

    Notes
    -----
    The alpha values come from the notebook-level `alpha_list`, and the
    targets from the notebook-level `train_dir` and `val_dir`.
    """
    # Labels do not depend on the grid point, so build them once.
    y_train = np.array(train_dir)
    y_val = np.array(val_dir)

    score = []
    for x in tqdm(max_features_list):
      # The TF-IDF matrices depend only on max_features, so fit and transform
      # once per x instead of once per (x, alpha) pair.
      vectorizer = tfidf(train_texts, max_features = x)
      X_train = vectorizer.transform(train_texts)
      X_val = vectorizer.transform(val_texts)

      for a in alpha_list:
        model = ComplementNB(alpha = a)
        model.fit(X_train, y_train)
        # Make prediction using the trained model
        y_val_pred = model.predict(X_val)
        # The metric helpers are declared as (y_true, y_pred): labels first.
        val_acc = accuracy_score(y_val, y_val_pred)
        val_f1 = f1_score(y_val, y_val_pred)
        score.append([x, a, val_acc, val_f1])
    return score

def get_top(features, alpha, score):
    """Return the grid point with the highest score.

    Parameters
    ----------
    features : sequence
        Candidate max_features values (outer grid dimension).
    alpha : sequence
        Candidate alpha values (inner grid dimension).
    score : sequence of numbers
        Flat list of scores, one per grid point. It is assumed to be ordered
        like the nested loops used elsewhere in this notebook
        (`for x in features: for a in alpha:`), i.e. `len(alpha)` consecutive
        entries per feature value.

    Returns
    -------
    (best_feature, best_alpha, best_score); the first occurrence wins on ties.

    Raises
    ------
    ValueError
        If `score` is empty.
    """
    if len(score) == 0:
        raise ValueError('score must contain at least one entry')

    highest_idx = max(range(len(score)), key=lambda position: score[position])
    # Each feature value owns len(alpha) consecutive slots, so the flat index
    # must be decoded with len(alpha), not len(features).
    max_feature_idx, a_idx = divmod(highest_idx, len(alpha))
    return features[max_feature_idx], alpha[a_idx], score[highest_idx]
    

def show_me_max_feature(max_features_list,alpha_list, alpha, accs, f1s):
    """Plot accuracy and F1 against max_features for one fixed alpha.

    `accs` and `f1s` hold one value per grid point, ordered with
    max_features as the outer dimension and alpha as the inner one. Only the
    rows whose alpha equals `alpha` are drawn.
    """
    # Rebuild the (max_features, alpha) pair belonging to every score entry.
    grid = [(n_features, a) for n_features in max_features_list for a in alpha_list]

    results = pd.DataFrame({
        'max_feature': [pair[0] for pair in grid],
        'alpha': [pair[1] for pair in grid],
        'accs': accs,
        'f1s': f1s,
    })
    selected = results[results['alpha'] == alpha].sort_values(by = 'max_feature')

    plt.plot(max_features_list, selected.accs, 'r--', max_features_list, selected.f1s)
    plt.legend(['acc', 'f1'])
    plt.xlabel('max_features')
    plt.ylabel('scores')
    plt.show()
    
def show_me_alpha(max_feature_list, alpha_list, max_feature, accs, f1s):
    """Plot accuracy and F1 against alpha for one fixed max_features value.

    Parameters
    ----------
    max_feature_list : sequence
        All max_features values of the grid (outer dimension).
    alpha_list : sequence
        All alpha values of the grid (inner dimension); used as the x axis.
    max_feature
        The max_features value whose scores are plotted.
    accs, f1s : sequences
        One accuracy / F1 value per grid point, ordered with max_features as
        the outer dimension and alpha as the inner one.
    """
    max_features_big_list = []
    alpha_big_list = []

    # Iterate the `max_feature_list` argument. The previous body read the
    # notebook-level `max_features_list` instead, ignoring the parameter.
    for x in max_feature_list:
      for a in alpha_list:
        max_features_big_list.append(x)
        alpha_big_list.append(a)

    df = pd.DataFrame({'max_feature': max_features_big_list, 'alpha_list': alpha_big_list,
                       'accs' : accs, 'f1s' : f1s})
    data = df[df['max_feature'] == max_feature].sort_values(by = 'alpha_list')

    accs = data.accs
    f1s = data.f1s
    plt.plot(alpha_list, accs,'r--', alpha_list,f1s)
    plt.legend(['acc', 'f1'])
    plt.xlabel('alpha')
    plt.ylabel('scores')
    plt.show()

    

In [0]:
# Classifier grid search over TF-IDF vocabulary size and ComplementNB alpha.
# get_scores reads `alpha_list` from the notebook namespace (it is not an argument),
# so it has to be defined before the call.
max_features_list = [500, 1000,1500,2000,2500,3000]
alpha_list = [0.0001, 0.001, 0.01, 0.1]
clf_report = get_scores(train_texts, val_texts, test_texts, df, max_features_list)


In [0]:
# Tabulate the grid results. Columns are positional:
# 0 = max_features, 1 = alpha, 2 = validation accuracy, 3 = validation F1.
clf_result = pd.DataFrame.from_records(clf_report)
# Top five grid points by validation accuracy.
clf_result.sort_values(by = 2, ascending=False).head()


In [0]:
# Refit the classifier with max_features=3000, alpha=0.1 and report train/val/test scores.
# NOTE(review): presumably the best grid-search setting - the saved notebook has no output to confirm it.
surprise_me(train_texts, val_texts, test_texts, df, 3000, 0.1)

In [0]:
def r2_score(y_true, y_pred):
  """Return the coefficient of determination, rounded to 4 decimals.

  Computes ``1 - ssr / sst`` where ``ssr`` is the sum of squared residuals
  and ``sst`` the sum of squared deviations of ``y_true`` from its mean.

  Parameters
  ----------
  y_true, y_pred : array-like of numbers
      Observed and predicted values of identical shape. Inputs are converted
      with ``np.asarray`` and compared by position, so a pandas Series whose
      index does not start at 0 is handled correctly.

  Returns
  -------
  float
      R^2 rounded to 4 decimals. When ``y_true`` is constant (``sst == 0``)
      the ratio is undefined; 1.0 is returned for a perfect prediction and
      0.0 otherwise.

  Raises
  ------
  ValueError
      If the inputs differ in shape or are empty.
  """
  truth = np.asarray(y_true, dtype=float)
  guess = np.asarray(y_pred, dtype=float)

  if truth.shape != guess.shape:
    raise ValueError(
        'y_true and y_pred must have the same shape, got %s and %s'
        % (truth.shape, guess.shape)
    )
  if truth.size == 0:
    raise ValueError('r2 is undefined for empty input')

  ssr = float(np.sum((guess - truth) ** 2))
  sst = float(np.sum((truth - truth.mean()) ** 2))

  # Constant target: avoid dividing by zero.
  if sst == 0.0:
    return 1.0 if ssr == 0.0 else 0.0

  return round(1 - ssr / sst, 4)



In [0]:


from sklearn.linear_model import Lasso
def better_not_surprise_me(train_texts, val_texts, test_texts, df, max_features, alpha):
    """Fit a Lasso regression on TF-IDF features and report R^2 / MSE per split.

    The TF-IDF vocabulary is learned on `train_texts` only. The regression
    targets are read from the notebook-level `train_ret`, `val_ret` and
    `test_ret`; `df` is accepted but not used.

    Returns a DataFrame with rows 'train', 'val', 'test' and columns
    'r2' (rounded to 3 decimals) and 'mse'.
    """
    fitted_vectorizer = tfidf(train_texts, max_features = max_features)
    features = {
        'train': fitted_vectorizer.transform(train_texts),
        'val': fitted_vectorizer.transform(val_texts),
        'test': fitted_vectorizer.transform(test_texts),
    }
    targets = {
        'train': np.array(train_ret),
        'val': np.array(val_ret),
        'test': np.array(test_ret),
    }

    regressor = Lasso(alpha = alpha)
    regressor.fit(features['train'], targets['train'])

    # Score each split with the fitted model, in report order.
    split_names = ['train', 'val', 'test']
    r2_values = []
    mse_values = []
    for split in split_names:
        predicted = regressor.predict(features[split])
        r2_values.append(round(r2_score(targets[split], predicted), 3))
        mse_values.append(mean_squared_error(targets[split], predicted))

    report = pd.DataFrame({'r2': r2_values, 'mse': mse_values})
    report.index = split_names

    print('surprise mf!')
    return report

In [0]:
# Lasso regression run with max_features=3000, alpha=0.1.
# NOTE(review): the saved output shows two extra printed numbers and R^2 values of order -1e33,
# which the current function body does not print - the output looks stale; re-run to confirm.
better_not_surprise_me(train_texts, val_texts, test_texts, df, 3000, 0.1)

-3.1822455789065545e+33
0.0
surprise mf!


Unnamed: 0,r2,mse
train,-3.1822460000000003e+33,9.806051
val,0.0,11.830985
test,-3.764515e+33,11.600308


In [0]:
import matplotlib.pyplot as plt
%matplotlib inline



def get_scores2(train_texts, val_texts, test_texts, df, max_features_list):
    """Grid-search Lasso regression over vocabulary size and alpha.

    Parameters
    ----------
    train_texts, val_texts : iterable of str
        Training and validation documents.
    test_texts, df
        Not used by this function; kept for call compatibility.
    max_features_list : iterable of int
        TF-IDF vocabulary sizes to try.

    Returns
    -------
    list of [max_features, alpha, val_r2, val_mse]
        One entry per grid point, ordered by max_features (outer loop) then
        alpha (inner loop).

    Notes
    -----
    The alpha values come from the notebook-level `alpha_list`, and the
    targets from the notebook-level `train_ret` and `val_ret`.
    """
    # Targets do not depend on the grid point, so build them once.
    y_train = np.array(train_ret)
    y_val = np.array(val_ret)

    scores = []
    for max_features in tqdm(max_features_list):
      # The TF-IDF matrices depend only on max_features, so fit and transform
      # once per value instead of once per (max_features, alpha) pair.
      vectorizer = tfidf(train_texts, max_features = max_features)
      X_train = vectorizer.transform(train_texts)
      X_val = vectorizer.transform(val_texts)

      for a in alpha_list:
        model = Lasso(alpha = a)
        model.fit(X_train, y_train)
        # Make prediction using the trained model
        y_val_pred = model.predict(X_val)
        r2 = r2_score(y_val, y_val_pred)
        mse = mean_squared_error(y_val, y_val_pred)
        scores.append([max_features, a, r2, mse])

    return scores


In [0]:
# Regression grid search over TF-IDF vocabulary size and Lasso alpha.
# get_scores2 reads `alpha_list` from the notebook namespace (it is not an argument),
# so it has to be defined before the call.
# (A preceding np.linspace(...) assignment to max_features_list was dead code:
# it was overwritten on the very next line, so it has been removed.)
max_features_list = [250, 1000,1500,2000,2500,3000]
alpha_list = [0.0001, .001, 0.01, 0.1]
scores = get_scores2(train_texts, val_texts, test_texts, df, max_features_list)






  0%|          | 0/6 [00:00<?, ?it/s][A[A[A[A



 17%|█▋        | 1/6 [02:48<14:02, 168.47s/it][A[A[A[A



 33%|███▎      | 2/6 [05:51<11:31, 172.89s/it][A[A[A[A



 50%|█████     | 3/6 [08:59<08:52, 177.41s/it][A[A[A[A



 67%|██████▋   | 4/6 [12:13<06:04, 182.25s/it][A[A[A[A



 83%|████████▎ | 5/6 [15:34<03:07, 187.92s/it][A[A[A[A



100%|██████████| 6/6 [19:18<00:00, 193.00s/it]


In [0]:

# Tabulate the regression grid results. Columns are positional:
# 0 = max_features, 1 = alpha, 2 = validation R^2, 3 = validation MSE.
reg_result = pd.DataFrame.from_records(scores)
# Top five grid points by validation R^2.
reg_result.sort_values(by = 2, ascending=False).head()

Unnamed: 0,0,1,2,3
21,3000,0.001,-0.0596,11.709455
17,2500,0.001,-0.0602,11.715217
5,1000,0.001,-0.0612,11.726831
9,1500,0.001,-0.0621,11.736115
0,250,0.0001,-0.0622,11.737554


In [0]:
# Refit the Lasso model with max_features=3000, alpha=0.0001 and report train/val/test scores.
# NOTE(review): the grid table shown above ranks (3000, 0.001) first, not alpha=0.0001 -
# confirm which alpha was intended here.
better_not_surprise_me(train_texts, val_texts, test_texts, df, 3000, 0.0001)

surprise mf!


Unnamed: 0,r2,mse
train,0.098,8.847331
val,-0.076,11.896247
test,-0.073,11.747469


In [0]:
# Sanity check of r2_score on a tiny example: ssr = 3, sst = 2, so the expected value is 1 - 3/2 = -0.5.
r2_score([1,2,3], [0,1,2])

-0.5

In [0]:
# Scratch check of the two sums used by r2_score, computed by hand on a tiny example.
y_true = [1,2,3]
y_pred = [1,1,2]
mu = np.array(y_true).mean()
# ssr: squared prediction errors; sst: squared deviations of y_true from its mean.
ssr = sum((predicted - observed)**2 for observed, predicted in zip(y_true, y_pred))
sst = sum((observed - mu)**2 for observed in y_true)

In [0]:
# Display the sum of squared residuals computed in the previous cell.
ssr

2