In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from tqdm import tqdm

In [None]:
def read_emb(path):
    """Load an embeddings CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, read with pandas' default options.
    """
    return pd.read_csv(path)

In [None]:
# PERMCO values this notebook iterates over; each one is matched against the
# 'PERMCO' column of the embeddings frame when training per-PERMCO models.
PERMCO_LIST = [
    2381, 11937, 20064, 20253, 20333,
    20440, 20678, 20868, 20972, 21322,
    21401, 21576, 21771, 21795, 22113,
    26024, 29122, 29139, 29634, 34829,
    34920, 35222, 37900, 38393, 42001,
]

In [None]:
def parse_list(string):
    """Parse the text form of a flat numeric list (e.g. ``'[0.1, -2, 3.5]'``).

    Parameters
    ----------
    string : str
        Comma-separated numbers, optionally wrapped in square brackets and
        optionally surrounded by whitespace.

    Returns
    -------
    list of float
        The parsed values; an empty list for ``'[]'`` or a blank string.

    Raises
    ------
    ValueError
        If any comma-separated token is not a valid float. Malformed input
        such as ``'[1,,2]'`` is deliberately not repaired, so a corrupted
        vector cannot silently change length.
    """
    # Drop outer whitespace first; otherwise strip('[]') would stop at the
    # whitespace and leave the brackets attached to the first/last token.
    inner = string.strip().strip('[]')
    if not inner.strip():
        # '[]' used to reach float('') and raise ValueError.
        return []
    return [float(token.strip()) for token in inner.split(',')]

def split_train_test(df):
    """Split ``df`` into train and test frames and parse their embeddings.

    Rows are selected by the 'datatype' column ('train' / 'test'); in each
    subset the 'feature' column is converted from its string form to a list of
    floats with ``parse_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'datatype' and 'feature'.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``. Both are independent copies, so ``df`` is never
        modified and callers can add columns to the results safely.
    """
    # Explicit copies: writing a column onto a bare .loc slice triggers
    # pandas' SettingWithCopyWarning and leaves it ambiguous whether the
    # parent frame is affected.
    train = df.loc[df['datatype'] == 'train'].copy()
    test = df.loc[df['datatype'] == 'test'].copy()

    train['feature'] = train['feature'].apply(parse_list)
    test['feature'] = test['feature'].apply(parse_list)

    return train, test

def split_train_test_sample(df, frac=0.5, random_state=None):
    """Randomly subsample ``df``, then split it into train and test frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'datatype' and 'feature'.
    frac : float, default 0.5
        Fraction of rows to keep; ``int(frac * len(df))`` rows are drawn
        without replacement (the ``DataFrame.sample`` default).
    random_state : int, numpy Generator/RandomState or None, default None
        Forwarded to ``DataFrame.sample``. Pass a fixed value to make the
        subsample reproducible; ``None`` keeps the original unseeded behavior.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` selected via the 'datatype' column, with 'feature'
        parsed into lists of floats by ``parse_list``. Both are independent
        copies of the sampled rows.
    """
    sampled = df.sample(n=int(frac * df.shape[0]), random_state=random_state)

    # Explicit copies avoid SettingWithCopyWarning when 'feature' is replaced.
    train = sampled.loc[sampled['datatype'] == 'train'].copy()
    test = sampled.loc[sampled['datatype'] == 'test'].copy()

    train['feature'] = train['feature'].apply(parse_list)
    test['feature'] = test['feature'].apply(parse_list)

    return train, test

def get_data_for_model(train, test):
    """Extract model inputs and targets from the train and test frames.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames providing a 'feature' column (one embedding per row) and a
        'moving_prc_dir' column (the target).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` items are numpy
        arrays built from the 'feature' column and the ``y`` items are the
        'moving_prc_dir' Series of the corresponding frame.
    """
    def _embedding_array(frame):
        # Convert the column of per-row embeddings into a single numpy array.
        return np.array(frame['feature'].values.tolist())

    features = [_embedding_array(part) for part in (train, test)]
    targets = [part['moving_prc_dir'] for part in (train, test)]

    return features[0], features[1], targets[0], targets[1]

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Inclusive first/last positions in PERMCO_LIST to process; the training loop
# slices PERMCO_LIST[start_PERMCO_idx:end_PERMCO_idx + 1].
start_PERMCO_idx, end_PERMCO_idx = 0, 24

import time

def train_and_get_sentiment_scores(df, model_option):
    """Fit one logistic-regression classifier per (PERMCO, year) and save the scores.

    For every PERMCO in ``PERMCO_LIST[start_PERMCO_idx:end_PERMCO_idx + 1]`` and
    every year from 2005 through 2012, the matching rows of ``df`` are split
    into train/test sets, the regularisation strength ``C`` is tuned with a
    5-fold grid search on accuracy, and the refit model scores the test rows.

    Two CSV files are written under ``./scores`` (created if missing):

    * ``scores_{model_option}.csv``: all test rows with added
      'sentiment_score' (second column of ``predict_proba``) and 'prediction'.
    * ``results_{model_option}.csv``: one row per fitted (PERMCO, year) with
      accuracy, precision and the selected ``C``.

    Combinations that cannot be fitted (no train rows, no test rows, or a
    single target class in the train rows) are reported and skipped instead of
    aborting the whole run.

    Parameters
    ----------
    df : pandas.DataFrame
        Embeddings frame with 'year' and 'PERMCO' columns, plus the columns
        read by ``split_train_test`` and ``get_data_for_model``
        ('datatype', 'feature', 'moving_prc_dir').
    model_option : str
        Label of the embedding model; stored in the results and used in the
        output file names.

    Returns
    -------
    None
    """
    import os  # local import: only needed to create the output directory

    start_time = time.time()

    # Create the output directory up front so a long training run cannot fail
    # at the very end just because './scores' does not exist.
    os.makedirs('./scores', exist_ok=True)

    results = []
    scores = []
    for PERMCO in tqdm(PERMCO_LIST[start_PERMCO_idx:end_PERMCO_idx+1]):
        for y in range(2005, 2013):
            df_iter = df.loc[(df['year'] == y) & (df['PERMCO'] == PERMCO)]

            print('..splitting data')
            train, test = split_train_test(df_iter)

            # Fitting or predicting on zero rows raises inside scikit-learn and
            # would discard every result collected so far.
            if train.empty or test.empty:
                print(f'..skipping PERMCO {PERMCO}, year {y}: no train or test rows')
                continue

            print('..preparing data for model')
            X_train, X_test, y_train, y_test = get_data_for_model(train, test)

            # LogisticRegression cannot be fitted on a single target class.
            if y_train.nunique() < 2:
                print(f'..skipping PERMCO {PERMCO}, year {y}: only one class in train data')
                continue

            parameters = [{'C':[0.001, 0.01, 0.1, 1]}]

            ## Train the Model
            print('..tuning hyperparameters and training')
            lr_model = LogisticRegression(penalty='l2', max_iter=200,random_state=1234)
            grid_search = GridSearchCV(estimator=lr_model,
                                        param_grid=parameters,
                                        scoring='accuracy',
                                        cv=5)

            grid_search.fit(X_train, y_train)

            # Validate the model
            print('..getting prediction')
            predictions = grid_search.predict(X_test)
            # Column 1 is the probability of the second entry of classes_
            # (presumably the "up" label - TODO confirm the label encoding).
            sentiment_score = grid_search.predict_proba(X_test)[:, 1]
            accuracy = accuracy_score(y_test, predictions)
            precision = precision_score(y_test, predictions)

            # assign() builds a new frame, so this never writes into a slice of
            # ``df`` regardless of how split_train_test produced ``test``.
            test = test.assign(sentiment_score=sentiment_score,
                               prediction=predictions)

            scores.append(test)

            results.append({
                'PERMCO': PERMCO,
                'start_year': y,
                'model': model_option,
                'accuracy': accuracy,
                'precision': precision,
                'best_param_C': grid_search.best_params_['C']
            })

    if scores:
        all_scores = pd.concat(scores, ignore_index=True)
        all_scores.to_csv(f'./scores/scores_{model_option}.csv', index=False)

        all_result = pd.DataFrame(results)
        all_result.to_csv(f'./scores/results_{model_option}.csv', index=False)

        print(all_result)
    else:
        # pd.concat raises on an empty list; report instead of crashing.
        print(f'No (PERMCO, year) combination could be fitted for {model_option}; nothing saved.')

    print("--- %s mins ---" % ((time.time() - start_time)/60))

In [None]:
# Each pair of cells below loads one embeddings CSV with read_emb and passes it to
# train_and_get_sentiment_scores together with the label used in the output file names.
# NOTE(review): every frame is bound to its own global name, so all ten loaded frames stay
# referenced after their run; consider `del`-ing them if memory becomes a problem.
bert_emb = read_emb('./embeddings/embeddings_bert.csv')

In [None]:
# Writes ./scores/scores_bert.csv and ./scores/results_bert.csv.
train_and_get_sentiment_scores(bert_emb, 'bert')

In [None]:
# NOTE(review): 'every3Y_wEarlyStopping' presumably means fine-tuned every three years
# with early stopping - confirm against the pipeline that produced these files.
bert_finetuned_emb = read_emb('./embeddings/embeddings_finetuned_bert_every3Y_wEarlyStopping.csv')

In [None]:
# Writes ./scores/scores_bert_finetuned.csv and ./scores/results_bert_finetuned.csv.
train_and_get_sentiment_scores(bert_finetuned_emb, 'bert_finetuned')

In [None]:
roberta_emb = read_emb('./embeddings/embeddings_roberta.csv')

In [None]:
# Writes ./scores/scores_roberta.csv and ./scores/results_roberta.csv.
train_and_get_sentiment_scores(roberta_emb, 'roberta')

In [None]:
roberta_finetuned_emb = read_emb('./embeddings/embeddings_finetuned_roberta_every3Y_wEarlyStopping.csv')

In [None]:
# Writes ./scores/scores_roberta_finetuned.csv and ./scores/results_roberta_finetuned.csv.
train_and_get_sentiment_scores(roberta_finetuned_emb, 'roberta_finetuned')

In [None]:
dBert_emb = read_emb('./embeddings/embeddings_distilbert.csv')

In [None]:
# Writes ./scores/scores_distilbert.csv and ./scores/results_distilbert.csv.
train_and_get_sentiment_scores(dBert_emb, 'distilbert')

In [None]:
dBert_finetuned_emb = read_emb('./embeddings/embeddings_finetuned_distilbert_every3Y_wEarlyStopping.csv')

In [None]:
# Writes ./scores/scores_distilbert_finetuned.csv and ./scores/results_distilbert_finetuned.csv.
train_and_get_sentiment_scores(dBert_finetuned_emb, 'distilbert_finetuned')

In [None]:
dRoberta_emb = read_emb('./embeddings/embeddings_distilroberta.csv')

In [None]:
# Writes ./scores/scores_distilroberta.csv and ./scores/results_distilroberta.csv.
train_and_get_sentiment_scores(dRoberta_emb, 'distilroberta')

In [None]:
dRoberta_finetuned_emb = read_emb('./embeddings/embeddings_finetuned_distilroberta_every3Y_wEarlyStopping.csv')

In [None]:
# Writes ./scores/scores_distilroberta_finetuned.csv and ./scores/results_distilroberta_finetuned.csv.
train_and_get_sentiment_scores(dRoberta_finetuned_emb, 'distilroberta_finetuned')

In [None]:
finbert_emb = read_emb('./embeddings/embeddings_finbert.csv')

In [None]:
# Writes ./scores/scores_finbert.csv and ./scores/results_finbert.csv.
train_and_get_sentiment_scores(finbert_emb, 'finbert')

In [None]:
finbert_finetuned_emb = read_emb('./embeddings/embeddings_finetuned_finbert_every3Y_wEarlyStopping.csv')

In [None]:
# Writes ./scores/scores_finbert_finetuned.csv and ./scores/results_finbert_finetuned.csv.
train_and_get_sentiment_scores(finbert_finetuned_emb, 'finbert_finetuned')