# Preparing

In [1]:
import numpy as np
import pandas as pd
import datetime
import math
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Raw Data

In [3]:
SEED=229

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
class Raw_Dataset:
    def __init__(self, tweet_file_path, nasdaq_file_path):
        self.tweet = Raw_Dataset.__read_tweet_dataset(tweet_file_path)
        self.nasdaq = Raw_Dataset.__read_nasdaq_dataset(nasdaq_file_path)
        self.data_df = self.__combine_data()
        self.train_val_df, self.test_df = self.__split_train_test()
        self.reweighted_train_val_df = self.__re_weighting()

    @staticmethod
    def __read_tweet_dataset(tweet_file_path):
        tweet = pd.read_csv(tweet_file_path)[['date', 'content']]
        tweet['date'] = pd.to_datetime(tweet['date'])
        tweet = tweet[tweet['date'] >= '2016-11-09'].reset_index(drop=True)
        return tweet
    
    @staticmethod
    def __read_nasdaq_dataset(nasdaq_file_path):
        nasdaq = pd.read_csv(nasdaq_file_path)[['Date', 'Close']]
        nasdaq['Date'] = pd.to_datetime(nasdaq['Date']).dt.strftime('%Y-%m-%d')

        nasdaq['Tweet_Start_Date'], nasdaq['Tweet_End_Date'] = np.NaN, np.NaN
        nasdaq['Return'], nasdaq['Return_Bucket'] = np.NaN, np.NaN
        for i, row in nasdaq.iterrows():
            if i - 1 < 0:
                continue
            nasdaq.loc[i, 'Tweet_Start_Date'] = nasdaq.iloc[i - 1]['Date']
            nasdaq.loc[i, 'Tweet_End_Date'] = nasdaq.iloc[i]['Date']
            ret = math.log(nasdaq.iloc[i]['Close'] / nasdaq.iloc[i - 1]['Close'])
            nasdaq.loc[i, 'Return'] = ret
            nasdaq.loc[i, 'Return_Bucket'] = 1 if ret < 0 else 0

        return nasdaq.iloc[1:]

    def __combine_data(self):
        tweet, nasdaq = self.tweet, self.nasdaq
        result = pd.DataFrame(columns=['date', 'content', 'nasdaq_date', 'return', 'return_bucket'])
        for i, row in nasdaq.iterrows():
            tweet_filtered = tweet[(tweet['date'] >= row['Tweet_Start_Date']) & \
                                   (tweet['date'] < row['Tweet_End_Date'])]
            tweet_filtered = tweet_filtered[tweet_filtered['content'].apply(lambda x: len(x.split(' '))) >= 5] 
            tweet_filtered['nasdaq_date'] = row['Tweet_End_Date']
            tweet_filtered['return'] = row['Return']
            tweet_filtered['return_bucket'] = row['Return_Bucket']
            tweet_filtered = tweet_filtered[['date', 'content', 'nasdaq_date', 'return', 'return_bucket']]
            result = result.append(tweet_filtered, ignore_index = True) 
        return result

    def __split_train_test(self):
        data_df = self.data_df
        train_val_df, test_df, _, _ = train_test_split(data_df, data_df['return_bucket'], test_size=0.20, random_state=SEED)
        print(f"Dataset Shape --- data_df {data_df.shape}, train_val_df {train_val_df.shape}, test_df {test_df.shape}")
        return train_val_df.reset_index(), test_df.reset_index()

    def __re_weighting(self):
        train_val_df = self.train_val_df
        ratio = len(train_val_df[train_val_df['return_bucket'] == 0]) // len(train_val_df[train_val_df['return_bucket'] == 1])
        if ratio < 1:
            return

        reweighted_train_val_df = train_val_df[train_val_df['return_bucket'] == 0]
        for i in range(ratio):
            reweighted_train_val_df = reweighted_train_val_df.append(train_val_df[train_val_df['return_bucket'] == 1])
        print(f'Reweighted Dataset Shape --- reweighted_ratio: {ratio}, reweighted_train_df: {reweighted_train_val_df.shape}')
        return reweighted_train_val_df

In [6]:
raw_dataset = Raw_Dataset(tweet_file_path='realdonaldtrump.csv', nasdaq_file_path='^IXIC.csv')
raw_dataset.data_df.head()

Dataset Shape --- data_df (11227, 5), train_val_df (8981, 5), test_df (2246, 5)
Reweighted Dataset Shape --- reweighted_ratio: 1, reweighted_train_df: (8981, 6)


Unnamed: 0,date,content,nasdaq_date,return,return_bucket
0,2016-11-09 05:36:58,Such a beautiful and important evening! The fo...,2016-11-10,-0.008082,1.0
1,2016-11-10 13:31:27,Happy 241st birthday to the U.S. Marine Corps!...,2016-11-11,0.00542,0.0
2,2016-11-10 20:10:46,A fantastic day in D.C. Met with President Oba...,2016-11-11,0.00542,0.0
3,2016-11-10 20:19:44,Just had a very open and successful presidenti...,2016-11-11,0.00542,0.0
4,2016-11-11 05:14:20,Love the fact that the small groups of protest...,2016-11-14,-0.003579,1.0


# Bag of Word Model

Encoding

In [7]:
class OneHot_Encoded_Dataset:
    def __init__(self, train_df):
        messages = train_df['content'].to_list()
        word_count, self.word_dictionary = OneHot_Encoded_Dataset.__create_dictionary(messages)
        train_df['text_in_array'] = train_df['content'].\
            apply(lambda x: OneHot_Encoded_Dataset.__transform_text(x, self.word_dictionary))
        self.X_train, self.y_train = OneHot_Encoded_Dataset.__to_numpy(train_df)
        self.X_test, self.y_test = None, None

    def encode_test_df(self, test_df):
        test_df['text_in_array'] = test_df['content'].\
            apply(lambda x: OneHot_Encoded_Dataset.__transform_text(x, self.word_dictionary))
        self.X_test, self.y_test = OneHot_Encoded_Dataset.__to_numpy(test_df)

    @staticmethod
    def __create_dictionary(messages):
        count = {}
        for message in messages:
            for word in OneHot_Encoded_Dataset.__get_words(message):
                revised_word = OneHot_Encoded_Dataset.__revise_word(word)
                if revised_word is not None:
                    count[revised_word] = count.get(revised_word, 0) + 1

        result, index = {}, 0
        for revised_word in count.keys():
            if count[revised_word] >= 5:
                result[revised_word] = index
                index += 1
        print(f'One-hot Encoding Dictionary Size --- {len(result)}')
        return count, result

    @staticmethod
    def __transform_text(message, word_dictionary):
        result = np.zeros(len(word_dictionary))
        for word in OneHot_Encoded_Dataset.__get_words(message):
            revised_word = OneHot_Encoded_Dataset.__revise_word(word)
            if revised_word is not None and revised_word in word_dictionary:
                result[word_dictionary[revised_word]] += 1
        return list(result)

    @staticmethod
    def __revise_word(word):
        if 'pic.twitter.com' in word or 'http' in word:
            return None
        word = ''.join(e for e in word if e.isalnum())
        return word

    @staticmethod
    def __get_words(message):
        return [word.lower() for word in message.split(' ')]
    
    @staticmethod
    def __to_numpy(df):
        x, y = [], []
        for _, row in df.iterrows():
            x_i, y_i = list(row['text_in_array']), row['return_bucket']
            x.append(x_i)
            y.append(y_i)
        return np.array(x), np.array(y)

In [8]:
onehot = OneHot_Encoded_Dataset(raw_dataset.train_val_df)
onehot.encode_test_df(raw_dataset.test_df)

One-hot Encoding Dictionary Size --- 3987


In [9]:
X_train, y_train = onehot.X_train, onehot.y_train
X_test, y_test = onehot.X_test, onehot.y_test
print(f'X_train Shape: {X_train.shape} | X_test Shape: {X_test.shape}')

X_train Shape: (8981, 3987) | X_test Shape: (2246, 3987)


Train

In [10]:
log_reg = LogisticRegression(penalty='l2')
log_reg.fit(X_train, y_train)

LogisticRegression()

In [11]:
y_pred = log_reg.predict(X_test).astype(int)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[815 484]
 [558 389]]
              precision    recall  f1-score   support

         0.0       0.59      0.63      0.61      1299
         1.0       0.45      0.41      0.43       947

    accuracy                           0.54      2246
   macro avg       0.52      0.52      0.52      2246
weighted avg       0.53      0.54      0.53      2246



# N-gram Bag of Word Model

Encoding

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(raw_dataset.train_val_df['content'])
X_test = vectorizer.transform(raw_dataset.test_df['content'])
print(f'Length of Vocabulary: {len(vectorizer.vocabulary_)}')
print(f'X_train Shape: {X_train.shape} | X_test Shape: {X_test.shape}')

Length of Vocabulary: 128304
X_train Shape: (8981, 128304) | X_test Shape: (2246, 128304)


In [14]:
y_train = raw_dataset.train_val_df['return_bucket']
y_test = raw_dataset.test_df['return_bucket']

Train

In [15]:
log_reg = LogisticRegression(penalty='l2')
log_reg.fit(X_train, y_train)

LogisticRegression()

In [16]:
y_pred = log_reg.predict(X_test).astype(int)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[809 490]
 [538 409]]
              precision    recall  f1-score   support

         0.0       0.60      0.62      0.61      1299
         1.0       0.45      0.43      0.44       947

    accuracy                           0.54      2246
   macro avg       0.53      0.53      0.53      2246
weighted avg       0.54      0.54      0.54      2246

