In [1]:
import pandas as pd
import numpy as np
import os.path
import datetime
import BoW
from sklearn.model_selection import train_test_split
import string
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import pickle

In [2]:
# Data Loading
# The training CSV is looked up relative to the parent of the current working
# directory: <parent>/0.Raw_data/train/Combined_News_DJIA_train.csv
# Adjust the path below if the training data is stored elsewhere on your PC.
path = os.getcwd()
parent_folder, current_folder = os.path.split(path)
train_csv_path = parent_folder + '/0.Raw_data/train/Combined_News_DJIA_train.csv'
df = pd.read_csv(train_csv_path)

In [3]:
def cleaning(mess):
    """
    Turn a string of text into a list of cleaned words.

    Performs the following:
    1. Lower case of all words
    2. Remove all punctuation (every character in string.punctuation)
    3. Split on whitespace
    4. Remove all English stopwords

    :param mess: string of text
    :return: list of the cleaned words (possibly empty)
    """

    # Lower-case first: the stopword comparison below is case-sensitive, so a
    # capitalised stopword such as "The" would otherwise slip through.
    nopunc = ''.join(char for char in mess.lower() if char not in string.punctuation)

    # Fetch the stopword list once per call (not once per word) and keep it in
    # a set so each membership test is a constant-time lookup.
    stop_words = set(stopwords.words('english'))

    # Note: punctuation is stripped before this check, so contractions such as
    # "don't" arrive here as "dont".
    return [word for word in nopunc.split() if word not in stop_words]

In [4]:
# Data Cleaning of the headline columns (column positions 2..26)

# Make sure every headline cell is a string (missing values become 'nan').
df.iloc[:,2:27] = df.iloc[:,2:27].applymap(str)

# Remove the b' / b" prefix which is at the beginning of some headlines.
# The pattern is anchored with ^ so that the same two characters occurring
# inside a headline (e.g. "Arab's") are left untouched.
df.iloc[:,2:27] = df.iloc[:,2:27].replace(regex="^b['\"]", value="")
df.iloc[:,2:27] = df.iloc[:,2:27].apply(lambda x: x.astype(str).str.lower())

# Parse the Date column into datetime values.
df['Date'] = pd.to_datetime(df['Date'])

In [5]:
# Tokenise the headline columns (positions 2..26) in place: every cell is
# replaced by the list of words returned by cleaning().
for i in range(2,27):
    headline_column = df.iloc[:, i]
    df.iloc[:, i] = headline_column.apply(cleaning)

In [6]:
val_size = 0.2
train_length = int(len(df)*(1-val_size))
train = df.iloc[:train_length+1,:]
val = df.iloc[train_length:,:]

In [7]:
X_train = train.loc[:,]
X_val = val.loc[:,]
y_train = df.iloc[:train_length+1,:]
y_train = y_train['Label']
y_val = df.iloc[train_length:,:]
y_val = y_val['Label']

In [58]:
def w2v_create(X_train):
    '''
    Trains a Word2Vec model on the word lists of a Dataframe and pickles it.

    Every cell of every object-dtype column is handed to Word2Vec as one
    sentence (column by column, rows in order). The model is built with
    min_count=4 and written to 'w2v_model.pk' in the current working directory.
    :param X_train: Dataframe whose object-dtype columns contain lists of words
    :return: None
    '''
    text_columns = X_train.loc[:, X_train.dtypes == object]
    sentences = [
        tokens
        for position in range(text_columns.shape[1])
        for tokens in text_columns.iloc[:, position].tolist()
    ]
    w2v_model = Word2Vec(sentences, min_count=4)
    with open('w2v_model.pk', 'wb') as model_file:
        pickle.dump(w2v_model, model_file)

# Cache of unpickled models: path -> (file signature, model). It keeps
# transform_vocab from re-reading the pickle for every single headline.
_W2V_MODEL_CACHE = {}


def _load_w2v_model(path='w2v_model.pk'):
    '''Return the pickled model stored at `path`, unpickling again only if the file changed.'''
    file_stat = os.stat(path)
    signature = (file_stat.st_mtime_ns, file_stat.st_size)
    cached = _W2V_MODEL_CACHE.get(path)
    if cached is None or cached[0] != signature:
        # NOTE: pickle.load can execute arbitrary code - only load a file that
        # was written by w2v_create, never one from an untrusted source.
        with open(path, 'rb') as f:
            cached = (signature, pickle.load(f))
        _W2V_MODEL_CACHE[path] = cached
    return cached[1]


def transform_vocab(wordlist, seq_len=20):
    '''
    Use dataframe.apply(transform_vocab) to transform the list of words in each Dataframe column to a padded sequence of word vectors.

    Words that are not in the model vocabulary are dropped. The remaining word
    vectors are truncated to the first `seq_len` words and zero-padded at the
    front. If no word is known, the vector of the placeholder word 'nan' is
    used when the model knows it, otherwise the result is all zeros.
    :param wordlist: list of words
    :param seq_len: length of the padded sequence (default: 20)
    :return: numpy array of shape (seq_len, model.vector_size), dtype float64
    :raises FileNotFoundError: if 'w2v_model.pk' does not exist in the working directory
    '''
    model = _load_w2v_model()
    vocab = model.wv.vocab
    filtered_wl = [word for word in wordlist if word in vocab]
    if not filtered_wl and 'nan' in vocab:
        # Placeholder for headlines without any known word; only usable when
        # the model actually learned 'nan' (looking it up otherwise raises).
        filtered_wl = ['nan']

    # Start from all zeros so that whatever is not overwritten is the padding.
    padded = np.zeros((seq_len, model.vector_size))
    kept_words = filtered_wl[:seq_len]
    if kept_words:
        vectors = np.asarray(model.wv[kept_words])
        # Left-pad: the word vectors occupy the last rows of the sequence.
        padded[seq_len - len(vectors):] = vectors
    return padded

def w2v_transform(X_input):
    '''
    Transforms the cleaned Dataframe from containing a list of words to a flat list of padded sequences of word vectors.

    Only columns whose name contains 'Top' are transformed. The result is
    ordered row by row: all 'Top' columns of the first row, then all 'Top'
    columns of the second row, and so on. This is the order required for
    np.asarray(result).reshape(-1, <number of 'Top' columns>, 20, 100) to put
    the headlines of one row into one block.
    :param X_input: cleaned Dataframe whose 'Top' columns contain lists of words
    :return: list (length: rows * 'Top' columns) of the arrays returned by transform_vocab
    '''
    top_positions = [X_input.columns.get_loc(c) for c in X_input.filter(like='Top').columns]
    # Rows in the outer loop, columns in the inner loop: the headlines of one
    # row must be adjacent in the output (see docstring).
    return [
        transform_vocab(X_input.iat[row, col])
        for row in range(X_input.shape[0])
        for col in top_positions
    ]

In [13]:
# Fit the Word2Vec model on the training split and pickle it to 'w2v_model.pk'
# (read back later by transform_vocab).
w2v_create(X_train)

In [59]:
# Vectorise every 'Top' headline cell of the training split into a flat list
# of padded word-vector arrays.
X_t = w2v_transform(X_train)

In [60]:
# Stack the list of per-headline arrays into one ndarray and display its shape.
X = np.asarray(X_t)
X.shape

(31700, 20, 100)

In [61]:
# Regroup the flat headline axis into blocks of 25 consecutive arrays (25 is
# the number of headline columns, positions 2..26), giving
# (n_blocks, 25, 20, 100).
# NOTE(review): a block holds the 25 headlines of one row only if
# w2v_transform emits its arrays row by row - confirm against its loop order.
X = X.reshape(-1,25,20,100)
X.shape

In [None]:
# Pickle the vectorised training array to 'W2V_X_Train.pk' in the working directory.
with open('W2V_X_Train.pk', 'wb') as fin:
    pickle.dump(X, fin)

In [64]:
# Validation split: vectorise the 'Top' headline cells, stack them into one
# ndarray and regroup into blocks of 25 headlines -> (n_blocks, 25, 20, 100).
X_va = w2v_transform(X_val)
Xval = np.asarray(X_va).reshape(-1, 25, 20, 100)

In [68]:
# Pickle the vectorised validation array to 'W2V_X_Val.pk' in the working directory.
with open('W2V_X_Val.pk', 'wb') as fin:
    pickle.dump(Xval, fin)