## Imports

In [None]:
import pandas as pd
import numpy as np
from keras.preprocessing import sequence
from sklearn import linear_model, model_selection
import gensim
import nltk

In [None]:
# Number of word vectors kept per sentence when padding/truncating the
# concatenated embedding
max_len = 30

# Pre-trained word2vec vectors in the binary word2vec format; notes.txt
# explains how to download the .bin file
w2v_model_name = '../GoogleNews-vectors-negative300.bin'
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
    w2v_model_name,
    binary=True,
)

## Helper Functions

In [None]:
def clean_text(text_arr):
    '''tokenize sentences and keep only lowercase alphabetic, non-stopword tokens

    Each sentence is split with NLTK's TweetTokenizer. Every token has
    leading/trailing '#' characters stripped and is lowercased before it is
    checked, so a hashtag such as "#Fire" is kept as "fire".

    Args:
        text_arr: list of strings, each representing sentences

    Returns:
        2d list of strings, each representing words (one inner list per
        input sentence, possibly empty)
    '''
    tweet_tokenizer = nltk.tokenize.TweetTokenizer()
    ignored = set(nltk.corpus.stopwords.words('english'))

    cleaned = []
    for sentence in text_arr:
        # normalize first: the alphabetic and stopword checks apply to the
        # stripped, lowercased form of each token
        normalized = (
            token.strip('#').lower()
            for token in tweet_tokenizer.tokenize(sentence)
        )
        cleaned.append(
            [token for token in normalized
             if token.isalpha() and token not in ignored]
        )
    return cleaned

def text_to_vec_addition(text_arr, embedding_model):
    '''embed sentences to vectors by adding word vectors together

    Words that are not in the embedding model are skipped. A sentence with
    no known words (including an empty sentence) maps to the zero vector.

    Args:
        text_arr: 2d list of strings, each representing words
        embedding_model: word-vector lookup that exposes ``vector_size``,
            supports ``word in embedding_model`` and returns the vector for
            a word via ``embedding_model[word]`` (e.g. gensim KeyedVectors)

    Returns:
        numpy array of shape (number of sentences, word vector_size)
    '''
    np_array = np.zeros((len(text_arr), embedding_model.vector_size))
    # Each ``row`` is a view into np_array, so in-place addition accumulates
    # the sentence embedding directly in the output.
    for row, sentence in zip(np_array, text_arr):
        for word in sentence:
            # Use ``in`` on the model itself rather than ``.vocab``: the
            # ``vocab`` attribute was removed from KeyedVectors in gensim 4.0,
            # while membership testing works on both 3.x and 4.x.
            if word in embedding_model:
                row += embedding_model[word]
    return np_array

def text_to_vec_concatenation(text_arr, embedding_model, max_words=None):
    '''embed sentences to vectors by concatenating word vectors together

    Words that are not in the embedding model are skipped. The vectors of
    the remaining words are laid out back to back; sentences with more than
    ``max_words`` known words are truncated at the end, and shorter ones are
    zero-padded at the end.

    Args:
        text_arr: 2d list of strings, each representing words
        embedding_model: word-vector lookup that exposes ``vector_size``,
            supports ``word in embedding_model`` and returns the vector for
            a word via ``embedding_model[word]`` (e.g. gensim KeyedVectors)
        max_words: number of word vectors kept per sentence; defaults to the
            module-level ``max_len`` when omitted

    Returns:
        float32 numpy array of shape
        (number of sentences, word vector_size*max_words)
    '''
    if max_words is None:
        # Backward-compatible default: fall back to the notebook-wide setting.
        max_words = max_len

    dim = embedding_model.vector_size
    np_array = np.zeros((len(text_arr), max_words * dim), dtype='float32')
    for row, sentence in zip(np_array, text_arr):
        # Use ``in`` on the model itself rather than ``.vocab``: the ``vocab``
        # attribute was removed from KeyedVectors in gensim 4.0.
        known = [word for word in sentence if word in embedding_model]
        # Keeping only the first max_words known words is the 'post'
        # truncation; untouched trailing zeros are the 'post' padding.
        for slot, word in enumerate(known[:max_words]):
            row[slot * dim:(slot + 1) * dim] = embedding_model[word]
    return np_array

## Train

In [None]:
# read the training set: cleaned, tokenized tweets plus their labels
train_df = pd.read_csv('../train.csv')
train_x_str = clean_text(list(train_df['text']))
train_y = list(train_df['target'])

# embed every sentence as the sum of its word2vec vectors
train_x = text_to_vec_addition(text_arr=train_x_str, embedding_model=w2v_model)
print(train_x.shape)

In [None]:
# ridge classifier, evaluated with 3-fold cross-validation on F1
clf = linear_model.RidgeClassifier()
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_x,
    y=train_y,
    scoring='f1',
    cv=3,
)
print(scores)

# train the final classifier on the full training set
clf.fit(X=train_x, y=train_y)

## Test

In [None]:
# read the test set and clean it the same way as the training data
test_df = pd.read_csv('../test.csv')
test_x_str = clean_text(list(test_df['text']))

# embed with the same additive word2vec scheme used for training
test_x = text_to_vec_addition(text_arr=test_x_str, embedding_model=w2v_model)

# fill the submission template with the predictions and write it out
sample_submission = pd.read_csv("../sample_submission.csv")
sample_submission = sample_submission.assign(target=clf.predict(test_x))
sample_submission.to_csv("submission.csv", index=False)