In [1]:
import pandas as pd
import numpy as np
import re
from gensim.models import word2vec
from joblib import dump
import nltk

df_train = pd.read_csv('train.txt', sep='\t')
df_valid = pd.read_csv('valid.txt', sep='\t')
df_test = pd.read_csv('test.txt', sep='\t')
df = pd.concat([df_train, df_valid, df_test], ignore_index=True)

In [2]:
def preprocess(sentence):
    new_sentence = []
    nltk.download('stopwords')
    stopwords_set = set(nltk.corpus.stopwords.words('english'))
    ps = nltk.stem.porter.PorterStemmer()
    for word in sentence.split():
        if word not in stopwords_set:
            new_word = re.sub(r'[^\w\s]', '', ps.stem(word.lower()))
            new_word = re.sub(r'[0-9]+', '0', new_word)
            new_sentence.append(new_word)
    return new_sentence

In [3]:
df = pd.concat([df_train, df_valid, df_test], ignore_index=True)
sentences = [preprocess(sentence) for sentence in df['title']]
model = word2vec.Word2Vec(sentences, min_count=1, seed=0)
dump(model, '051-wvmodel.joblib')
vecs = np.array([np.mean([model.wv[word] for word in sentence], axis=0) for sentence in sentences])
vecs.shape

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kogayurie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kogayurie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kogayurie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kogayurie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kogayurie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kogayurie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kogayurie/nltk_data...
[nltk_data]   Package stopwords i

(13340, 100)

In [4]:
df_feature = pd.concat([df.replace({'category': {'b': 0, 't': 1, 'e': 2, 'm': 3}}), pd.DataFrame(vecs)], axis=1)
df_feature_train = df_feature.iloc[:len(df_train), :]
df_feature_valid = df_feature.iloc[len(df_train):-len(df_test), :]
df_feature_test = df_feature.iloc[-len(df_test):, :]
print(len(df_feature_train), len(df_feature_valid), len(df_feature_test))
df_feature_train.head()

10672 1334 1334


Unnamed: 0,category,title,0,1,2,3,4,5,6,7,...,90,91,92,93,94,95,96,97,98,99
0,0,Bouygues confirms improved offer for Vivendi's...,-0.021988,-0.2302,-0.134264,-0.205677,0.146253,0.126102,0.077435,-0.18085,...,-0.040067,0.154747,0.114556,-0.214083,-0.070017,0.270001,-0.040857,0.04641,-0.235225,0.141587
1,0,"US stocks end off highs, Dow up for week; gold...",-0.057278,-0.514741,-0.161651,-0.444684,0.228405,0.202019,0.15487,-0.395289,...,-0.086756,0.329965,0.178112,-0.481202,-0.123144,0.667989,-0.069372,0.103533,-0.456702,0.236879
2,2,'Chef' Director: The Creative Driving Force Be...,-0.034246,-0.401016,-0.313533,-0.333279,0.288354,0.261323,0.122265,-0.296679,...,-0.069937,0.278954,0.238995,-0.334421,-0.140912,0.385918,-0.0799,0.101443,-0.43541,0.294425
3,0,South African labor minister to meet with stri...,-0.023189,-0.198104,-0.110058,-0.172944,0.117299,0.104816,0.063847,-0.158881,...,-0.034982,0.133081,0.098431,-0.185119,-0.054206,0.235719,-0.03526,0.044124,-0.20091,0.123526
4,2,Kaley Cuoco and her Big Bang Theory co-stars h...,-0.032282,-0.338466,-0.208339,-0.293357,0.214457,0.193731,0.110653,-0.266127,...,-0.057642,0.228295,0.181456,-0.302763,-0.10233,0.377941,-0.065904,0.078833,-0.350714,0.223618


In [5]:
df_feature_train.to_csv('train.feature.txt', index=False)
df_feature_valid.to_csv('valid.feature.txt', index=False)
df_feature_test.to_csv('test.feature.txt', index=False)