# NLP Training


## import pkgs


In [1]:
import gensim
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline

## load dataset

In [2]:
DATASET_DIR = './data'
TRAIN_DATASET = '{}/train_set.csv'.format(DATASET_DIR)
TEST_DATASET = '{}/test_set.csv'.format(DATASET_DIR)

df_train = pd.read_csv(TRAIN_DATASET)
df_test = pd.read_csv(TEST_DATASET)


## explore the dataset


In [3]:
df_train.head()

Unnamed: 0,id,article,word_seg,class
0,0,7368 1252069 365865 755561 1044285 129532 1053...,816903 597526 520477 1179558 1033823 758724 63...,14
1,1,581131 165432 7368 957317 1197553 570900 33659...,90540 816903 441039 816903 569138 816903 10343...,3
2,2,7368 87936 40494 490286 856005 641588 145611 1...,816903 1012629 957974 1033823 328210 947200 65...,12
3,3,299237 760651 299237 887082 159592 556634 7489...,563568 1239563 680125 780219 782805 1033823 19...,13
4,4,7368 7368 7368 865510 7368 396966 995243 37685...,816903 816903 816903 139132 816903 312320 1103...,12


In [4]:
df_train.describe()

Unnamed: 0,id,class
count,102277.0,102277.0
mean,51138.0,10.262356
std,29524.971078,5.370785
min,0.0,1.0
25%,25569.0,6.0
50%,51138.0,10.0
75%,76707.0,15.0
max,102276.0,19.0


In [5]:
df_test.head()

Unnamed: 0,id,article,word_seg
0,0,7368 146447 316564 42610 55736 297797 93042 53...,816903 565958 726082 764656 335008 75094 20282...
1,1,985531 473628 1044285 1121849 206763 462208 11...,729468 520477 529032 101368 335130 520477 1113...
2,2,7368 7368 7368 7368 7368 7368 7368 7368 7368 7...,816903 816903 816903 816903 816903 816903 8169...
3,3,529819 1226459 856005 1177293 663773 272235 93...,231664 1033823 524850 330478 507199 520477 618...
4,4,42610 1252069 1077049 955883 1125260 1044285 2...,545370 379223 162767 520477 1194630 1197475 11...


In [6]:
df_test.describe()

Unnamed: 0,id
count,102277.0
mean,51138.0
std,29524.971078
min,0.0
25%,25569.0
50%,51138.0
75%,76707.0
max,102276.0


## split training set


In [7]:
X = df_train.drop(columns = ['class'])
y = df_train['class']


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 2019)


In [9]:
X_train.head()


Unnamed: 0,id,article,word_seg
64388,64388,768900 151943 151943 180386 1044285 768900 155...,572782 887986 323159 520477 572782 668734 3231...
93829,93829,114495 721845 893126 57871 114495 721845 75060...,471784 1145236 1159844 460600 805193 45883 103...
2103,2103,1191611 348926 964907 7368 1191611 466409 7184...,597592 1243744 816903 221847 1082433 1060048 8...
2753,2753,42610 1252069 316188 316188 839598 143738 3163...,545370 965863 138140 502070 816903 327218 1312...
58622,58622,7368 1209583 961786 7368 755561 345037 994077 ...,816903 153705 1224594 816903 769051 526832 122...


In [10]:
y.head()


0    14
1     3
2    12
3    13
4    12
Name: class, dtype: int64

## preprocessing


In [11]:
X_train.drop(columns='article', inplace=True)
X_test.drop(columns='article', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


## feature engineering


In [12]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, sublinear_tf=True)
vectorizer.fit(X_train['word_seg'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=3,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [13]:
vectorizer.fit(X_test['word_seg'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=3,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [14]:
with open('data/tfidf.pkl', 'wb') as fp:
    pickle.dump((X_train, y_train, X_test), fp)

## training data


In [15]:
def X2list(sentence):
    return sentence.strip().split()

X = list(df_train.loc[:, 'word_seg'].apply(X2list)) + list(df_test.loc[:, 'word_seg'].apply(X2list))
model = gensim.models.Word2Vec(sentences = X, 
                               size = 100, window = 5, min_count = 5, 
                               workers = 8, sg = 0, iter = 5)
