In [19]:
import news_cnn_model
import numpy as np
import os
import pandas as pd
import pickle
import shutil
import tensorflow as tf
import yaml
from tensorflow.contrib.learn.python import SKCompat

from sklearn import metrics

learn = tf.contrib.learn

stream = open("config.yml", "r")
load = yaml.load(stream)
config = load['default']['news_topic_modeling_service']

REMOVE_PREVIOUS_MODEL = config['trainer']['REMOVE_PREVIOUS_MODEL']

MODEL_OUTPUT_DIR = config['key_config']['MODEL_DIR']
DATA_SET_FILE = config['key_config']['Labeled_news_cvs_address']
#RANDOM_DATA_SET_FILE = config['key_config']['Labeled_news_random_address']
VARS_FILE = config['key_config']['VARS_FILE_ADDRESS']
VOCAB_PROCESSOR_SAVE_FILE = config['key_config']['VOCAB_PROCESSOR_SAVE_FILE_ADDRESS']
MAX_DOCUMENT_LENGTH = 300
N_CLASSES = config['key_config']['CLASSES_NUMS']
NUM_OF_TEST_DATA = config['key_config']['NUM_OF_TEST_DATA']
# Training parms
STEPS = 200



In [20]:
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords


from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
ps = PorterStemmer()
wnl = WordNetLemmatizer()

In [21]:
def data_clean( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text()
    #
    # 2. Remove non-letters number
    letters_only = re.sub("[^a-zA-Z0-9]", " ", review_text)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))
    #
    # 5. Remove stop words 35.33
    meaningful_words = [w for w in words if not w in stops]
    #
    # 6. Stem words(bad result) 24
    stemming_words = [ps.stem(w) for w in meaningful_words]
    #
    # 7.lemmatize_words  30
    lemmatize_words = [wnl.lemmatize(w) for w in stemming_words]
    # 7. Join the words back into one string separated by space,
    # and return the result.
    return  ( " ".join( stemming_words )).encode('utf-8')


In [22]:
def data_col_process(data):
    # Create an empty list and append the clean reviews one by one
    clean_data = []
    for i in data:
        #print "len of i %s" % type(i)
        clean_review = data_clean( str(i) )
        clean_data.append( clean_review )
        #print "########### %s" % type(clean_review)
    #newDF = pd.DataFrame() #creates a new dataframe that's empty
    #newDF = newDF.append(clean_data, ignore_index = True) # ignoring index is optional
    return clean_data

In [23]:
def main(unused_argv):
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model
        shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data, encode UTF-8, uoting=3 tells Python to ignore doubled quotes
    df = pd.read_csv(DATA_SET_FILE, header=None, encoding='utf8')

    # Random shuffle
    df.sample(frac=1)

    #drop NaN value row
    df = df.dropna(axis=0, how='any')
    df.apply(lambda x: pd.api.types.infer_dtype(x.values))

    #df = nltk.word_tokenize(str(df))

    test_df = df[0:NUM_OF_TEST_DATA]
    train_df = df.drop(test_df.index)

    # x - 1 for news title 2 for news text, y - class
    x_train = data_col_process(train_df[1]+train_df[2])
    y_train = map(int, data_col_process(train_df[0]))
    x_test = data_col_process(test_df[1]+test_df[2])
    y_test = map(int, data_col_process(test_df[0]))

    # Process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    #print x_train
    #print x_test

    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)
    #print('Total words: %d' % n_words)

    # Saving n_words and vocab_processor:
    with open(VARS_FILE, 'w') as f:
        pickle.dump(n_words, f)

    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_OUTPUT_DIR)

    # Train and predict
    classifier.fit(x_train, y_train, steps=STEPS)

    # Evaluate model
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]

    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))

if __name__ == '__main__':
    tf.app.run(main=main)


Total words: 9082
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': None, '_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_tf_random_seed': None, '_task_type': None, '_environment': 'local', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f653b8d9110>, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_num_worker_replicas': 0, '_task_id': 0, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_evaluation_master': '', '_keep_checkpoint_every_n_hours': 10000, '_master': ''}
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn 

SystemExit: 