In [18]:
import pandas as pd
import numpy as np
import nltk
import sys
import re
import os.path
import json
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import defaultdict, OrderedDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Fetch the NLTK stopword corpus (a no-op when it is already installed)
nltk.download('stopwords')

# English stopwords, kept as a set so membership tests are cheap
stopword_set = set(stopwords.words('english'))

# Pattern for a single word character that is not a digit
word_pattern = re.compile(r'[^\W\d]')

# Porter stemmer instance shared by the preprocessing cells
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jinshin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
dataset = pd.read_csv('uci-news-aggregator.csv')

# Headlines and their category labels as plain Python lists
titles = list(dataset['TITLE'])
true_label = list(dataset['CATEGORY'])

# The two text files act as a cache: they are only regenerated when at
# least one of them is missing.
words_path = 'stemmed.txt'
sentences_path = 'stemmed_sentences.txt'
if os.path.exists(words_path) and os.path.exists(sentences_path):
    print ("stemmed text files already present.")
else:
    print("Creating text files to extract from......")
    with open(words_path, 'w') as words_out, open(sentences_path, 'w') as sentences_out:
        for headline in titles:
            stems = []
            for raw in headline.split():
                # Lower-case and drop trailing punctuation before filtering
                token = raw.lower().rstrip('?:!.,;')
                # Skip stopwords and anything that looks like a URL
                if token in stopword_set or 'http://' in token or 'www' in token:
                    continue
                # Skip tokens that do not start with a non-digit word character
                if not word_pattern.match(token):
                    continue
                stems.append(ps.stem(token).rstrip("'"))
            # One stem per line in stemmed.txt ...
            words_out.writelines(stem + '\n' for stem in stems)
            # ... and one space-joined headline per line in stemmed_sentences.txt
            sentences_out.write(' '.join(stems) + '\n')

stemmed text files already present.


In [5]:
# Corpus-level structures built from stemmed.txt (one stem per line):
#   all_words   - every stem occurrence, in file order
#   corpus_freq - stem -> number of occurrences (stored as a float)
#   feature     - ordered stem -> 0 template, one key per distinct stem
all_words = []
corpus_freq = defaultdict(int)
feature = OrderedDict()
with open('stemmed.txt', 'r') as words_in:
    for line in words_in:
        stem = line.strip('\n')
        all_words.append(stem)
        corpus_freq[stem] = corpus_freq[stem] + 1.0
        feature[stem] = 0

# Per-headline stem counts, one dict per line of stemmed_sentences.txt
titles_doc_vector = []
with open('stemmed_sentences.txt', 'r') as sentences_in:
    for line in sentences_in:
        counts = defaultdict(int)
        for stem in line.split():
            counts[stem] += 1
        titles_doc_vector.append(counts)
print("Finished Creating bag of words")

Finished Creating bag of words


In [6]:
# Stems whose corpus frequency is at most this value are dropped from the
# feature set.
RARE_WORD_MAX_FREQ = 10

# Histogram of corpus frequencies: freq_count[k] is the number of distinct
# stems that occur exactly k times.
freq_count = defaultdict(int)
for val in corpus_freq.values():
    freq_count[val] += 1
for k in range(1, RARE_WORD_MAX_FREQ + 1):
    print("Number of words with freq = " + str(k) + ": " + str(freq_count[k]))

# Remove rare stems from both the frequency table and the feature template.
# Iterate over a snapshot of the items: deleting from a dict while iterating
# its live view raises RuntimeError on Python 3.
for key, val in list(corpus_freq.items()):
    if 0 <= val <= RARE_WORD_MAX_FREQ:
        del corpus_freq[key]
        del feature[key]
print('\n')
print('finished processing')
print("length of features is " +str(len(feature)))
# Label corrected: this line reports corpus_freq, not feature.
print("length of corpus_freq is " +str(len(corpus_freq)))
print("length of examples is " + str(len(titles_doc_vector)))

Number of words with freq = 1: 24982
Number of words with freq = 2: 6598
Number of words with freq = 3: 3313
Number of words with freq = 4: 2066
Number of words with freq = 5: 1419
Number of words with freq = 6: 1096
Number of words with freq = 7: 898
Number of words with freq = 8: 724
Number of words with freq = 9: 620
Number of words with freq = 10: 505


finished processing
length of features is 12239
length of features is 12239
length of examples is 422419


In [7]:
# Number of headlines stored in each JSON chunk file
CHUNK_SIZE = 1000
# Directory that receives the chunk files
FEATURE_DIR = 'feature_json'


def _dump_feature_chunk(rows, index):
    """Write one chunk of feature rows to FEATURE_DIR/feature_vector<index>.json.

    rows  -- mapping of 1-based position within the chunk to a list of
             feature values
    index -- 1-based chunk number used in the file name
    """
    path = os.path.join(FEATURE_DIR, 'feature_vector' + str(index) + '.json')
    with open(path, 'w') as fp:
        json.dump(rows, fp)
    print('finished ' + str(index) + ' json file')


# Create the output directory up front so the first write cannot fail on it
if not os.path.isdir(FEATURE_DIR):
    os.makedirs(FEATURE_DIR)

feature_vector = OrderedDict()
counter = 0
json_counter = 0
for freq_dict in titles_doc_vector:
    counter += 1
    # Start from the all-zero template so every row has the same columns
    instance = feature.copy()
    for word, freq in freq_dict.items():
        if word in corpus_freq and corpus_freq[word] != 0:
            # Headline count scaled by the stem's corpus-wide count
            instance[word] = freq / corpus_freq[word]
    # list(...) is required: a dict values view is not JSON-serialisable
    feature_vector[counter] = list(instance.values())
    if counter == CHUNK_SIZE:
        json_counter += 1
        _dump_feature_chunk(feature_vector, json_counter)
        feature_vector.clear()
        counter = 0

# Flush the final partial chunk (if any) into the same directory, under the
# next chunk index, so it neither lands elsewhere nor reuses an index.
if feature_vector:
    json_counter += 1
    _dump_feature_chunk(feature_vector, json_counter)
    feature_vector.clear()

In [10]:
# Load the first N_FEATURE_FILES chunk files back into one flat list of
# feature rows. Only a subset of the chunks is loaded.
N_FEATURE_FILES = 20
data_vector = []
for i in range(1, N_FEATURE_FILES + 1):
    # Use a context manager so each file handle is closed promptly
    with open('feature_json/feature_vector' + str(i) + '.json') as fp:
        # OrderedDict keeps the rows in the order they were written
        data_json = json.load(fp, object_pairs_hook=OrderedDict)
    data_vector.extend(data_json.values())
print("Completely loaded data")

Completely loaded data


In [None]:
# Fixed seed so the split and the tree are reproducible across runs
SEED = 42

# Labels are sliced to the number of loaded rows so features and labels
# always stay aligned, however many chunk files were loaded.
train, test, train_labels, test_labels = train_test_split(data_vector,
                                                      true_label[:len(data_vector)],
                                                      test_size=0.33,
                                                      random_state=SEED)
#using Decision Tree Classifier to see performance
model = DecisionTreeClassifier(random_state=SEED)
model.fit(train, train_labels)
preds = model.predict(test)

# Score once. normalize=False returns the exact number of correct
# predictions, avoiding the float truncation of `accuracy * len` under %d.
n_total = len(test_labels)
n_correct = accuracy_score(test_labels, preds, normalize=False)
pct_correct = 100.0 * n_correct / n_total
print ('%s %d %s %.3f %s %s %d %s %.3f %s' % ("### DECISION TREE OVERALL CORRECT: ",
n_correct, " = ",
pct_correct, "%   ", "INCORRECT: ",
n_total - n_correct, " = ",
100 - pct_correct, "%")) #print out results

In [None]:
#Using Naive Bayes
# The cell is labelled and reported as Naive Bayes, so fit GaussianNB;
# this cell previously instantiated SVC() under the Naive Bayes label.
model = GaussianNB()
y_pred = model.fit(train, train_labels).predict(test)

# Score once. normalize=False returns the exact number of correct
# predictions, avoiding the float truncation of `accuracy * len` under %d.
n_total = len(test_labels)
n_correct = accuracy_score(test_labels, y_pred, normalize=False)
pct_correct = 100.0 * n_correct / n_total
print ('%s %d %s %.3f %s %s %d %s %.3f %s' % ("### Naive Bayes OVERALL CORRECT: ",
n_correct, " = ",
pct_correct, "%   ", "INCORRECT: ",
n_total - n_correct, " = ",
100 - pct_correct, "%")) #print out results