# HW1

In [99]:
from os import chdir, getcwd
from glob import glob
import pyspark
import matplotlib.pyplot as plt
import numpy as np
from bs4 import BeautifulSoup
import pandas as pd
import time
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
import re
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.tokenize import RegexpTokenizer

%matplotlib inline

In [128]:
path = getcwd()
chdir(path)

In [129]:
f_list = glob('reuters21578/*.sgm')

In [130]:
topic_list = ["money", "fx", "crude", "grain", "trade", "interest", "wheat", 
              "ship", "corn", "oil", "dlr", "gas", "oilseed", "supply", "sugar", 
              "gnp", "coffee", "veg", "gold", "soybean", "bop", "livestock", "cpi"]

In [131]:
def if_topic_in(topic, topic_list = topic_list):
    """function to determine if each entry belongs to our topic list
    ---------------------------------------------
    
    :param topic: list of many topics of one article
    :param topic_list: list of pre-defined topics
    
    :returns: index of first element in the topic list that belongs to topic_list
    """
    try:
        ans = list(set(topic).intersection(topic_list))
    except:
        ans = ""
    
    return ans

In [132]:
def cleanbody(text):
    """function to clean text by removing punctuations, and numbers
    ---------------------------------------------
    
    :param text: a string
    
    :returns: string with punctuations and numbers removed
    """
    stemmer = PorterStemmer()
    text = text.replace('\n',' ').lower().strip()
    text = re.sub("[^a-z, A-Z]+", "", text)
    processed = ''.join(stemmer.stem(i) for i in text)
    return(processed)

In [170]:
doi_list = list()
for filename in f_list:
    print('Start parsing {0}'.format(filename))
    file = open(filename, 'rb')
    soup = BeautifulSoup(file, 'html.parser')
    file.close()
    for topic_raw in soup.find_all('topics'):
        topic = topic_raw.get_text().split('-')
        topic = if_topic_in(topic)
        if len(topic) != 0:
            body = topic_raw.find_next('body').get_text()
            for t in topic:
                tb_tup = (t, body)
                doi_list.append(tb_tup)
    print('Finished parsing {0}'.format(filename))

Start parsing reuters21578/reut2-000.sgm
Finished parsing reuters21578/reut2-000.sgm
Start parsing reuters21578/reut2-001.sgm
Finished parsing reuters21578/reut2-001.sgm
Start parsing reuters21578/reut2-002.sgm
Finished parsing reuters21578/reut2-002.sgm
Start parsing reuters21578/reut2-003.sgm
Finished parsing reuters21578/reut2-003.sgm
Start parsing reuters21578/reut2-004.sgm
Finished parsing reuters21578/reut2-004.sgm
Start parsing reuters21578/reut2-005.sgm
Finished parsing reuters21578/reut2-005.sgm
Start parsing reuters21578/reut2-006.sgm
Finished parsing reuters21578/reut2-006.sgm
Start parsing reuters21578/reut2-007.sgm
Finished parsing reuters21578/reut2-007.sgm
Start parsing reuters21578/reut2-008.sgm
Finished parsing reuters21578/reut2-008.sgm
Start parsing reuters21578/reut2-009.sgm
Finished parsing reuters21578/reut2-009.sgm
Start parsing reuters21578/reut2-010.sgm
Finished parsing reuters21578/reut2-010.sgm
Start parsing reuters21578/reut2-011.sgm
Finished parsing reuters

In [174]:
data = pd.DataFrame(doi_list)
data.columns = (['topic', 'body'])
data['body'] = data['body'].apply(cleanbody)
print('A total number of {0} items were retrieved. Articles with multiple classes are recorded multiple times.'.format(len(data)))
data.head()

A total number of 3625 items were retrieved. Articles with multiple classes are recorded multiple times.


Unnamed: 0,topic,body
0,veg,argentine grain board figures show crop regist...
1,money,sens alan cranston dcal and daniel evans rwash...
2,supply,sens alan cranston dcal and daniel evans rwash...
3,coffee,"international coffee organization, ico, produc..."
4,sugar,sugar imports subject to the us sugar import q...


In [175]:
data.to_csv('training_test_data.txt')

In [176]:
data.loc[0:10].to_csv('top10.txt')

In [103]:
body_list = list(data['body'])
start = time.clock()
skl_vectorizer = TfidfVectorizer().fit_transform(data['body'])
print ("sklearn TFIDF processing time: {0:.5f} s".format(time.clock() - start))
freq_term_matrix = TfidfVectorizer.transform(, raw_documents=data['body'])
#X = skl_vectorizer.transform(data['body'])

sklearn TFIDF processing time: 0.53331 s


KeyError: 'freq_term'

In [177]:
from pyspark.sql import SparkSession
from pyspark.sql import Column
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.ml.feature import HashingTF, IDF, OneHotEncoder, StringIndexer
from pyspark.ml.classification import NaiveBayes

In [178]:
spark = SparkSession\
        .builder\
        .appName("NewsClassification")\
        .getOrCreate()

df = spark.read.csv("training_test_data.txt",header=True,inferSchema=True)
stopwords_set = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

#This function removes stopwords (e.g. 'the', 'a') from the text
def stop_stem(tokens):
    tokens = tokens.split()
    stemmed = [word for word in tokens if word not in stopwords_set]
    return stemmed

stop_stem_udf = udf(stop_stem, ArrayType(StringType()))
df = df.withColumn("tokenized", stop_stem_udf("body"))

#following section transforms the text using TFIDF
start = time.clock()
hashingTF = HashingTF(inputCol="tokenized", outputCol="term_freq")
df = hashingTF.transform(df)
idf = IDF(inputCol="term_freq", outputCol="tfidf", minDocFreq=5)
idfModel = idf.fit(df)
df = idfModel.transform(df)
print ("pyspark TFIDF processing time: {0:.5f} s".format(time.clock() - start))

pyspark TFIDF processing time: 0.00971 s


In [None]:
#Using the OneHotEncoder to convert the topics into discrete integers
stringIndexer = StringIndexer(inputCol="topic", outputCol="topicIndex")
model = stringIndexer.fit(df)
indexed = model.transform(df)

In [205]:
val_dict = dict()
train_test_val_split_params = {'50/40/10': [0.5, 0.4, 0.1],
                               '60/30/10': [0.6, 0.3, 0.1], 
                               '70/20/10': [0.7, 0.2, 0.1]}

for split_param in train_test_val_split_params.keys():
    for seed in np.arange(10):
        train,test,val = indexed.select("tfidf","topicIndex").randomSplit(train_test_val_split_params[split_param],seed=seed)

        #Naive bayes
        nb = NaiveBayes(featuresCol="tfidf", labelCol="topicIndex", predictionCol="NB_pred",
                        probabilityCol="NB_prob", rawPredictionCol="NB_rawPred")
        nbModel = nb.fit(train)
        val = nbModel.transform(val)
        total = val.count()
        correct = val.where(test['topicIndex'] == val['NB_pred']).count()
        accuracy = correct/total
        val_dict[(split_param, seed)] = accuracy

In [208]:
val_dict

{('50/40/10', 0): 0.4376731301939058,
 ('50/40/10', 1): 0.506631299734748,
 ('50/40/10', 2): 0.5028248587570622,
 ('50/40/10', 3): 0.4585492227979275,
 ('50/40/10', 4): 0.4869109947643979,
 ('50/40/10', 5): 0.5014005602240896,
 ('50/40/10', 6): 0.4883720930232558,
 ('50/40/10', 7): 0.42857142857142855,
 ('50/40/10', 8): 0.44972067039106145,
 ('50/40/10', 9): 0.5068119891008175,
 ('60/30/10', 0): 0.45706371191135736,
 ('60/30/10', 1): 0.4986737400530504,
 ('60/30/10', 2): 0.5028248587570622,
 ('60/30/10', 3): 0.4637305699481865,
 ('60/30/10', 4): 0.5026178010471204,
 ('60/30/10', 5): 0.5070028011204482,
 ('60/30/10', 6): 0.4935400516795866,
 ('60/30/10', 7): 0.42587601078167114,
 ('60/30/10', 8): 0.46368715083798884,
 ('60/30/10', 9): 0.5040871934604905,
 ('70/20/10', 0): 0.4709141274238227,
 ('70/20/10', 1): 0.5013262599469496,
 ('70/20/10', 2): 0.5084745762711864,
 ('70/20/10', 3): 0.4740932642487047,
 ('70/20/10', 4): 0.49476439790575916,
 ('70/20/10', 5): 0.5070028011204482,
 ('70/2

In [219]:
max(val_dict, key = val_dict.get)

('70/20/10', 2)

In [194]:
train

DataFrame[tfidf: vector, topicIndex: double]

In [203]:
val_dict[([1],2)]=3

TypeError: unhashable type: 'list'

In [204]:
[0.5, 0.4, 0.1]

[0.5, 0.4, 0.1]