In [1]:
from scipy.io import arff
import pandas as pd

# Read the ARFF file. loadarff returns a (records, metadata) pair; only the
# records are needed to build the DataFrame.
data = arff.loadarff('weather.nominal.arff')
records = data[0]
df = pd.DataFrame(records)

# Preview the first 10 rows (the nominal values are shown as bytes, e.g. b'sunny')
df.head(10)

Unnamed: 0,outlook,temperature,humidity,windy,play
0,b'sunny',b'hot',b'high',b'FALSE',b'no'
1,b'sunny',b'hot',b'high',b'TRUE',b'no'
2,b'overcast',b'hot',b'high',b'FALSE',b'yes'
3,b'rainy',b'mild',b'high',b'FALSE',b'yes'
4,b'rainy',b'cool',b'normal',b'FALSE',b'yes'
5,b'rainy',b'cool',b'normal',b'TRUE',b'no'
6,b'overcast',b'cool',b'normal',b'TRUE',b'yes'
7,b'sunny',b'mild',b'high',b'FALSE',b'no'
8,b'sunny',b'cool',b'normal',b'FALSE',b'yes'
9,b'rainy',b'mild',b'normal',b'FALSE',b'yes'


In [2]:
from sklearn.preprocessing import LabelEncoder


# One-hot encode every attribute into binary dummy variables; the class
# column ("play") is not listed, so it is carried over unchanged.
attribute_columns = ['outlook', 'temperature', 'windy', 'humidity']
df_dummies = pd.get_dummies(df, columns=attribute_columns)


# Replace the class labels with integer codes
enc = LabelEncoder()
class_num = enc.fit_transform(df_dummies["play"])
df_dummies["play"] = class_num

df_dummies.head(5)


Unnamed: 0,play,outlook_b'overcast',outlook_b'rainy',outlook_b'sunny',temperature_b'cool',temperature_b'hot',temperature_b'mild',windy_b'FALSE',windy_b'TRUE',humidity_b'high',humidity_b'normal'
0,0,0,0,1,0,1,0,1,0,1,0
1,0,0,0,1,0,1,0,0,1,1,0
2,1,1,0,0,0,1,0,1,0,1,0
3,1,0,1,0,0,0,1,1,0,1,0
4,1,0,1,0,1,0,0,1,0,0,1


In [4]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score
import numpy as np


# Naive Bayes classifier for binary features.
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB
clf = BernoulliNB()

# Split the frame into the dummy-variable features and the integer class target
features = df_dummies.drop(columns=["play"])
target = df_dummies["play"]

# 3-fold cross-validation scored by accuracy.
# See https://scikit-learn.org/stable/modules/model_evaluation.html
accuracies = cross_val_score(clf, features, target, scoring='accuracy', cv=3)
print("Accuracies from 3-fold cross-validation:", accuracies)
print("Mean accuracy:", np.mean(accuracies))

Accuracies from 3-fold cross-validation: [0.8 0.8 0.5]
Mean accuracy: 0.7000000000000001


In [4]:
import pandas as pd

# Text processing
# Three raw abstracts used as a toy corpus. The strings are kept verbatim:
# they still contain line breaks, punctuation, mixed case and citation
# markers such as [15][16], none of which has been cleaned at this point.

# Abstract about the Moon (labelled is_planet = 0 below)
abstract1 = """The Moon is an astronomical body that orbits planet Earth and is 
Earth's only permanent natural satellite. It is the fifth-largest natural satellite in 
the Solar System, and the largest among planetary satellites relative to the size of the 
planet that it orbits (its primary). The Moon is after Jupiter's satellite Io the second-densest 
satellite in the Solar System among those whose densities are known. """

# Abstract about Mars (labelled is_planet = 1 below)
abstract2 = """Mars is the fourth planet from the Sun and the second-smallest planet in 
the Solar System after Mercury. In English, Mars carries a name of the Roman god of war, 
and is often referred to as the "Red Planet"[15][16] because the reddish iron oxide prevalent 
on its surface gives it a reddish appearance that is distinctive among the astronomical bodies
visible to the naked eye.[17] Mars is a terrestrial planet with a thin atmosphere, having surface
features reminiscent both of the impact craters of the Moon and the valleys, deserts, and polar 
ice caps of Earth. """

# Abstract about the Sun (labelled is_planet = 0 below)
abstract3 = """The Sun is the star at the center of the Solar System. It is a nearly perfect 
sphere of hot plasma,[15][16] with internal convective motion that generates a magnetic field 
via a dynamo process.[17] It is by far the most important source of energy for life on Earth. 
Its diameter is about 1.39 million kilometers (864,000 miles), or 109 times that of Earth, and 
its mass is about 330,000 times that of Earth. It accounts for about 99.86% of the total mass 
of the Solar System.[18] Roughly three quarters of the Sun's mass consists of hydrogen (~73%); 
the rest is mostly helium (~25%), with much smaller quantities of heavier elements, including 
oxygen, carbon, neon, and iron.[19]"""


# One row per abstract: the raw text plus a 0/1 "is_planet" label
text_df = pd.DataFrame( [[abstract1, 0], [abstract2, 1], [abstract3, 0]], columns = ["abstract", "is_planet"])
text_df.head()


Unnamed: 0,abstract,is_planet
0,The Moon is an astronomical body that orbits p...,0
1,Mars is the fourth planet from the Sun and the...,1
2,The Sun is the star at the center of the Solar...,0


In [5]:
# A simple boolean representation of word appearances
import numpy as np 

# Collect every space-separated token from every abstract
words = []
for abstract in text_df["abstract"]:
    words.extend(abstract.split(" "))  # Split when there is a space


# Deduplicate the tokens; np.unique already returns them in sorted order
words = list(np.unique(words))
#print(words)

# There are still many things you should fix:
#         - uppercase vs lowercase
#         - full stops, newlines \n, and other punctuation
#         - [citations]

# Build a 0/1 matrix with one row per abstract and one column per vocabulary
# word, plus a trailing "is_planet" column holding the class label.
text_boolean_df = pd.DataFrame(
    np.zeros((text_df.shape[0], len(words) + 1), dtype=int),
    columns=words + ["is_planet"],
)
text_boolean_df["is_planet"] = text_df["is_planet"]

for i in range(0, text_df.shape[0]):

    abstract = text_df["abstract"][i]
    words_abstract = abstract.split(" ")  # Split when there is a space
    for word in words_abstract:
        # Write through a single .loc call. The chained form
        # `text_boolean_df[word][i] = 1` assigns into a temporary column
        # object, which pandas warns about and which does not update the
        # frame at all when Copy-on-Write is enabled.
        text_boolean_df.loc[i, word] = 1
text_boolean_df.head()

# Do some more preprocessing? Then run NBC.

Unnamed: 0,Unnamed: 1,Earth's,Its,and,ice,its,of,on,"oxygen,",planet,...,those,three,times,to,total,"valleys,","war,",whose,with,is_planet
0,1,1,0,0,0,0,0,0,0,1,...,1,0,0,1,0,0,0,1,0,0
1,1,0,0,1,1,0,0,1,0,0,...,0,0,0,1,0,1,1,0,1,1
2,0,0,1,0,0,1,1,0,1,0,...,0,1,1,0,1,0,0,0,1,0


In [1]:
import pandas as pd

# Load in the training set .csv
training_set = pd.read_csv("trg.csv")
# NOTE: this preview is not displayed, because only the last expression of a
# cell is rendered and this is not the last one.
training_set.head()


# Process the text, find a 'good model' with cross-validation
# (placeholder: only a progress message is printed here)
print("Text processing...")


# Train the NBC with this data (your own NBC code)
# (placeholder: only a progress message is printed here)
print("Training the NBC...")


# Use this 'good model' to generate classifications. 
def classify(abstracts):
    """Return one predicted class label per abstract.

    This is currently the null model: no text processing or Naive Bayes
    scoring happens yet, the two stages are only announced with progress
    messages, and every abstract is labelled "E".

    Parameters
    ----------
    abstracts : iterable
        The abstracts to classify; only the number of items is used for now.

    Returns
    -------
    list of str
        A list containing "E" once for each item in ``abstracts``.
    """
    # Stage 1: text processing, cleaning, outlier removal, attribute selection etc.
    # This stage must be deterministic: e.g. if the 100 most frequent words are
    # selected, they must be the 100 most frequent words in the TRAINING set,
    # not in the 'abstracts' passed in.
    print("Processing the test abstracts...")

    # Stage 2: run the processed abstracts through the pre-trained naive bayes classifier
    print("Classifying the test abstracts...")

    # Temporary null model: assign everything to "E"
    null_label = "E"
    predictions = []
    for _abstract in abstracts:
        predictions.append(null_label)
    return predictions
    
    
# Read the test set .csv
test_set = pd.read_csv("tst.csv")

# Predict a class for every test abstract and attach the predictions as a new column
test_set_class_predictions = classify(test_set["abstract"])
test_set["class"] = test_set_class_predictions


# Save everything except the abstract text as the .csv to submit to Kaggle
submission = test_set.drop(columns=["abstract"])
submission.to_csv("tst_kaggle.csv", index=False)
test_set.head()



Text processing...
Training the NBC...
Processing the test abstracts...
Classifying the test abstracts...


Unnamed: 0,id,abstract,class
0,1,in a previous work all three components of com...,E
1,2,we compared morphology of two geographically c...,E
2,3,factor xiii mr 320000 is a blood coagulation f...,E
3,4,we report the characterisation of a human gene...,E
4,5,fat tissue plays a critical role in the regula...,E


In [25]:
from sklearn.model_selection import KFold
from sklearn.naive_bayes import BernoulliNB
import numpy as np
from sklearn.preprocessing import LabelEncoder


# 10 x 10-fold cross-validation on weka's data/ionosphere.arff
data = arff.loadarff('ionosphere.arff')
ionosphere_df = pd.DataFrame(data[0])


# Convert class from string into integers
enc = LabelEncoder()
ionosphere_df["class"] = enc.fit_transform(ionosphere_df["class"])


# Pick a classifier
nbc = BernoulliNB()


# Select some random seeds: one per repetition of the cross-validation.
# NOTE: no RNG seed is set in this cell, so the seeds (and therefore the
# reported accuracies) differ from run to run.
num_times = 10
num_folds = 10
random_seeds = [int(x) for x in np.random.uniform(0, 10000, num_times)]
print("The random seeds are", random_seeds, "\n")


# The features and target do not change between repetitions, so extract them once
features = ionosphere_df.drop(["class"], axis = 1)
target = ionosphere_df["class"]

mean_accuracies = []

# Perform {num_folds}-fold cross-validation {num_times}-times.
# Iterate over the seeds themselves so the number of repetitions is always
# num_times. The previous range(0, num_folds) only worked because both
# constants happened to be 10.
for seed in random_seeds:
    cv = KFold(num_folds, shuffle = True, random_state = seed)
    accuracies = cross_val_score(nbc, features, target, scoring='accuracy',  cv=cv)
    mean_accuracies.append(np.mean(accuracies))


print("The", num_times, "accuracies are", mean_accuracies, "\n")
print("The mean accuracy from", num_times, "times", num_folds, "fold cross-validation is", np.mean(mean_accuracies), "with a standard deviation of", np.sqrt(np.var(mean_accuracies)))



The random seeds are [7154, 6533, 7896, 8723, 124, 7118, 853, 5057, 784, 1419] 

The 10 accuracies are [0.8602380952380952, 0.8631746031746032, 0.8545238095238096, 0.8660317460317459, 0.8576190476190476, 0.8632539682539683, 0.8573809523809522, 0.8630952380952381, 0.8605555555555556, 0.851904761904762] 

The mean accuracy from 10 times 10 fold cross-validation is 0.8597777777777779 with a standard deviation of 0.004177921307902549
