Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
148 lines (147 sloc) 3.54 KB
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Wikipedia articles in Weka"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Import weka and libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%maven nz.ac.waikato.cms.weka:weka-stable:3.8.3\n",
"\n",
"import weka.core.*;\n",
"import weka.core.converters.ConverterUtils;\n",
"import weka.filters.Filter;\n",
"import weka.filters.unsupervised.attribute.StringToWordVector;\n",
"import weka.classifiers.bayes.NaiveBayesMultinomial;\n",
"import weka.classifiers.*;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Read data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"// Read raw data (all words are in one string)\n",
"ConverterUtils.DataSource source = new ConverterUtils.DataSource(\"../data/wikipedia_300.arff\");\n",
"Instances raw = source.getDataSet();\n",
"\n",
"// Convert to bag-of-words using the StringToWordVector filter\n",
"StringToWordVector stw = new StringToWordVector(10000);\n",
"stw.setLowerCaseTokens(true);\n",
"stw.setInputFormat(raw);\n",
"\n",
"Instances data = Filter.useFilter(raw, stw);\n",
"\n",
"// If StringToWordVector is used, Weka puts the\n",
"// class attribute first (in contrast to the default\n",
"// where class attribute is last)\n",
"data.setClassIndex(0);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Build classifier"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"Classifier cl = new NaiveBayesMultinomial();\n",
"cl.buildClassifier(data);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Evaluate results"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Correctly Classified Instances 283 94.3333 %\n",
"Incorrectly Classified Instances 17 5.6667 %\n",
"Kappa statistic 0.8867\n",
"Mean absolute error 0.058 \n",
"Root mean squared error 0.2392\n",
"Relative absolute error 11.6036 %\n",
"Root relative squared error 47.8303 %\n",
"Total Number of Instances 300 \n",
"\n",
"=== Confusion Matrix ===\n",
"\n",
" a b <-- classified as\n",
" 143 7 | a = Programming\n",
" 10 140 | b = Games\n",
"\n"
]
}
],
"source": [
"Evaluation eval = new Evaluation(data);\n",
"eval.crossValidateModel(cl, data, 10, new java.util.Random(1));\n",
"System.out.println(eval.toSummaryString());\n",
"System.out.println(eval.toMatrixString());"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Java",
"language": "java",
"name": "java"
},
"language_info": {
"codemirror_mode": "java",
"file_extension": ".jshell",
"mimetype": "text/x-java-source",
"name": "Java",
"pygments_lexer": "java",
"version": "11.0.3+12-LTS"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
You can’t perform that action at this time.