## Spam Classification

In [1]:
%%loadFromPOM
<dependency>
    <groupId>org.apache.opennlp</groupId>
    <artifactId>opennlp-tools</artifactId>
    <version>1.9.0</version>
</dependency>

In [2]:
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.doccat.DocumentSampleStream;
import opennlp.tools.doccat.DoccatFactory;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.MarkableFileInputStreamFactory;

In [17]:
ArrayList<String> testList = new ArrayList();
try (BufferedWriter spamBufferedWriter = new BufferedWriter(
        new FileWriter(new File("../data/spamtraining.train")))) {
    
        String rootDirectoryName = "../data/lingspam_public/stop";
    File rootDirectory = new File(rootDirectoryName);
    for (String directoryName : rootDirectory.list()) {
        File file = new File(rootDirectoryName + "/" + directoryName);
        
//         // Open one file at a time
        String fileNames[] = file.list();
        if (fileNames != null) {
            for (String fileName : fileNames) {
                    if (!fileName.equals(".ipynb_checkpoints")){
                    String filePath = rootDirectoryName + "/" + directoryName + 
                        "/" + fileName;
                    StringBuilder lineStringBuilder = new StringBuilder();

                    // OpenNLP requires a SPAM or HAM header, followed by single line text 
                    BufferedReader br = new BufferedReader(new FileReader(new File(filePath)));
                    String line = null;
                    if (fileName.contains("spms")) {
                        lineStringBuilder.append("spam\t");
                    } else {
                        lineStringBuilder.append("ham\t");
                    }
                    while ((line = br.readLine()) != null) {
                        lineStringBuilder.append(line);
                    }
                    if (directoryName.equals("part10")) {
                        testList.add(lineStringBuilder.toString());
                    } else {
                        spamBufferedWriter.write(lineStringBuilder.toString() + "\n");
                    }
                    lineStringBuilder.setLength(0);
                }
            }
        }
    }
} catch (IOException ex) {
    // Handle exceptions
    System.out.println("Files can't be found");
}

In [18]:
// Test the model using the test data in testListArrayList

try (InputStream dataInputStream = new FileInputStream("../data/spamtraining.train")) {
    // Create input stream for training data
    InputStreamFactory isf = new InputStreamFactory() {
    public InputStream createInputStream() throws IOException {
            return dataInputStream;
        }
    };
    
    ObjectStream<String> objectStream = new PlainTextByLineStream(isf, StandardCharsets.UTF_8);
    ObjectStream<DocumentSample> documentSampleStream = new DocumentSampleStream(objectStream);

    DoccatModel documentCategorizationModel = DocumentCategorizerME.train("en", documentSampleStream,
                                                                         TrainingParameters.defaultParams(), new DoccatFactory());
    DocumentCategorizerME documentCategorizer = new DocumentCategorizerME(documentCategorizationModel);
    for (int i=0; i<testList.size(); i++) {
        String testItem = testList.get(i);
        String[] testWords = testItem.replaceAll("[^A-Za-z]", " ").split(" ");
        double[] probabilities = documentCategorizer.categorize(testWords);
        String bestCategory = documentCategorizer.getBestCategory(probabilities);
        System.out.println("The best fit for: [" + testItem.subSequence(0, 32) + "...] is: " + bestCategory);
    }
    
} catch (FileNotFoundException ex) {
    // Handle exceptions
    System.out.println("Cant find files");
} catch (IOException ex) {
    // Handle exceptions
    System.err.print("ERROR: File containing _______ information not found:\n");
    ex.printStackTrace();
    //System.exit(1);
}

Indexing events with TwoPass using cutoff of 5

	Computing event counts...  done. 867 events
	Indexing...  done.
Sorting and merging events... done. Reduced 867 events to 865.
Done indexing in 0.33 s.
Incorporating indexed data for training...  
done.
	Number of Event Tokens: 865
	    Number of Outcomes: 2
	  Number of Predicates: 6589
...done.
Computing model parameters ...
Performing 100 iterations.
  1:  ... loglikelihood=-600.9586055454669	0.8339100346020761
  2:  ... loglikelihood=-563.4439818431681	0.8673587081891581
  3:  ... loglikelihood=-534.494142891077	0.8846597462514417
  4:  ... loglikelihood=-510.9776348762483	0.895040369088812
  5:  ... loglikelihood=-491.1479986818805	0.9100346020761245
  6:  ... loglikelihood=-473.99868163524843	0.9192618223760092
  7:  ... loglikelihood=-458.8998105080639	0.9250288350634371
  8:  ... loglikelihood=-445.4270915060151	0.9296424452133795
  9:  ... loglikelihood=-433.2783196776066	0.9377162629757786
 10:  ... loglikelihood=-422.229207397

### Hard-coding Regex pattern

In [19]:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

In [23]:
public void isSpam(String text) {
    try (BufferedReader br = new BufferedReader(new FileReader(new File("../data/spam-examples.txt")))) {
        String line = null;
        while ((line = br.readLine()) != null) {
            Pattern pattern = Pattern.compile(line);
            Matcher matcher = pattern.matcher(text);
            if (matcher.find() == true) {
                System.out.println("Spam detected");
                break;
            }
        }
    } catch (FileNotFoundException e) {
        // Handle exceptions
        System.out.println("spam-examples.txt not found");
    } catch (IOException e) {
        // Handle exceptions
        System.err.print("ERROR: File containing _______ information not found:\n");
        e.printStackTrace();
    }
}

In [24]:
// test String
String testString = "Congratualtions! You have won! Click here...";
isSpam(testString);

Spam detected
