Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #420 from aayushKumarJarvis/master
Stop Words - A Text Mining Problem
- Loading branch information
Showing
4 changed files
with
155 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import java.util.Scanner; | ||
|
||
/**
 * Console program that reads an integer from standard input and prints its factorial.
 */
public class Factorial {

    /**
     * Prompts for a number, computes its factorial and prints the result.
     *
     * <p>If the factorial cannot be computed (negative input, or a result that does not
     * fit in an {@code int}), the reason is printed to standard error instead of a
     * silently wrong value.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Scanner scanner = new Scanner(System.in);
        System.out.print("Enter the number whose factorial is to be found: ");
        int n = scanner.nextInt();
        try {
            int result = factorial(n);
            System.out.println("The factorial of " + n + " is " + result);
        } catch (IllegalArgumentException | ArithmeticException e) {
            System.err.println("Cannot compute the factorial of " + n + ": " + e.getMessage());
        }
    }

    /**
     * Computes {@code n!} iteratively.
     *
     * @param n the non-negative number whose factorial is wanted
     * @return {@code n!}; {@code 1} for {@code n == 0} and {@code n == 1}
     * @throws IllegalArgumentException if {@code n} is negative
     * @throws ArithmeticException if {@code n!} does not fit in an {@code int}
     *         (that is, for every {@code n > 12})
     */
    public static int factorial(int n) {
        if (n < 0) {
            throw new IllegalArgumentException("factorial is undefined for negative input: " + n);
        }
        // Accumulate in a long: the running product is at most Integer.MAX_VALUE before each
        // multiplication and the factor is an int, so the long itself can never overflow.
        long product = 1L;
        for (int factor = 2; factor <= n; factor++) {
            product *= factor;
            if (product > Integer.MAX_VALUE) {
                throw new ArithmeticException("factorial of " + n + " does not fit in an int");
            }
        }
        return (int) product;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import java.util.*; | ||
import java.io.*; | ||
|
||
/**
 * Removes stop words from a text file.
 *
 * <p>Stop words are read from a comma- and/or line-separated file. The text is split on
 * every character that is not an ASCII letter or digit, and each remaining word that is
 * not a stop word is written to the output file followed by a single space. Matching is
 * exact (case-sensitive).
 */
public class StopWords {

    /** Default output path used by {@link #removeStopWords(String, List)}. */
    private static final String OUTPUT_FILE = "YOUR_FILE_LOCATION";

    /** Word delimiter: any single character that is not an ASCII letter or digit. */
    private static final java.util.regex.Pattern NON_ALPHANUMERIC =
            java.util.regex.Pattern.compile("[^a-zA-Z0-9]");

    /**
     * Tells whether {@code word} occurs in {@code textForCheck}.
     *
     * <p>The list does not have to be sorted; a linear scan is used so that the answer is
     * correct for any ordering.
     *
     * @param word the word to look up
     * @param textForCheck the list of stop words
     * @return {@code true} if the list contains the word, {@code false} otherwise
     *         (including when either argument is {@code null})
     */
    public static Boolean searchForStopWord(String word, List<String> textForCheck) {
        if (word == null || textForCheck == null) {
            return false;
        }
        return textForCheck.contains(word);
    }

    /**
     * Reads stop words from a file in which words are separated by commas and/or line breaks.
     *
     * <p>Surrounding whitespace is stripped from every word and empty entries are dropped.
     * The file is decoded with the platform default charset.
     *
     * @param stopWordsFilename path of the stop-word file
     * @return the stop words in file order
     * @throws IOException if the file cannot be opened or read
     */
    public static List<String> readStopWords(String stopWordsFilename) throws IOException {
        List<String> stopWords = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(stopWordsFilename))) {
            String line;
            while ((line = reader.readLine()) != null) {
                for (String token : line.split(",")) {
                    String word = token.trim();
                    if (!word.isEmpty()) {
                        stopWords.add(word);
                    }
                }
            }
        }
        return stopWords;
    }

    /**
     * Copies {@code textFilename} to the default output file with all stop words removed.
     *
     * @param textFilename path of the text file to filter
     * @param stopWords words to drop from the text
     * @see #removeStopWords(String, List, String)
     */
    public static void removeStopWords(String textFilename, List<String> stopWords) {
        removeStopWords(textFilename, stopWords, OUTPUT_FILE);
    }

    /**
     * Copies {@code textFilename} to {@code outputFilename} with all stop words removed.
     *
     * <p>Each input line produces one output line: the surviving words, each followed by a
     * single space, then {@code "\n"}. I/O failures are reported on standard error rather
     * than thrown.
     *
     * @param textFilename path of the text file to filter
     * @param stopWords words to drop from the text; {@code null} is treated as empty
     * @param outputFilename path of the file to (over)write with the filtered text
     */
    public static void removeStopWords(String textFilename, List<String> stopWords,
                                       String outputFilename) {
        // A hash set gives constant-time membership checks regardless of list order.
        Set<String> stopWordSet =
                (stopWords == null) ? new HashSet<String>() : new HashSet<>(stopWords);

        try (BufferedReader reader = new BufferedReader(new FileReader(textFilename));
             PrintStream writer = new PrintStream(new FileOutputStream(outputFilename))) {

            String line;
            while ((line = reader.readLine()) != null) {
                for (String token : NON_ALPHANUMERIC.split(line)) {
                    // Consecutive delimiters yield empty tokens; they are not words.
                    if (!token.isEmpty() && !stopWordSet.contains(token)) {
                        writer.print(token + " ");
                    }
                }
                writer.print("\n");
            }

            // PrintStream never throws on write failures; it only records them.
            if (writer.checkError()) {
                System.err.println("Failed while writing to " + outputFilename);
            }
        } catch (IOException e) {
            System.err.println("Could not remove stop words from " + textFilename + ": " + e);
        }
    }

    /**
     * Asks for the stop-word file and the text file on standard input, then writes the
     * filtered text to the default output file.
     *
     * @param arg ignored
     * @throws IOException if the stop-word file cannot be read
     */
    public static void main(String[] arg) throws IOException {
        Scanner keyboard = new Scanner(System.in);

        System.out.print("Please type stop words file name: ");
        List<String> stopWords = readStopWords(keyboard.next());

        System.out.print("Please type text file name: ");
        removeStopWords(keyboard.next(), stopWords);
    }
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import org.junit.Test; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
|
||
import static org.junit.Assert.*; | ||
|
||
public class TestStopWords { | ||
|
||
@Test | ||
public void testSearchForStopWord() { | ||
|
||
String testWord1 = "Aayush"; | ||
String testWord2 = "Kumar"; | ||
String testWord3 = "Srivastava"; | ||
String testWord4 = "Random Text"; | ||
String testWord5 = "Text"; | ||
|
||
String[] listOfNames = {"Aayush","Kumar","Srivastava"}; | ||
List<String> testString = Arrays.asList(listOfNames); // Converting Array into List of String | ||
|
||
assertEquals(StopWords.searchForStopWord(testWord1, testString), true); | ||
assertEquals(StopWords.searchForStopWord(testWord2,testString),true); | ||
assertEquals(StopWords.searchForStopWord(testWord3,testString),true); | ||
assertEquals(StopWords.searchForStopWord(testWord4,testString),false); | ||
assertEquals(StopWords.searchForStopWord(testWord5,testString),false); | ||
} | ||
|
||
@Test | ||
public void testReadStopWords() throws Exception { | ||
|
||
List<String> tokenizedText = StopWords.readStopWords("FILE_NAME"); | ||
System.out.println(tokenizedText); | ||
} | ||
|
||
@Test | ||
public void testForRemoveStopWords() throws Exception { | ||
|
||
List<String> tokenizedText = StopWords.readStopWords("FILE_NAME"); | ||
StopWords.removeStopWords("FILE_NAME",tokenizedText); | ||
} | ||
|
||
} | ||
|