Permalink
Browse files

Done!

  • Loading branch information...
1 parent 0e3b98a commit 97d5913f74748bb3dbb7e69c5a2c5dddbf9cd69f @hp685 committed May 4, 2012
Showing with 0 additions and 168 deletions.
  1. +0 −130 Clustering/#Cluster.cpp#
  2. +0 −26 Clustering/test.cpp
  3. +0 −12 Clustering/wrapper.sh
View
@@ -1,130 +0,0 @@
-#include <iostream>
-#include <vector>
-#include <string>
-#include <fstream>
-#include "Clustering.h" //Defines the Parser
-
-void Cluster_Node::init(){ // Resets every field of this node to its blank starting value.
- this->NodeID = 0; // 0 until the file-level init() hands out 1-based IDs
- this->ClusterID = 0;
- this->text ="";
- this->href ="";
- this->link_title ="";
- this->color= "white"; // "white" or "black"; NOTE(review): presumably a BFS visited marker - confirm
- this->words.clear(); // empty the per-node word list
-}
-vector<Cluster_Node> N; // every node built from the input; filled by main()
-vector<Cluster_Node>::iterator it; // shared cursor over N, reused by init() and stem()
-vector<string> stopwords; // stop words read in main() and compared against in stem()
-void init(){ // Numbers the nodes in N: NodeID becomes 1, 2, 3, ... in vector order.
- int counter = 1; // IDs start at 1, so 0 (set by Cluster_Node::init) means "not numbered"
- for(it = N.begin(); it != N.end(); it++){ // walks N with the file-level iterator `it`
- it->NodeID = counter;
- counter++;
- }
-}
-
-/*Do a BFS using a BFS driver to find connected components.
-The BFS driver ensures that all disjoint connected components
-are found. */
-void cluster(){ // Stub: the body is empty, so calling it currently does nothing.
-
-}
-
-/*
- Checks two Cluster_Nodes for common patterns; on a match, adds a bi-directional edge.
-*/
-void connect_nodes(){ // Stub: the body is empty, so no edges are created yet.
-
-}
-
-void stem(){ // For each node in N: splits text into words on ' ' and filters them in place. Returns nothing; words ends up holding empty as well as non-empty strings.
- string run = ""; // word currently being accumulated
-
- for(it = N.begin(); it != N.end(); it++){ // walks N with the file-level iterator `it`
- run="";
- for(int j = 0; j < it->text.length(); j++){
- if(it->text[j] == ' '){ // a space ends the current word (consecutive spaces push an empty word)
- it->words.push_back(run);
- run = "";
- }
- else{
- run += it->text[j];
- }
- } // NOTE(review): whatever is left in run here is never pushed, so the last word is dropped unless text ends with a space
- //Filter the words: blank short words and stop words, mark a trailing 's'.
- for(int k = 0; k < stopwords.size(); k++){ // NOTE(review): all filtering sits inside this loop, so nothing is filtered when stopwords is empty
- for(int j = 0; j < it->words.size(); j++){
- if(it->words[j].length() < 3){ // words shorter than 3 characters (including already-blanked ones) become ""
- it->words[j] ="";
- }
- else if(it->words[j] == stopwords[k]){ // exact match against the k-th stop word
- it->words[j] = "";
- }
- else if(it->words[j][it->words[j].length() - 1] == 's'){ // crude plural handling: word ends in 's'
- it->words[j][it->words[j].length()-1] = '\0'; // NOTE(review): overwrites the 's' with a NUL character; the string length does not shrink
- }
- }
- }/*
- for(int j = 0; j < it->words.size(); j++){ // FOR DEBUGGING
- cout<<it->words[j]<<endl;
- }*/
- }
-}
-
-int main() // Loads the stop words, builds one Cluster_Node per "<p>"-terminated chunk of html_input, then runs init/stem/connect_nodes/cluster.
-{
- string sword;
- ifstream wordlist("stop-words"); // NOTE(review): the stream is not checked; if the file is missing stopwords simply stays empty
- while(wordlist>>sword){ // one whitespace-separated token per stop word
- stopwords.push_back(sword);
- }
-
- ifstream html("html_input"); // NOTE(review): not checked either; a missing file yields zero nodes
-
- string query,running="";
- string para = "<p>"; // delimiter line that closes the current chunk
- Cluster_Node current; // NOTE(review): init() is only called after the first push_back; initial state depends on Cluster_Node's constructor - confirm
- int count;
- while(getline(html,query)){ // reads the input one line at a time
- /*
- Lowercases the line, then either closes the current chunk (line is "<p>")
- or appends the line to running.
- */
- for(int i = 0; i < query.length(); i++){
- query[i] = tolower(query[i]); // applies to the whole line, so everything parsed later is lowercase
- }
- if(query.length() == 3){
- count = 0;
- for(int i = 0; i < query.length(); i++){ // counts positions where the line matches "<p>"
- if(query[i] == para[i]){
- count++;
- }
- }
- if(count == 3){ // all three characters matched, i.e. the line is exactly "<p>"
- /*
- Extract the fields from the accumulated chunk and store the node.
- */
- current.text = current.parse_text(running); // parse_* are declared outside this file (see Clustering.h include)
- current.href = current.parse_href(running);
- current.link_title = current.parse_title(running);
- N.push_back(current); // stores a copy of current
- current.init(); // blank the scratch node for the next chunk
- running = "";
-
- }
- } // NOTE(review): a 3-character line that is not "<p>" is neither appended nor processed - it is discarded
- else {
-
- for(int i = 0; i < query.length(); i++){ // appends the line to running; no separator is inserted between lines
- running += query[i];
- }
- }
- } // NOTE(review): text accumulated after the last "<p>" line is never turned into a node
-
- init(); // assign NodeIDs to all nodes
- stem(); // split each node's text into words and filter them
- connect_nodes(); // currently an empty stub
- cluster(); // currently an empty stub
-
- return 0;
-}
View
@@ -1,26 +0,0 @@
-#include<iostream>
-#include<string>
-#include<fstream>
-#include<cstdio>
-#include<cctype>
-#include "Parser.hpp"
-
-using namespace std;
-
-int main() // Manual check of Parser: parses the first line of the file "test-input" and prints the parse_text result.
-{
- freopen("test-input","r",stdin); // redirect stdin so getline(cin, ...) reads from the file; the return value is not checked
- string s="",r="",x="",y="";
- getline(cin,s); // only the first line is read
- for(int i = 0; i < s.length(); i++){
- s[i] = tolower(s[i]); // lowercase the line in place before parsing
- }
- //cout<<s<<endl;
- Parser P; // declared in Parser.hpp, outside this file
- r = P.parse_title(s);
- x = P.parse_text(s);
- y = P.parse_href(s);
- // cout<<r<<endl;
- cout<<x<<endl; // only x is printed; r and y are computed but their output is commented out
- // cout<<y<<endl;
-}
View
@@ -1,12 +0,0 @@
-#Case-Insensitive match.
-IGNORECASE = 1 # NOTE(review): at shell level this runs a command named IGNORECASE with arguments = and 1; it does not set the gawk variable - confirm intent
-#awk '/\<A href*/ {print $0}' html_input
-
-#Removes the <p> tag and puts everything into the file input.
-awk '$0 !~ /\<p\>/ {print $0} ' html_input > input # keeps lines that do not match; NOTE(review): in gawk \< and \> are word-boundary operators, so this may match the word p rather than the literal tag - confirm
-
-#A HREF part and then the Link name on the next line.
-#Field Separator is </A> Tag
-awk 'BEGIN{RS="\<\/A\>"}/\<A href*/ {print $0}' input > input1 # RS is the awk record separator (not the field separator); matching records are written to input1
-#Remove the href value
-awk '/\<A href=*/ {print $2}' input1>input2 # prints only the second whitespace-separated field of each matching line into input2

0 comments on commit 97d5913

Please sign in to comment.