@@ -0,0 +1,106 @@
package com.kemi.storage.crawler.impl;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.kemi.database.LinksDao;
import com.kemi.storage.crawler.WebCrawler;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import javax.transaction.Transactional;
import java.net.URL;
import java.util.HashSet;
import java.util.List;

/**
 * Created by Kutsyk on 21.03.15.
 *
 * <p>Crawler that starts from a seed URL, follows {@code a[href]} links whose address
 * begins with one of the whitelisted prefixes, and records every link ending in
 * {@code "pdf"} both in memory and through {@link LinksDao}.
 *
 * <p>The instance keeps mutable crawl state in plain {@link HashSet}s and a shared
 * {@code timeout} field; nothing in this class synchronizes access to them.
 */
@Service
public class WebCrawlerImpl implements WebCrawler {

    /** Prefix that {@link #format(String)} puts in front of links that do not start with "http". */
    private static final String DEFAULT_HOST = "http://nz.ukma.edu.ua";

    /** Address prefixes the crawler is allowed to follow. */
    private static final String[] ALLOWED_PREFIXES = {
        "nz.ukma.edu.ua",
        "http://nz.ukma.edu.ua",
        "https://nz.ukma.edu.ua",
        "http://www.ekmair.ukma.edu.ua",
        "http://ekmair.ukma.edu.ua",
        "https://ekmair.ukma.edu.ua",
        "ekmair.ukma.edu.ua"
    };

    @Autowired
    private LinksDao linksDao;

    private HashSet<String> visitedLinks = Sets.newHashSet();
    private HashSet<String> pdfLinks = Sets.newHashSet();
    private HashSet<String> unavailableLinks = Sets.newHashSet();
    private Integer timeout;

    /**
     * Crawls everything reachable from {@code url}.
     *
     * <p>NOTE(review): the annotation is {@code javax.transaction.Transactional}, not Spring's
     * own; confirm the Spring version in use honours it.
     *
     * @param url  seed page
     * @param time value handed to jsoup's {@code Connection.timeout(int)} for every page fetch
     */
    @Transactional
    public void start(URL url, int time) {
        timeout = time;
        getLinksOnPage(url.toString());
    }

    /**
     * @return {@code true} when {@code link} starts with a whitelisted prefix and has not
     *         already failed to load
     */
    private boolean isAllowed(String link) {
        if (unavailableLinks.contains(link)) {
            return false;
        }
        for (String prefix : ALLOWED_PREFIXES) {
            if (link.startsWith(prefix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Visits {@code url} and every allowed page reachable from it.
     *
     * <p>The traversal uses an explicit stack instead of recursion, so a long chain of
     * links cannot exhaust the thread stack. References are pushed in reverse, which keeps
     * the same depth-first, document-order visiting sequence a recursive walk would have.
     */
    private void getLinksOnPage(String url) {
        List<String> pending = Lists.newArrayList();
        pending.add(url);

        while (!pending.isEmpty()) {
            // format() returns a new String; its result has to be kept to have any effect.
            String current = format(pending.remove(pending.size() - 1));
            if (visitedLinks.contains(current) || !isAllowed(current)) {
                continue;
            }

            System.out.println("New link is " + current);
            List<String> references = Lists.newArrayList();
            if (parsePage(current, references)) {
                visitedLinks.add(current);
            } else {
                unavailableLinks.add(current);
            }

            for (int i = references.size() - 1; i >= 0; i--) {
                pending.add(references.get(i));
            }
        }
    }

    /**
     * Prefixes {@link #DEFAULT_HOST} to anything that does not start with "http".
     *
     * <p>NOTE(review): this is plain concatenation, so links that are relative without a
     * leading slash, or that use another scheme (e.g. {@code mailto:}), produce addresses
     * that are unlikely to resolve. jsoup's {@code Node.absUrl("href")} may be a better
     * source of absolute links; confirm before changing, as it alters which pages are crawled.
     */
    private String format(String url) {
        if (url.startsWith("http")) {
            return url;
        }
        return DEFAULT_HOST + url;
    }

    /**
     * Downloads {@code url} and collects its allowed outgoing links.
     *
     * <p>Links ending in "pdf" are stored in {@code pdfLinks} and persisted once through
     * {@link LinksDao#create}; they are not added to {@code references}, because they are
     * documents to record rather than HTML pages to parse.
     *
     * @param url        page to download
     * @param references output list receiving the allowed non-PDF links, in document order
     * @return {@code false} when fetching or processing the page threw an exception
     */
    private boolean parsePage(String url, List<String> references) {
        try {
            Document doc = Jsoup.connect(url).timeout(timeout).get();
            for (Element link : doc.select("a[href]")) {
                String href = format(link.attr("href"));
                if (!isAllowed(href)) {
                    continue;
                }
                if (href.endsWith("pdf")) {
                    // Set.add reports whether the link is new, so repeats are not persisted again.
                    if (pdfLinks.add(href)) {
                        System.out.println("New pdf is " + href);
                        linksDao.create(href);
                    }
                    continue;
                }
                references.add(href);
            }
            return true;
        } catch (Exception e) {
            // Best effort: a failing page must not abort the whole crawl.
            System.err.println("Failed to process " + url);
            e.printStackTrace();
            return false;
        }
    }

    /** @return the live set of successfully parsed page addresses (not a copy) */
    @Override
    public HashSet<String> getVisitedLinks() {
        return visitedLinks;
    }

    /** @return the live set of discovered PDF addresses (not a copy) */
    @Override
    public HashSet<String> getPdfLinks() {
        return pdfLinks;
    }
}
@@ -0,0 +1,11 @@
package com.kemi.storage.pdf;

/**
 * Created by KutsykV on 23.09.2015.
 *
 * <p>Stateful accessor for pieces of a PDF document: a source is selected with
 * {@link #setFile(String)} and its parts are then read through the getters.
 */
public interface PDFExtractor {

    /**
     * Selects the document the getters report on.
     *
     * @param file location of the PDF; the expected format is defined by the implementation
     */
    void setFile(String file);

    /** @return the title of the selected document */
    String getTitle();

    /** @return the text of the selected document */
    String getText();

    /** @return the annotation of the selected document */
    String getAnnotation();
}
@@ -0,0 +1,136 @@
package com.kemi.storage.pdf.impl;

import com.ibm.icu.util.StringTokenizer;
import com.kemi.storage.pdf.PDFExtractor;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.springframework.stereotype.Service;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.net.URL;

/**
 * Created by SV on 21.04.2015.
 *
 * <p>{@link PDFExtractor} backed by PDFBox. {@link #setFile(String)} downloads the PDF from
 * a URL, stores its full text, and derives a title and a short annotation from the first
 * page.
 *
 * <p>The results live in instance fields that every {@code setFile} call overwrites, and
 * nothing here synchronizes access, so concurrent callers of one instance would see each
 * other's data.
 */
@Service
public class PDFExtractorImpl implements PDFExtractor {

    /** Line prefix (compared upper-cased) that marks the line preceding the title. */
    private static final String TITLE_MARKER = "УД";

    /** Title reported when the first page has content but no marker line. */
    private static final String NO_TITLE = "[n/a]";

    /** Maximum number of characters copied into the annotation before the ellipsis. */
    private static final int ANNOTATION_LIMIT = 255;

    private String title;
    private String text;
    private String annotation;

    public PDFExtractorImpl() {
    }

    /**
     * Loads the PDF at {@code file} and recomputes title, text and annotation.
     *
     * @param file URL of the PDF, in a form accepted by {@code new URL(String)}
     */
    @Override
    public void setFile(String file) {
        extract(file);
    }

    /** Clears the previous results, then loads the document and parses its first page. */
    private void extract(String file) {
        // Reset first so a failed load or a missing marker cannot leave the previous
        // document's values visible through the getters.
        this.title = null;
        this.text = null;
        this.annotation = null;

        parseHeading(loadTexts(file));
    }

    /**
     * Downloads the document once, stores its full text in {@code text} and returns the
     * text of the first page.
     *
     * @return first-page text, or an empty string when loading or stripping failed
     */
    private String loadTexts(String file) {
        String firstPage = "";
        PDDocument doc = null;
        try {
            doc = PDDocument.load(new URL(file));
            this.text = new PDFTextStripper().getText(doc);

            PDFTextStripper firstPageStripper = new PDFTextStripper();
            firstPageStripper.setEndPage(1);
            firstPage = firstPageStripper.getText(doc);
        } catch (IOException e) {
            System.err.println("Failed to read PDF " + file);
            e.printStackTrace();
        } finally {
            // Close in finally so the document is released even when stripping throws.
            if (doc != null) {
                try {
                    doc.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return firstPage;
    }

    /**
     * Derives {@code title} and {@code annotation} from the first-page text.
     *
     * <p>Lines are skipped up to the first one starting with {@link #TITLE_MARKER}
     * (presumably the "УДК" classification line; confirm). The following lines that contain
     * a word with two or more uppercase letters form the title; the text after them, cut at
     * {@link #ANNOTATION_LIMIT} characters, forms the annotation.
     *
     * <p>NOTE(review): consecutive lines are concatenated without a separator, as before;
     * confirm whether a space should be inserted between them.
     */
    private void parseHeading(String firstPage) {
        BufferedReader reader = new BufferedReader(new StringReader(firstPage));
        try {
            String line = reader.readLine();
            // Empty input skips this loop entirely and yields an empty title below.
            while (line != null && !line.toUpperCase().startsWith(TITLE_MARKER)) {
                line = reader.readLine();
                if (line == null) {
                    this.title = NO_TITLE;
                    return;
                }
            }

            StringBuilder heading = new StringBuilder();
            line = reader.readLine();
            while (line != null && hasUppercaseWord(line)) {
                heading.append(line);
                line = reader.readLine();
            }
            this.title = heading.toString().toUpperCase();

            // 'line' now holds the first line after the title, or null at end of input.
            StringBuilder summary = new StringBuilder();
            int remaining = ANNOTATION_LIMIT;
            do {
                if (line == null) {
                    summary.append("...");
                    break;
                }
                if (remaining > line.length()) {
                    remaining -= line.length();
                    summary.append(line);
                } else {
                    summary.append(line, 0, remaining).append("...");
                    remaining = 0;
                }
                line = reader.readLine();
            } while (remaining > 0);
            this.annotation = summary.toString();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /** @return {@code true} when any token of {@code str} has two or more uppercase letters */
    private boolean hasUppercaseWord(String str) {
        StringTokenizer st = new StringTokenizer(str);
        while (st.hasMoreTokens()) {
            if (hasTwoUppercaseLetters(st.nextToken())) {
                return true;
            }
        }
        return false;
    }

    /**
     * @return {@code true} when {@code word} contains at least two uppercase characters
     */
    private boolean hasTwoUppercaseLetters(String word) {
        // int counter with early exit: a byte counter wraps negative past 127 matches.
        int count = 0;
        for (int i = 0; i < word.length(); ++i) {
            if (Character.isUpperCase(word.charAt(i)) && ++count > 1) {
                return true;
            }
        }
        return false;
    }

    /**
     * @return the upper-cased title of the last document, {@code "[n/a]"} when no marker line
     *         was found, or {@code null} before the first {@code setFile} call
     */
    @Override
    public String getTitle() {
        return title;
    }

    /** @return the full text of the last document, or {@code null} when it could not be read */
    @Override
    public String getText() {
        return text;
    }

    /**
     * @return the annotation of the last document, or {@code null} when none was derived
     */
    @Override
    public String getAnnotation() {
        return annotation;
    }
}
@@ -2,7 +2,8 @@
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:mvc="http://www.springframework.org/schema/mvc"
xmlns:context="http://www.springframework.org/schema/context"
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd http://www.springframework.org/schema/mvc http://www.springframework.org/schema/mvc/spring-mvc.xsd http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context.xsd">
xmlns:p="http://www.springframework.org/schema/p" xmlns:tx="http://www.springframework.org/schema/tx"
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd http://www.springframework.org/schema/mvc http://www.springframework.org/schema/mvc/spring-mvc.xsd http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context.xsd http://www.springframework.org/schema/tx http://www.springframework.org/schema/tx/spring-tx.xsd">


<context:annotation-config/>
@@ -12,4 +13,41 @@

<!-- Static resources: requests matching /resources/** are mapped to the /resources/ location. -->
<mvc:resources mapping="/resources/**" location="/resources/"/>

<!-- dataSource: JDBC settings for the MySQL database "neuron" on localhost:3306.
     NOTE(review): the URL and the root/root credentials are hard-coded in this file;
     consider moving them to an external properties source.
     NOTE(review): hibernate.cfg.xml names a different database ("nextbook") - confirm
     which of the two is intended.
     NOTE(review): DriverManagerDataSource is presumably a non-pooling implementation -
     confirm this is acceptable outside development. -->
<bean id="dataSource"
class="org.springframework.jdbc.datasource.DriverManagerDataSource"
p:driverClassName="com.mysql.jdbc.Driver"
p:url="jdbc:mysql://localhost:3306/neuron"
p:username="root"
p:password="root"/>

<!-- setup hibernate session: Hibernate 3 SessionFactory wired to the dataSource bean,
     configured from classpath:hibernate.cfg.xml with AnnotationConfiguration as the
     configuration class.
     NOTE(review): with a dataSource injected here, the connection.* and c3p0 entries in
     hibernate.cfg.xml are presumably overridden - confirm against the Spring documentation
     for LocalSessionFactoryBean. -->
<bean id="sessionFactory"
class="org.springframework.orm.hibernate3.LocalSessionFactoryBean">
<property name="dataSource" ref="dataSource"/>
<property name="configLocation">
<value>classpath:hibernate.cfg.xml</value>
</property>
<property name="configurationClass">
<value>org.hibernate.cfg.AnnotationConfiguration</value>
</property>
<!-- Properties set here in addition to those in hibernate.cfg.xml; show_sql and dialect
     repeat the values that file already declares. -->
<property name="hibernateProperties">
<props>
<prop key="hibernate.show_sql">false</prop>
<prop key="hibernate.dialect">org.hibernate.dialect.MySQL5Dialect</prop>
<prop key="hibernate.connection.charSet">UTF-8</prop>
</props>
</property>
</bean>


<!-- @Transactional: turns on annotation-driven transactions, handled by the
     transactionManager bean defined right after this element. -->
<tx:annotation-driven transaction-manager="transactionManager"/>

<!-- transaction manager: Hibernate 3 transaction manager bound to the sessionFactory bean. -->
<bean id="transactionManager"
class="org.springframework.orm.hibernate3.HibernateTransactionManager">
<property name="sessionFactory" ref="sessionFactory"/>
</bean>

</beans>
@@ -0,0 +1,31 @@
<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE hibernate-configuration PUBLIC
"-//Hibernate/Hibernate Configuration DTD//EN"
"http://hibernate.sourceforge.net/hibernate-configuration-3.0.dtd">

<!-- Hibernate session-factory settings: MySQL connection, c3p0 pool and entity mappings. -->
<hibernate-configuration>
<session-factory>
<!-- Connection settings.
     NOTE(review): this URL names the database "nextbook", while the Spring dataSource bean
     that loads this file uses "neuron" - confirm which one is intended.
     NOTE(review): root/root credentials are hard-coded here as well. -->
<property name="connection.url">jdbc:mysql://localhost:3306/nextbook</property>
<property name="hibernate.bytecode.use_reflection_optimizer">false</property>
<property name="connection.driver_class">com.mysql.jdbc.Driver</property>
<property name="connection.username">root</property>
<property name="connection.password">root</property>
<property name="show_sql">false</property>
<property name="dialect">org.hibernate.dialect.MySQL5Dialect</property>

<!--http://www.mchange.com/projects/c3p0/#configuration-->
<!-- NOTE(review): the provider class is declared twice, once with and once without the
     "hibernate." prefix; presumably only one entry is needed - confirm. -->
<property name="hibernate.connection.provider_class">org.hibernate.connection.C3P0ConnectionProvider</property>
<property name="connection.provider_class">org.hibernate.connection.C3P0ConnectionProvider</property>
<!--Pool Size: between 5 and 30 connections, acquired one at a time-->
<property name="hibernate.c3p0.min_size">5</property>
<property name="hibernate.c3p0.max_size">30</property>
<property name="hibernate.c3p0.acquire_increment">1</property>

<!-- Remaining c3p0 tuning values; see the c3p0 configuration page linked above for their units. -->
<property name="hibernate.c3p0.timeout">100</property>
<property name="hibernate.c3p0.max_statements">50</property>
<property name="hibernate.c3p0.idle_test_period">1000</property>
<property name="hibernate.c3p0.validate">true</property>

<!-- Annotated entity classes registered with this session factory. -->
<mapping class="com.kemi.entities.PdfLink"/>
</session-factory>
</hibernate-configuration>
@@ -39,6 +39,13 @@
<script src="/resources/jquery/jquery-2.1.3.min.js"></script>
<script type="text/javascript">
$(document).ready(function () {
    // Clicking the #sb button triggers start().
    $("#sb").on("click", function () {
        start();
    });

    // Poll: call find() every 5000 ms for as long as the page is open.
    var refreshPeriodMs = 5000;
    window.setInterval(function () {
        find();
    }, refreshPeriodMs);
});
start = function () {
$.get("/new")
.done(function (data) {
console.log(data);
@@ -47,7 +54,17 @@
.fail(function (data) {
console.log(data)
});
});
};
// Requests /find, logs the response and renders it, highlighted, into #one.
// A failed request is only logged; the page content is left untouched.
find = function () {
    var request = $.get("/find");

    request.done(function (payload) {
        console.log(payload);
        $("#one").html(syntaxHighlight(payload));
    });

    request.fail(function (failure) {
        console.log(failure);
    });
};
function syntaxHighlight(json) {
if (typeof json != 'string') {
json = JSON.stringify(json, undefined, 2);
@@ -72,6 +89,7 @@
</script>
</head>
<body>
<p><input type ="button" id = "sb" value="Search Again"/></p>
<pre><code>
<div id="one"></div>
</code></pre>