| @@ -0,0 +1,106 @@ | ||
| package com.kemi.storage.crawler.impl; | ||
|
|
||
| import com.google.common.collect.Lists; | ||
| import com.google.common.collect.Sets; | ||
| import com.kemi.database.LinksDao; | ||
| import com.kemi.storage.crawler.WebCrawler; | ||
| import org.jsoup.Jsoup; | ||
| import org.jsoup.nodes.Document; | ||
| import org.jsoup.nodes.Element; | ||
| import org.springframework.beans.factory.annotation.Autowired; | ||
| import org.springframework.stereotype.Service; | ||
|
|
||
| import javax.transaction.Transactional; | ||
| import java.net.URL; | ||
| import java.util.HashSet; | ||
| import java.util.List; | ||
|
|
||
| /** | ||
| * Created by Kutsyk on 21.03.15. | ||
| */ | ||
| @Service | ||
| public class WebCrawlerImpl implements WebCrawler { | ||
|
|
||
| @Autowired | ||
| private LinksDao linksDao; | ||
|
|
||
| private HashSet<String> visitedLinks = Sets.newHashSet(); | ||
| private HashSet<String> pdfLinks = Sets.newHashSet(); | ||
| private HashSet<String> unavailableLinks = Sets.newHashSet(); | ||
| private Integer timeout; | ||
|
|
||
| @Transactional | ||
| public void start(URL url, int time) { | ||
| timeout = time; | ||
| getLinksOnPage(url.toString()); | ||
| } | ||
|
|
||
| private boolean isAllowed(String link) { | ||
| return (link.startsWith("nz.ukma.edu.ua") | ||
| || link.startsWith("http://nz.ukma.edu.ua") | ||
| || link.startsWith("https://nz.ukma.edu.ua") | ||
| ||link.startsWith("http://www.ekmair.ukma.edu.ua") | ||
| || link.startsWith("http://ekmair.ukma.edu.ua") | ||
| || link.startsWith("https://ekmair.ukma.edu.ua") | ||
| || link.startsWith("ekmair.ukma.edu.ua")) | ||
| && !unavailableLinks.contains(link); | ||
| } | ||
|
|
||
| private void getLinksOnPage(String url) { | ||
| if (visitedLinks.contains(url)) | ||
| return; | ||
| format(url); | ||
| if (visitedLinks.contains(url)) | ||
| return; | ||
| if (!isAllowed(url)) | ||
| return; | ||
| System.out.println("New link is "+url); | ||
| List<String> references = Lists.newArrayList(); | ||
| boolean pageParsed = parsePage(url, references); | ||
| if(pageParsed) | ||
| visitedLinks.add(url); | ||
| else | ||
| unavailableLinks.add(url); | ||
| for (String link : references) | ||
| getLinksOnPage(link); | ||
| } | ||
|
|
||
| private String format(String url) { | ||
| if (!url.startsWith("http")) | ||
| url = "http://nz.ukma.edu.ua" + url; | ||
| return url; | ||
| } | ||
|
|
||
| private boolean parsePage(String url, List<String> references) { | ||
| Document doc; | ||
| try { | ||
| doc = Jsoup.connect(url).timeout(timeout).get(); | ||
| for (Element link : doc.select("a[href]")) { | ||
| String href = link.attr("href"); | ||
| href = format(href); | ||
| if (!isAllowed(href)) | ||
| continue; | ||
| references.add(href); | ||
| if (href.endsWith("pdf")) { | ||
| System.out.println("New pdf is "+href); | ||
| pdfLinks.add(href); | ||
| linksDao.create(href); | ||
| } | ||
| } | ||
| } catch (Exception e) { | ||
| e.printStackTrace(); | ||
| return false; | ||
| } | ||
| return true; | ||
| } | ||
|
|
||
| @Override | ||
| public HashSet<String> getVisitedLinks() { | ||
| return visitedLinks; | ||
| } | ||
|
|
||
| @Override | ||
| public HashSet<String> getPdfLinks() { | ||
| return pdfLinks; | ||
| } | ||
| } |
| @@ -0,0 +1,11 @@ | ||
| package com.kemi.storage.pdf; | ||
|
|
||
/**
 * Reads a PDF document and exposes pieces of its textual content.
 *
 * <p>Call {@link #setFile(String)} first; the getters then report what was
 * extracted from that file.
 *
 * Created by KutsykV on 23.09.2015.
 */
public interface PDFExtractor {

    /**
     * Selects the document to extract from.
     *
     * @param file location of the PDF; {@code PDFExtractorImpl} parses this
     *             string as a URL
     */
    void setFile(String file);

    /** @return the text extracted from the current document */
    String getText();

    /** @return the title detected for the current document */
    String getTitle();

    /** @return a short excerpt of the current document */
    String getAnnotation();
}
| @@ -0,0 +1,136 @@ | ||
| package com.kemi.storage.pdf.impl; | ||
|
|
||
| import com.ibm.icu.util.StringTokenizer; | ||
| import com.kemi.storage.pdf.PDFExtractor; | ||
| import org.apache.pdfbox.pdmodel.PDDocument; | ||
| import org.apache.pdfbox.util.PDFTextStripper; | ||
| import org.springframework.stereotype.Service; | ||
|
|
||
| import java.io.BufferedReader; | ||
| import java.io.IOException; | ||
| import java.io.StringReader; | ||
| import java.net.URL; | ||
|
|
||
| /** | ||
| * Created by SV on 21.04.2015. | ||
| */ | ||
| @Service | ||
| public class PDFExtractorImpl implements PDFExtractor { | ||
|
|
||
| private String title; | ||
| private String text; | ||
| private String annotation; | ||
|
|
||
| public PDFExtractorImpl() { | ||
| } | ||
|
|
||
| @Override | ||
| public void setFile(String file) { | ||
| extract(file); | ||
| } | ||
|
|
||
| private void extract(String file) { | ||
| extractText(file); | ||
|
|
||
| String text = getFirstPageText(file); | ||
| BufferedReader reader = new BufferedReader(new StringReader(text)); | ||
| try { | ||
| String str = reader.readLine(); | ||
| if (str != null) | ||
| while (!str.toUpperCase().startsWith("УД")) | ||
| if ((str = reader.readLine()) == null) { | ||
| this.title = "[n/a]"; | ||
| return; | ||
| } | ||
|
|
||
| StringBuilder sb = new StringBuilder(); | ||
| do { | ||
| str = reader.readLine(); | ||
| if (str != null && hasUppercaseWord(str)) | ||
| sb.append(str); | ||
| else break; | ||
| } while (true); | ||
| this.title = sb.toString().toUpperCase(); | ||
|
|
||
| sb = new StringBuilder(); | ||
| int count = 255; | ||
| do { | ||
| if (str == null) { | ||
| sb.append("..."); | ||
| break; | ||
| } | ||
| if (count > str.length()) { | ||
| count -= str.length(); | ||
| sb.append(str); | ||
| } else { | ||
| sb.append(str.substring(0, count)).append("..."); | ||
| count = 0; | ||
| } | ||
| str = reader.readLine(); | ||
| } while (count > 0); | ||
| this.annotation = sb.toString(); | ||
|
|
||
| reader.close(); | ||
| } catch (IOException e) { | ||
| e.printStackTrace(); | ||
| } | ||
| } | ||
|
|
||
| private void extractText(String file) { | ||
| try { | ||
| PDDocument doc = PDDocument.load(new URL(file)); | ||
| PDFTextStripper stripper = new PDFTextStripper(); | ||
| this.text = stripper.getText(doc); | ||
| doc.close(); | ||
| } catch (IOException e) { | ||
| e.printStackTrace(); | ||
| } | ||
| } | ||
|
|
||
| private String getFirstPageText(String file) { | ||
| String text = ""; | ||
| try { | ||
| PDDocument doc = PDDocument.load(new URL(file)); | ||
| PDFTextStripper stripper = new PDFTextStripper(); | ||
| stripper.setEndPage(1); | ||
| text = stripper.getText(doc); | ||
| doc.close(); | ||
| } catch (IOException e) { | ||
| e.printStackTrace(); | ||
| } | ||
| return text; | ||
| } | ||
|
|
||
| private boolean hasUppercaseWord(String str) { | ||
| StringTokenizer st = new StringTokenizer(str); | ||
| while (st.hasMoreTokens()) | ||
| if (hasTwoUppercaseLetters(st.nextToken())) | ||
| return true; | ||
| return false; | ||
| } | ||
|
|
||
| private boolean hasTwoUppercaseLetters(String word) { | ||
| byte count = 0; | ||
| for (int i = 0; i < word.length(); ++i) | ||
| if (Character.isUpperCase(word.charAt(i))) | ||
| ++count; | ||
| if (count > 1) | ||
| return true; | ||
| return false; | ||
| } | ||
|
|
||
| @Override | ||
| public String getTitle() { | ||
| return title; | ||
| } | ||
|
|
||
| @Override | ||
| public String getText() { | ||
| return text; | ||
| } | ||
|
|
||
| @Override | ||
| public String getAnnotation() { | ||
| return annotation; | ||
| } | ||
| } |
| @@ -0,0 +1,31 @@ | ||
<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE hibernate-configuration PUBLIC
        "-//Hibernate/Hibernate Configuration DTD//EN"
        "http://hibernate.sourceforge.net/hibernate-configuration-3.0.dtd">

<!-- Hibernate session factory for the local MySQL "nextbook" database, pooled through c3p0. -->
<hibernate-configuration>
    <session-factory>
        <!-- JDBC connection -->
        <property name="connection.url">jdbc:mysql://localhost:3306/nextbook</property>
        <property name="hibernate.bytecode.use_reflection_optimizer">false</property>
        <property name="connection.driver_class">com.mysql.jdbc.Driver</property>
        <!-- NOTE(review): root credentials are committed in plain text; move them to
             environment-specific configuration before using this outside local development. -->
        <property name="connection.username">root</property>
        <property name="connection.password">root</property>
        <property name="show_sql">false</property>
        <property name="dialect">org.hibernate.dialect.MySQL5Dialect</property>

        <!-- c3p0 connection pool; parameter reference: http://www.mchange.com/projects/c3p0/#configuration -->
        <!-- Declared once: the unprefixed "connection.provider_class" entry that used to follow
             was a duplicate, since Hibernate also registers unprefixed names under "hibernate.". -->
        <property name="hibernate.connection.provider_class">org.hibernate.connection.C3P0ConnectionProvider</property>
        <!-- Pool size -->
        <property name="hibernate.c3p0.min_size">5</property>
        <property name="hibernate.c3p0.max_size">30</property>
        <property name="hibernate.c3p0.acquire_increment">1</property>

        <!-- NOTE(review): idle_test_period (1000) is larger than timeout (100), so idle
             connections presumably expire before they are ever tested; confirm this is intended. -->
        <property name="hibernate.c3p0.timeout">100</property>
        <property name="hibernate.c3p0.max_statements">50</property>
        <property name="hibernate.c3p0.idle_test_period">1000</property>
        <property name="hibernate.c3p0.validate">true</property>

        <mapping class="com.kemi.entities.PdfLink"/>
    </session-factory>
</hibernate-configuration>