Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

crawl github instead of using API - much more complete project coverage

  • Loading branch information...
commit e182e0e66c9746ee6ea185f88a39e8f105dbb8ca 1 parent 3933773
@jkk authored
View
3  project.clj
@@ -7,4 +7,5 @@
[hiccup "0.3.6"]
[clj-json "0.3.2"]
[ring/ring-core "0.3.11"]
- [ring/ring-jetty-adapter "0.3.11"]])
+ [ring/ring-jetty-adapter "0.3.11"]
+ [org.jsoup/jsoup "1.6.1"]])
View
47 src/clojuresphere/preprocess.clj
@@ -3,7 +3,7 @@
[clojure.pprint :only [pprint]]
[clojure.data.zip.xml :only [xml-> xml1-> text]]
[clojuresphere.util :only [url-encode qualify-name maven-coord lein-coord
- safe-read-string]])
+ safe-read-string fetch-doc select-els]])
(:require [clojure.xml :as xml]
[clojure.zip :as zip]
[clojure.java.io :as io]))
@@ -57,22 +57,31 @@
(defn fetch-repo [owner repo]
(Thread/sleep 1000) ;crude rate limit
(println "Fetching repo" owner repo)
+ (flush)
(show-repo-info nil owner repo))
-;; TODO: fetch all repos in each repo's network, too
-;; OR: fetch repos mentioned in the homepage url from clojars
-(defn fetch-repos [start-page]
- (Thread/sleep 1000) ;crude rate limit
- (println "Fetching page" start-page) ;FIXME: proper logging
- (search-repos
- github-auth "clojure" :language "clojure" :start-page start-page))
+(defn crawl-repos
+  "Scrapes GitHub's most-watched Clojure listing page by page, starting at
+  page 1, and returns a vector of [owner repo-name] pairs taken from the
+  href of every link matching \"#directory td.title a\". Progress (the page
+  number) is printed as each page is requested, and crawling stops at the
+  first page that yields no matching links."
+  []
+  (println "Crawling repos...")
+  (let [listing-url "https://github.com/languages/Clojure/most_watched"
+        ;; hrefs look like "/owner/repo": the first split segment is empty
+        link->pair (fn [link]
+                     (let [[_ owner repo-name]
+                           (.split (get-in link [:attrs :href]) "/")]
+                       [owner repo-name]))]
+    (loop [page 1
+           found []]
+      (print page " ")
+      (flush)
+      (Thread/sleep 1000) ;crude rate limit
+      (let [doc (fetch-doc listing-url :data {:page page})
+            page-repos (map link->pair
+                            (select-els doc "#directory td.title a"))]
+        (if (empty? page-repos)
+          found
+          (recur (inc page) (into found page-repos)))))))
-(defn fetch-all-repos []
- (->> (iterate inc 1)
- (map fetch-repos)
- (take-while seq)
- (apply concat)
- vec))
+(defn crawl-and-fetch-repos
+  "Crawls the repo listing via crawl-repos, reports how many repos were
+  found, then calls fetch-repo for each [owner repo-name] pair. Returns a
+  vector of the fetched results with any string results dropped."
+  []
+  (let [repos (crawl-repos)]
+    (println "\nFound" (count repos) "repos, fetching info for each...")
+    (->> repos
+         (map (fn [[owner repo-name]]
+                (fetch-repo owner repo-name)))
+         (remove string?) ;clj-github quirk
+         vec)))
;; TODO: some repos have multiple project.clj files (e.g., ring)
(defn fetch-repo-project [repo]
@@ -114,7 +123,7 @@
(defn fetch-github-projects []
(let [;; special exception for clojure itself (written in java)
clojure-repo (first (search-repos github-auth "clojure"))
- repos (cons clojure-repo (fetch-all-repos))]
+ repos (cons clojure-repo (crawl-and-fetch-repos))]
(remove (comp #{"clojure-slick-rogue"} :name :github) ;broken
(fetch-all-repo-projects repos))))
@@ -237,10 +246,11 @@
clojars-github-urls)
extra-github-repos (doall
(remove
- string? ;quirk of clj-github
+ #(or (string? %) (nil? %)) ;quirk of clj-github
(for [[_ owner repo] owner-repos
:when (and owner repo)]
- (fetch-repo owner repo))))]
+ (try (fetch-repo owner repo)
+ (catch Exception _ nil)))))]
(fetch-all-repo-projects extra-github-repos)))
(defn fetch-all-projects [clojars-dir]
@@ -278,7 +288,10 @@
github-extra-projects
clojars-projects)))
+ ;; TODO: make sure it's readable before writing
+
(spit (str (System/getProperty "user.dir")
"/resources/project_graph.clj")
(with-out-str (pprint project-graph)))
+
)
View
37 src/clojuresphere/util.clj
@@ -1,6 +1,8 @@
(ns clojuresphere.util
+ (:use [clojure.walk :only [keywordize-keys]])
(:require [clojure.java.io :as io]
- [clj-json.core :as json]))
+ [clj-json.core :as json])
+ (:import [org.jsoup Jsoup]))
(def ^:dynamic *req* nil)
@@ -38,7 +40,8 @@
(let [[gid aid] (qualify-name name)]
(lein-coord gid aid version)))
([group-id artifact-id version]
- (let [group-id (or group-id artifact-id)]
+ (let [group-id (if (and group-id (seq group-id))
+ group-id artifact-id)]
[(symbol (str group-id "/" artifact-id)) (str version)])))
(defn memory-stats [& {:keys [gc]}]
@@ -118,3 +121,33 @@
date-days (long (/ (date->seconds date) 60 60 24))]
(- now-days date-days)))
+(defn stringify-map
+  "Returns a map with the entries of m where every key and value is a
+  string: keyword keys are converted with `name`, any other key with `str`,
+  and every value with `str`."
+  [m]
+  (into {}
+        (for [[k v] m]
+          [(if (keyword? k) (name k) (str k))
+           (str v)])))
+
+(defn fetch-doc
+  "Fetches url with Jsoup and returns the result of the request.
+
+  Keyword options:
+    :data    - map of request data; keys and values are stringified
+               (default {})
+    :cookies - map of cookies to send; keys and values are stringified
+               (default {})
+    :timeout - value handed to Jsoup's connection timeout, in milliseconds
+               per the Jsoup API (default 10000)
+    :method  - :post issues a POST; any other value issues a GET
+               (default :get)"
+  [url & {:keys [data cookies timeout method]
+          :or {timeout 10000 data {} cookies {} method :get}}]
+  (let [c (-> (Jsoup/connect url)
+              (.userAgent "ClojureSphere")
+              (.data (stringify-map data))
+              ;; honor the caller's :timeout; this was previously hard-coded
+              ;; to 10000, so the option was silently ignored
+              (.timeout (int timeout)))]
+    (doseq [[k v] (stringify-map cookies)]
+      (.cookie c k v))
+    (if (= :post method)
+      (.post c)
+      (.get c))))
+
+(defn- jsoup->clj
+  "Recursively converts a Jsoup element into a plain map with :tag (the tag
+  name), :attrs (the element's attributes with keywordized keys) and
+  :children (a vector of converted child elements, preceded by the
+  element's own text when that text is non-empty)."
+  [el]
+  (let [own-text (.ownText el)
+        kids (mapv jsoup->clj (.children el))]
+    {:tag (.tagName el)
+     :attrs (keywordize-keys (into {} (.attributes el)))
+     :children (if (empty? own-text)
+                 kids
+                 (into [own-text] kids))}))
+
+(defn select-els
+  "Runs the selector sel against doc and returns a lazy sequence of the
+  matched elements, each converted to a Clojure map by jsoup->clj."
+  [doc sel]
+  (for [matched (.select doc sel)]
+    (jsoup->clj matched)))
Please sign in to comment.
Something went wrong with that request. Please try again.