Permalink
Browse files

More parsing goodness, still far from ready

  • Loading branch information...
1 parent 42d3824 commit 342c5a21d046ebf31c9aa6fcee00e8459a31dbdc @leebyron committed Feb 7, 2012
Showing with 173 additions and 28 deletions.
  1. +23 −0 server.js
  2. +129 −18 src/CraigListing.js
  3. +14 −8 src/CraigSource.js
  4. +7 −2 src/Geo.js
View
@@ -1,4 +1,27 @@
+/* TODO:
+
+ * queue and throttle GEO calls.
+ * "Studio" = 1 br?
+ * "Bathroom" with no number and no negative = 1 ba?
+ * Load walkscore into memory on run.
+ * Get some crime data
+ * Get some elevation/slope data
+ * Distance to park?
+ * Store in some DB (Redis?)
+ * Dupe detection
+ * On the market for X days
+ * Map view
+ * Cleanup expired
+
+*/
+
// Entry point: construct a CraigSource and start its query fetch.
var CraigSource = require('./src/CraigSource.js');
var cs = new CraigSource();
cs.fetchQuery();
+/*
+var Geo = require('./src/Geo.js');
+Geo.geocode("Mission San Francisco CA", function (error, geo_data) {
+ process.stdout.write(JSON.stringify(geo_data, null, 2));
+});
+*/
View
@@ -1,65 +1,137 @@
+// TODO: better tokenizing of "phrases"
+
+var CraigRequest = require('./CraigRequest.js');
+var Geo = require('./Geo.js');
+
// Regular expressions used to pull structured fields out of listing text.

// Run against the listing link in fromRSS; the whole match becomes the id.
var ID_RX = /\d{8,14}/;
// tokenizeHTML replaces every INLINE_TAG_RX match with a space and every
// remaining TAG_RX match with a newline.
var INLINE_TAG_RX = /<\/?(i|b|u|strong|td|span)[^>]*?>/ig;
var TAG_RX = /<[^>]+?>/g;
var SPACE_RX = /[\s]{2,}/g;
var NO_RX = /\b(n|no|non)\b/i;
var ALL_CAPS_RX = /[^a-z\n\.]{6,}/g;
var CAP_RX = /^[A-Z]/;
// Used by deriveAddress to decide whether xstreet0 contains a street number.
// NOTE(review): there are no word boundaries and no `i` flag, so a street
// such as "Stone" matches via "one" while "One" does not — confirm whether
// \b(...)\b with /i was intended.
+var NUM_RX = /\d+|one|two|three|four|five|six|seven|eight|nine/;
// Matches one "<!-- CLTAG key=value -->" comment; group 1 is the key and
// group 2 the value. Global flag: exec() advances lastIndex between calls.
+var CLTAG_RX = /<!-- CLTAG ([a-z0-9]+)=(.+?) -->/g;
+
// NOTE(review): `\d{3}?` is a lazy quantifier on a fixed count, so it still
// requires exactly three digits; if an optional area code was intended this
// would need `(\d{3})?` — confirm.
+var PHONE_RX = /1?[\.\-\(\s]?\d{3}?[\.\-\)\s]?\d{3}[\.\-\s]?\d{4}/;
// Any non-digit character; derivePhone strips these from the matched phone.
+var PHONE_JUNK_RX = /[^\d]/g;
// Group 1 is the address that follows `Reply to: <a href="mailto:`.
+var EMAIL_RX = /Reply to: <a href="mailto:([A-Za-z0-9\-\.@]+)/;
// Group 1 is the src attribute of an <img> tag. Global flag: exec() is stateful.
+var IMAGE_RX = /<img.+?src="([^"]+)"/g;
// Group 1 is a 3-6 digit amount following a dollar sign.
var PRICE_RX = /\$([0-9]{3,6})/;
// Group 1 is a 3-4 digit number followed by a square-footage unit.
var SQFT_RX = /(\d{3,4})\+?\s?(ft2|ft²|sqft|sq\.? foot)/;
var SQFT2_RX = /sq\.? footage:?\s*(\d{3,4})/i;
// "<count> br|bd|bed|bedroom": group 1 is a number or a number word.
var BEDROOMS_RX = /([\d\.]+|no|one|two|three|four|five|six|seven|eight|nine)\s?(br|bd|bed|bedroom)s?\b/i;
// "bed(room)(s): <count>": the new optional (room) group moves the count
// from capture group 1 to capture group 2.
-var BEDROOMS2_RX = /bedrooms:?\s*(\d+)/i;
+var BEDROOMS2_RX = /bed(room)?s?:?\s*(\d+)/i;
// "<count> ba|bath|bathroom": group 1 is a number or a number word.
var BATHROOMS_RX = /([\d\.]+|no|one|two|three|four|five|six|seven|eight|nine)\s?(ba|bath|bathroom)s?\b/i;
// "bath(room)(s): <count>": as with BEDROOMS2_RX, the count is now group 2.
-var BATHROOMS2_RX = /bathrooms:?\s*(\d+)/i;
+var BATHROOMS2_RX = /bath(room)?s?:?\s*(\d+)/i;
// Matches a run of text (no newline or period) that mentions dog(s)/pet(s).
var DOGS_RX = /[^\n\.]*\b(dog|pet)s?(\spolicy)?\b(:\s+)?[^\n\.]*/i;
// Group 2 is the sentence-like fragment that mentions "parking" or "garage".
var PARKING_RX = /(^|\.|\*|\n)([^\.\*\n]*?\b(parking|garage)\b[^\.\*\n]*?)($|\.|\*|\n)/i;
// fromRSS returns null when the title or raw description matches this.
+var BLACKLIST_RX = /\b(scam|fraud|fake|sanfranciscobayrentals)\b/i;
/**
 * Record describing a single listing.
 *
 * Every scalar field starts as undefined (this change replaces the former
 * 'unknown' string placeholders); `addressDetail` starts as an empty object
 * and `images` as an empty array.
 *
 * @constructor
 */
function CraigListing() {
+ this.title = undefined;
this.id = undefined;
- this.price = 'unknown';
- this.sqft = 'unknown';
- this.bedrooms = 'unknown';
- this.bathrooms = 'unknown';
- this.dogs = 'unknown';
- this.parking = 'unknown';
+ this.url = undefined;
+ this.listed = undefined;
+ this.email = undefined;
+ this.phone = undefined;
+ this.address = undefined;
+ this.addressDetail = {};
+ this.price = undefined;
+ this.sqft = undefined;
+ this.bedrooms = undefined;
+ this.bathrooms = undefined;
+ this.dogs = undefined;
+ this.parking = undefined;
+ this.images = [];
+ this.description = undefined;
}
+
module.exports = CraigListing;
/**
 * Builds a CraigListing from one RSS item.
 *
 * Reads the item's link, title, dc:date and description. When the title or
 * the raw HTML description matches BLACKLIST_RX, no listing is created and
 * null is returned. Otherwise url, id, title, listed and description are
 * filled in and the derive* helpers populate the remaining fields.
 *
 * @param item  Node exposing get(name, namespace).text(); presumably a
 *              libxmljs element — verify against the caller.
 * @returns {CraigListing|null} The populated listing, or null if blacklisted.
 */
CraigListing.fromRSS = function (item) {
- var htmlDescription = item.get('xmlns:description', 'http://purl.org/rss/1.0/').text()
+ var link = item.get('xmlns:link', 'http://purl.org/rss/1.0/').text();
+ var title = item.get('xmlns:title', 'http://purl.org/rss/1.0/').text();
+ var date = item.get('dc:date', {'dc':'http://purl.org/dc/elements/1.1/'}).text();
+ var htmlDescription = item.get('xmlns:description', 'http://purl.org/rss/1.0/').text();
+
+ if (BLACKLIST_RX.test(title) || BLACKLIST_RX.test(htmlDescription)) {
+ return null;
+ }
var listing = new CraigListing();
- listing.link = item.get('xmlns:link', 'http://purl.org/rss/1.0/').text();
- listing.id = ID_RX.exec(listing.link)[0];
- listing.title = sanitizeText(item.get('xmlns:title', 'http://purl.org/rss/1.0/').text());
- listing.listed = new Date(item.get('dc:date', {'dc':'http://purl.org/dc/elements/1.1/'}).text());
+ listing.url = link;
 // NOTE(review): exec() returns null when the link contains no run of 8-14
 // digits, so the [0] access below would throw a TypeError — confirm whether
 // such links can occur and whether returning null would be preferable.
+ listing.id = ID_RX.exec(link)[0];
+ listing.title = sanitizeText(title);
+ listing.listed = new Date(date);
 // The derive* helpers below read this tokenized (tag-free) description.
listing.description = tokenizeHTML(htmlDescription);
+ derivePhone(listing);
derivePrice(listing);
deriveSqft(listing);
deriveDogs(listing);
deriveBedrooms(listing);
deriveBathrooms(listing);
deriveParking(listing);
 // deriveAddress receives the raw HTML: it reads the CLTAG comments, which
 // tokenizeHTML has already replaced in listing.description.
+ deriveAddress(listing, htmlDescription);
- listing.description = sanitizeText(listing.description);
 // When the CLTAGS start marker is present, keep only the text before it.
+ var clTagStart = htmlDescription.indexOf('<!-- START CLTAGS -->');
+ if (clTagStart !== -1) {
+ listing.description = sanitizeText(tokenizeHTML(htmlDescription.substr(0,clTagStart)));
+ } else {
+ listing.description = sanitizeText(listing.description);
+ }
return listing;
};
/**
 * Fetches this listing's page and fills in email, images and geocoded
 * address details.
 *
 * Requests this.url through CraigRequest.get. If no data comes back, the
 * callback is invoked with the request error and nothing else happens.
 * Otherwise email and images are derived from the page data; when
 * this.address is set it is geocoded and, on success, the result is copied
 * into this.addressDetail before the callback is invoked with the geocode
 * error (if any). Without an address the callback is invoked with null.
 *
 * @param {function} callback  Called exactly once with an error or null.
 */
CraigListing.prototype.loadAdditionalInformation = function (callback) {
- // TODO: load the actual cl post, get EMAIL, ADDRESS, LAT/LON, WALKSCORE, IMAGES, TRANSIT
- callback(null);
+ CraigRequest.get(this.url, function (error, data) {
 // NOTE(review): if data is empty but error is also falsy, the caller sees
 // a "success" callback with nothing loaded — confirm CraigRequest.get
 // always supplies an error in that case.
+ if (!data) {
+ callback(error);
+ return;
+ }
+
+ deriveEmail(this, data);
+ deriveImages(this, data);
+
+ if (this.address) {
+ // geocode
+ Geo.geocode(this.address, function (error, place) {
 // NOTE(review): assumes place carries AddressDetails, Point and
 // ExtendedData whenever error is falsy — a missing sub-object would
 // throw here; verify against the geocoder response.
+ if (!error) {
+ this.addressDetail.address = place.address;
+ this.addressDetail.accuracy = place.AddressDetails.Accuracy;
+ this.addressDetail.coordinate = place.Point.coordinates;
+ this.addressDetail.region = place.ExtendedData.LatLonBox;
+ }
+ callback(error);
 // bind(this) keeps `this` pointing at the listing inside the callback.
+ }.bind(this));
+ } else {
+ callback(null);
+ }
+
+ // WALKSCORE, IMAGES, TRANSIT
+
+ }.bind(this));
};
+
/**
 * Flattens an HTML fragment into plain text: each inline formatting tag
 * (INLINE_TAG_RX) becomes a single space and every remaining tag (TAG_RX)
 * becomes a newline.
 *
 * @param {string} html  Raw HTML fragment.
 * @returns {string} The fragment with all tags replaced.
 */
function tokenizeHTML(html) {
  var inlineTagsSpaced = html.replace(INLINE_TAG_RX, ' ');
  var tagsAsNewlines = inlineTagsSpaced.replace(TAG_RX, '\n');
  return tagsAsNewlines;
}
/**
 * Collects every "<!-- CLTAG key=value -->" comment found in html into a
 * plain object mapping key to value. A key that appears more than once
 * keeps its last value.
 *
 * @param {string} html  Raw HTML to scan.
 * @returns {Object} Map of tag name to tag value (empty if none found).
 */
+function getCLTags(html) {
+ var tags = {};
+ var result;
 // CLTAG_RX is global, so each exec() resumes after the previous match; the
 // loop ends when exec() returns null, which also resets lastIndex to 0.
+ while (result = CLTAG_RX.exec(html)) {
+ tags[result[1]] = result[2];
+ }
+ return tags;
+}
+
var NUMBER_MAP = {
'no' : 0,
'one' : 1,
@@ -112,6 +184,12 @@ function _fixAllCaps(text) {
}
}
/**
 * Sets listing.phone to the first PHONE_RX match in listing.description
 * with every non-digit character removed. Leaves listing.phone untouched
 * when the description contains no match.
 *
 * @param {CraigListing} listing  Listing to update in place.
 */
+function derivePhone(listing) {
+ if (PHONE_RX.test(listing.description)) {
+ listing.phone = PHONE_RX.exec(listing.description)[0].replace(PHONE_JUNK_RX, '');
+ }
+}
+
function derivePrice(listing) {
if (PRICE_RX.test(listing.title)) {
listing.price = sanitizeNumber(PRICE_RX.exec(listing.title)[1]);
@@ -136,7 +214,7 @@ function deriveBedrooms(listing) {
} else if (BEDROOMS_RX.test(listing.description)) {
listing.bedrooms = sanitizeNumber(BEDROOMS_RX.exec(listing.description)[1]);
} else if (BEDROOMS2_RX.test(listing.description)) {
- listing.bedrooms = sanitizeNumber(BEDROOMS2_RX.exec(listing.description)[1]);
+ listing.bedrooms = sanitizeNumber(BEDROOMS2_RX.exec(listing.description)[2]);
}
}
@@ -146,7 +224,7 @@ function deriveBathrooms(listing) {
} else if (BATHROOMS_RX.test(listing.description)) {
listing.bathrooms = sanitizeNumber(BATHROOMS_RX.exec(listing.description)[1]);
} else if (BATHROOMS2_RX.test(listing.description)) {
- listing.bathrooms = sanitizeNumber(BATHROOMS2_RX.exec(listing.description)[1]);
+ listing.bathrooms = sanitizeNumber(BATHROOMS2_RX.exec(listing.description)[2]);
}
}
@@ -178,3 +256,36 @@ function deriveParking(listing) {
}
}
}
+
/**
 * Sets listing.email to the address captured by EMAIL_RX (the mailto target
 * following "Reply to:") in data. Leaves listing.email untouched when data
 * contains no match.
 *
 * @param {CraigListing} listing  Listing to update in place.
 * @param {string} data  Text to search.
 */
+function deriveEmail(listing, data) {
+ if (EMAIL_RX.test(data)) {
+ listing.email = EMAIL_RX.exec(data)[1];
+ }
+}
+
/**
 * Appends the src of every <img> tag found in data to listing.images,
 * skipping any src that contains 'craigslistadtracker'.
 *
 * @param {CraigListing} listing  Listing whose images array is appended to.
 * @param {string} data  Text to search.
 */
+function deriveImages(listing, data) {
+ var match, image;
 // IMAGE_RX is global, so exec() walks through data one match per call and
 // returns null (resetting lastIndex) once no further match exists.
+ while (match = IMAGE_RX.exec(data)) {
+ image = match[1];
+ if (image.indexOf('craigslistadtracker') === -1) {
+ listing.images.push(image);
+ }
+ }
+}
+
/**
 * Builds listing.address from the CLTAG comments embedded in data.
 *
 * City defaults to 'San Francisco' and region to 'CA' when the tags are
 * absent. Nothing is set unless an xstreet0 tag exists. When xstreet1 also
 * exists and xstreet0 has no NUM_RX match, the address is written as an
 * intersection ("xstreet0 and xstreet1 city region"). Otherwise the address
 * is "xstreet0 city region" and xstreet1, if present, is stored in
 * listing.addressDetail.cross.
 *
 * @param {CraigListing} listing  Listing to update in place.
 * @param {string} data  Raw HTML containing the CLTAG comments.
 */
+function deriveAddress(listing, data) {
+ var tags = getCLTags(data);
+ var city = tags['city'] || 'San Francisco';
+ var region = tags['region'] || 'CA';
+ if (tags['xstreet0']) {
 // NOTE(review): NUM_RX matches number words anywhere inside a word (e.g.
 // the "one" in "Stone"), which would route such a street to the else
 // branch — confirm this is acceptable.
+ if (tags['xstreet1'] && !NUM_RX.test(tags['xstreet0'])) {
+ listing.address = tags['xstreet0'] + ' and ' + tags['xstreet1'] + ' ' + city + ' ' + region;
+ } else {
+ // TODO: find apt number
+ listing.address = tags['xstreet0'] + ' ' + city + ' ' + region;
+ if (tags['xstreet1']) {
+ listing.addressDetail.cross = tags['xstreet1'];
+ }
+ }
+ }
+}
View
@@ -24,20 +24,26 @@ CraigSource.prototype.fetchQuery = function () {
var xmlDoc = libxmljs.parseXmlString(data);
var items = xmlDoc.root().find('xmlns:item', 'http://purl.org/rss/1.0/');
for (var ii in items) {
- try {
+ //try {
var listing = CraigListing.fromRSS(items[ii]);
+
+ if (!listing) {
+ // there was a problem, or the listing was blacklisted
+ continue;
+ }
+
// TODO: test to ensure existing ID doesn't already exist
listing.loadAdditionalInformation(function (error) {
if (error) {
console.error(error);
- } else {
- console.log(listing);
+ return;
}
- });
- } catch (exception) {
- console.error(exception);
- }
- break; // just testing one load for now
+ console.log(this);
+ }.bind(listing));
+ //} catch (exception) {
+ // console.error(exception);
+ //}
+// break; // just testing one load for now
}
});
};
View
@@ -2,7 +2,7 @@ var http = require('http');
var API_KEY = 'ABQIAAAAYBdgz0O3BNjt05urf9GWshTLqBKuUfXmWnGsr0L2cAywggmdABS1r1GvhFcvMJF_rb6F7wVp-hpTqA';
-exports.geoCode = function (address, callback) {
+exports.geocode = function (address, callback) {
var path = '/maps/geo?key=' + API_KEY + '&q=' + encodeURIComponent(address) +
'&output=json&sensor=false';
@@ -24,7 +24,12 @@ exports.geoCode = function (address, callback) {
});
response.on('end', function () {
- callback(null, JSON.parse(data));
+ var info = JSON.parse(data);
+ if (!info['Placemark']) {
+ callback(new Error('Bad geo call: ' + JSON.stringify(info)), null);
+ } else {
+ callback(null, info['Placemark'][0]);
+ }
});
});

0 comments on commit 342c5a2

Please sign in to comment.