Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

adding code

  • Loading branch information...
commit f908d938b0674fefd5c172a3fab43c251dc325cf 1 parent f641019
@harryaskham authored
Showing with 23,695 additions and 0 deletions.
  1. +40 −0 TweetLabel.java
  2. BIN  classify/.DS_Store
  3. +65 −0 classify/.svn/all-wcprops
  4. +368 −0 classify/.svn/entries
  5. +30 −0 classify/.svn/text-base/AlchemyClassification.java.svn-base
  6. +174 −0 classify/.svn/text-base/AlchemyClassifier.java.svn-base
  7. +30 −0 classify/.svn/text-base/CalaisClassification.java.svn-base
  8. +148 −0 classify/.svn/text-base/CalaisClassifier.java.svn-base
  9. +139 −0 classify/.svn/text-base/FullAlchemyClassification.java.svn-base
  10. +130 −0 classify/.svn/text-base/FullCalaisClassification.java.svn-base
  11. +148 −0 classify/.svn/text-base/FullTextwiseClassification.java.svn-base
  12. +196 −0 classify/.svn/text-base/NaiveBayes.java.svn-base
  13. +36 −0 classify/.svn/text-base/TextwiseClassification.java.svn-base
  14. +149 −0 classify/.svn/text-base/TextwiseClassifier.java.svn-base
  15. +30 −0 classify/AlchemyClassification.java
  16. +174 −0 classify/AlchemyClassifier.java
  17. +30 −0 classify/CalaisClassification.java
  18. +148 −0 classify/CalaisClassifier.java
  19. +139 −0 classify/FullAlchemyClassification.java
  20. +130 −0 classify/FullCalaisClassification.java
  21. +148 −0 classify/FullTextwiseClassification.java
  22. +196 −0 classify/NaiveBayes.java
  23. +36 −0 classify/TextwiseClassification.java
  24. +149 −0 classify/TextwiseClassifier.java
  25. +53 −0 eval/.svn/all-wcprops
  26. +300 −0 eval/.svn/entries
  27. +821 −0 eval/.svn/text-base/CosineManager.java.svn-base
  28. +188 −0 eval/.svn/text-base/Diversity.java.svn-base
  29. +189 −0 eval/.svn/text-base/Grapher.java.svn-base
  30. +62 −0 eval/.svn/text-base/Pearson.java.svn-base
  31. +135 −0 eval/.svn/text-base/SVMTest.java.svn-base
  32. +495 −0 eval/.svn/text-base/SimilarityMatrix.java.svn-base
  33. +61 −0 eval/.svn/text-base/SimilarityPair.java.svn-base
  34. +174 −0 eval/.svn/text-base/SpearmanRank.java.svn-base
  35. +821 −0 eval/CosineManager.java
  36. +188 −0 eval/Diversity.java
  37. +189 −0 eval/Grapher.java
  38. +62 −0 eval/Pearson.java
  39. +135 −0 eval/SVMTest.java
  40. +495 −0 eval/SimilarityMatrix.java
  41. +61 −0 eval/SimilarityPair.java
  42. +174 −0 eval/SpearmanRank.java
  43. +29 −0 liwc/.svn/all-wcprops
  44. +164 −0 liwc/.svn/entries
  45. +123 −0 liwc/.svn/text-base/FullLIWCClassification.java.svn-base
  46. +447 −0 liwc/.svn/text-base/LIWCDictionary.java.svn-base
  47. +98 −0 liwc/.svn/text-base/LIWCTree.java.svn-base
  48. +49 −0 liwc/.svn/text-base/LIWCWord.java.svn-base
  49. +123 −0 liwc/FullLIWCClassification.java
  50. +447 −0 liwc/LIWCDictionary.java
  51. +98 −0 liwc/LIWCTree.java
  52. +49 −0 liwc/LIWCWord.java
  53. +65 −0 topics/.svn/all-wcprops
  54. +368 −0 topics/.svn/entries
  55. +122 −0 topics/.svn/text-base/FullLDAClassification.java.svn-base
  56. +299 −0 topics/.svn/text-base/FullLLDAClassification.java.svn-base
  57. +203 −0 topics/.svn/text-base/FullSVMClassification.java.svn-base
  58. +530 −0 topics/.svn/text-base/LDATopicModel.java.svn-base
  59. +861 −0 topics/.svn/text-base/LLDATopicModel.java.svn-base
  60. +279 −0 topics/.svn/text-base/LightweightLLDA.java.svn-base
  61. +126 −0 topics/.svn/text-base/MalletLDA.java.svn-base
  62. +440 −0 topics/.svn/text-base/SVMTopicModel.java.svn-base
  63. +81 −0 topics/.svn/text-base/SimpleLLDA.java.svn-base
  64. +110 −0 topics/.svn/text-base/SimpleMalletLDA.java.svn-base
  65. +122 −0 topics/FullLDAClassification.java
  66. +299 −0 topics/FullLLDAClassification.java
  67. +203 −0 topics/FullSVMClassification.java
  68. +530 −0 topics/LDATopicModel.java
  69. +861 −0 topics/LLDATopicModel.java
  70. +279 −0 topics/LightweightLLDA.java
  71. +126 −0 topics/MalletLDA.java
  72. +440 −0 topics/SVMTopicModel.java
  73. +81 −0 topics/SimpleLLDA.java
  74. +110 −0 topics/SimpleMalletLDA.java
  75. +29 −0 twitter/.svn/all-wcprops
  76. +164 −0 twitter/.svn/entries
  77. +334 −0 twitter/.svn/text-base/Profiler.java.svn-base
  78. +133 −0 twitter/.svn/text-base/RawProfile.java.svn-base
  79. +790 −0 twitter/.svn/text-base/SimpleProfile.java.svn-base
  80. +86 −0 twitter/.svn/text-base/SimpleTweet.java.svn-base
  81. +334 −0 twitter/Profiler.java
  82. +133 −0 twitter/RawProfile.java
  83. +790 −0 twitter/SimpleProfile.java
  84. +86 −0 twitter/SimpleTweet.java
  85. +41 −0 types/.svn/all-wcprops
  86. +232 −0 types/.svn/entries
  87. +52 −0 types/.svn/text-base/Category.java.svn-base
  88. +32 −0 types/.svn/text-base/CategoryScore.java.svn-base
  89. +282 −0 types/.svn/text-base/Corpus.java.svn-base
  90. +67 −0 types/.svn/text-base/Document.java.svn-base
  91. +21 −0 types/.svn/text-base/Pair.java.svn-base
  92. +32 −0 types/.svn/text-base/WordScore.java.svn-base
  93. +52 −0 types/Category.java
  94. +32 −0 types/CategoryScore.java
  95. +282 −0 types/Corpus.java
  96. +67 −0 types/Document.java
  97. +21 −0 types/Pair.java
  98. +32 −0 types/WordScore.java
  99. +29 −0 util/.svn/all-wcprops
  100. +164 −0 util/.svn/entries
  101. +43 −0 util/.svn/text-base/AssociatedPress.java.svn-base
  102. +371 −0 util/.svn/text-base/Stemmer.java.svn-base
  103. +69 −0 util/.svn/text-base/Stopwords.java.svn-base
  104. +1,407 −0 util/.svn/text-base/Tools.java.svn-base
  105. +43 −0 util/AssociatedPress.java
  106. +371 −0 util/Stemmer.java
  107. +69 −0 util/Stopwords.java
  108. +1,407 −0 util/Tools.java
View
40 TweetLabel.java
@@ -0,0 +1,40 @@
+package uk.ac.cam.ha293.tweetlabel;
+
+import java.util.Map;
+
+import uk.ac.cam.ha293.tweetlabel.classify.AlchemyClassifier;
+import uk.ac.cam.ha293.tweetlabel.classify.CalaisClassifier;
+import uk.ac.cam.ha293.tweetlabel.classify.TextwiseClassifier;
+import uk.ac.cam.ha293.tweetlabel.topics.LLDATopicModel;
+import uk.ac.cam.ha293.tweetlabel.topics.LightweightLLDA;
+import uk.ac.cam.ha293.tweetlabel.twitter.Profiler;
+import uk.ac.cam.ha293.tweetlabel.twitter.SimpleProfile;
+import uk.ac.cam.ha293.tweetlabel.types.Corpus;
+import uk.ac.cam.ha293.tweetlabel.util.Tools;
+
+public class TweetLabel {
+
+ public static void init() {
+ Tools.init();
+ AlchemyClassifier.init();
+ CalaisClassifier.init();
+ TextwiseClassifier.init();
+ }
+
+ public static void main(String args[]) {
+ TweetLabel.init();
+ //Place API calls here to do topic modeling
+ //...
+ //eg:
+ /*
+ Corpus corpus = Corpus.loadLabelled("alchemy", "allprofiles-unstemmed-alchemy-top3");
+ LLDATopicModel llda = new LLDATopicModel(corpus,1000,100,0,1,0.01);
+ llda.runCVGibbsSampling(0, 9);
+ llda.printDocumentsVerbose(10);
+ */
+ //Otherwise, warn user application is currently doing nothing:
+ System.out.println("Application not configured to perform topic modeling.");
+
+ }
+
+}
View
BIN  classify/.DS_Store
Binary file not shown
View
65 classify/.svn/all-wcprops
@@ -0,0 +1,65 @@
+K 25
+svn:wc:ra_dav:version-url
+V 89
+/svn/tweetlabel/!svn/ver/52/trunk/code/TweetLabel/src/uk/ac/cam/ha293/tweetlabel/classify
+END
+CalaisClassifier.java
+K 25
+svn:wc:ra_dav:version-url
+V 111
+/svn/tweetlabel/!svn/ver/24/trunk/code/TweetLabel/src/uk/ac/cam/ha293/tweetlabel/classify/CalaisClassifier.java
+END
+FullAlchemyClassification.java
+K 25
+svn:wc:ra_dav:version-url
+V 121
+/svn/tweetlabel/!svn/ver/120/trunk/code/TweetLabel/src/uk/ac/cam/ha293/tweetlabel/classify/FullAlchemyClassification.java
+END
+TextwiseClassifier.java
+K 25
+svn:wc:ra_dav:version-url
+V 114
+/svn/tweetlabel/!svn/ver/109/trunk/code/TweetLabel/src/uk/ac/cam/ha293/tweetlabel/classify/TextwiseClassifier.java
+END
+NaiveBayes.java
+K 25
+svn:wc:ra_dav:version-url
+V 105
+/svn/tweetlabel/!svn/ver/34/trunk/code/TweetLabel/src/uk/ac/cam/ha293/tweetlabel/classify/NaiveBayes.java
+END
+AlchemyClassifier.java
+K 25
+svn:wc:ra_dav:version-url
+V 113
+/svn/tweetlabel/!svn/ver/123/trunk/code/TweetLabel/src/uk/ac/cam/ha293/tweetlabel/classify/AlchemyClassifier.java
+END
+CalaisClassification.java
+K 25
+svn:wc:ra_dav:version-url
+V 115
+/svn/tweetlabel/!svn/ver/24/trunk/code/TweetLabel/src/uk/ac/cam/ha293/tweetlabel/classify/CalaisClassification.java
+END
+TextwiseClassification.java
+K 25
+svn:wc:ra_dav:version-url
+V 117
+/svn/tweetlabel/!svn/ver/24/trunk/code/TweetLabel/src/uk/ac/cam/ha293/tweetlabel/classify/TextwiseClassification.java
+END
+FullCalaisClassification.java
+K 25
+svn:wc:ra_dav:version-url
+V 120
+/svn/tweetlabel/!svn/ver/120/trunk/code/TweetLabel/src/uk/ac/cam/ha293/tweetlabel/classify/FullCalaisClassification.java
+END
+FullTextwiseClassification.java
+K 25
+svn:wc:ra_dav:version-url
+V 122
+/svn/tweetlabel/!svn/ver/120/trunk/code/TweetLabel/src/uk/ac/cam/ha293/tweetlabel/classify/FullTextwiseClassification.java
+END
+AlchemyClassification.java
+K 25
+svn:wc:ra_dav:version-url
+V 116
+/svn/tweetlabel/!svn/ver/52/trunk/code/TweetLabel/src/uk/ac/cam/ha293/tweetlabel/classify/AlchemyClassification.java
+END
View
368 classify/.svn/entries
@@ -0,0 +1,368 @@
+10
+
+dir
+89
+https://harryaskham@svn.assembla.com/svn/tweetlabel/trunk/code/TweetLabel/src/uk/ac/cam/ha293/tweetlabel/classify
+https://harryaskham@svn.assembla.com/svn/tweetlabel
+
+
+
+2011-02-12T03:57:22.243172Z
+52
+harryaskham
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+791efb99-9773-405d-8e8f-ceb742288c96
+
+CalaisClassifier.java
+file
+
+
+
+
+2011-03-17T02:10:13.000000Z
+034a3a4066e62509ade5a0d781946ea0
+2011-01-05T11:35:53.088085Z
+24
+harryaskham
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+4574
+
+FullAlchemyClassification.java
+file
+120
+
+
+
+2011-04-09T18:27:10.000000Z
+f382b58dd15af3e2093f1f91b8190059
+2011-04-13T14:21:40.868132Z
+120
+harryaskham
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+4029
+
+TextwiseClassifier.java
+file
+109
+
+
+
+2011-04-03T18:46:42.000000Z
+46a1f2db94ac6dd649daa2600b32a938
+2011-04-03T21:39:04.147918Z
+109
+harryaskham
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+5192
+
+NaiveBayes.java
+file
+
+
+
+
+2011-03-17T02:10:13.000000Z
+554f8e4f4e28b0742872d32576ed0e81
+2011-02-01T22:26:49.684363Z
+34
+harryaskham
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+7470
+
+AlchemyClassifier.java
+file
+123
+
+
+
+2011-04-15T17:46:36.000000Z
+f47f1eed38c3daf4877e486952bc7992
+2011-04-15T18:00:49.992695Z
+123
+harryaskham
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+6403
+
+CalaisClassification.java
+file
+
+
+
+
+2011-03-17T02:10:13.000000Z
+f503ba7a3eb7155935a8a26a9b917ef2
+2011-01-05T11:35:53.088085Z
+24
+harryaskham
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+658
+
+TextwiseClassification.java
+file
+
+
+
+
+2011-03-17T02:10:13.000000Z
+f2232bdaef2e98804505818931b4b02f
+2011-01-05T11:35:53.088085Z
+24
+harryaskham
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+803
+
+FullCalaisClassification.java
+file
+120
+
+
+
+2011-04-09T18:27:21.000000Z
+29738b9a61e0f859d31f4f947c184ec7
+2011-04-13T14:21:40.868132Z
+120
+harryaskham
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+3649
+
+FullTextwiseClassification.java
+file
+120
+
+
+
+2011-04-09T18:27:30.000000Z
+919af2a68361380ccee2ae9ec1314ac5
+2011-04-13T14:21:40.868132Z
+120
+harryaskham
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+4321
+
+AlchemyClassification.java
+file
+
+
+
+
+2011-03-17T02:10:13.000000Z
+e76fdc5d6bce9be4cc91e16a3132a7ef
+2011-02-12T03:57:22.243172Z
+52
+harryaskham
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+533
+
View
30 classify/.svn/text-base/AlchemyClassification.java.svn-base
@@ -0,0 +1,30 @@
+package uk.ac.cam.ha293.tweetlabel.classify;
+
+public class AlchemyClassification {
+
+ private String category;
+ private double score;
+
+ public AlchemyClassification() {
+ category = null;
+ score = -1.0;
+ }
+
+ public AlchemyClassification(String category, double score) {
+ this.category = category;
+ this.score = score;
+ }
+
+ public String getCategory() {
+ return category;
+ }
+
+ public double getScore() {
+ return score;
+ }
+
+ public void print() {
+ System.out.println(category+": "+score);
+ }
+
+}
View
174 classify/.svn/text-base/AlchemyClassifier.java.svn-base
@@ -0,0 +1,174 @@
+package uk.ac.cam.ha293.tweetlabel.classify;
+
+import java.io.IOException;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.xpath.XPathExpressionException;
+import org.xml.sax.SAXException;
+
+import uk.ac.cam.ha293.tweetlabel.util.Tools;
+
+
+import com.alchemyapi.api.*;
+
+public class AlchemyClassifier {
+
+ private static String[] keys;
+ private static int keyIndex;
+ private static AlchemyAPI alchemyAPI;
+ private static int requestCount;
+
+ public AlchemyClassifier() {
+ init(); //NOTE: needs to be called from elsewhere too
+ }
+
+ public static void init() {
+ keys = new String[35];
+ keys[0] = "c497c0fc6dd00dd6fb13a7cb29c3311709d8c344";
+ keys[1] = "9dcb52bec38f96f4d455467453858e44cbeccdaf";
+ keys[2] = "5821b26a00557cf60289b03cb1be2257dc3cf6eb";
+ keys[3] = "8aa54e2962747f2653cd1d848a63d2a463e4d41f";
+ keys[4] = "c0b97edd81b8f4bef113f8d78b4c2755768746b6";
+ keys[5] = "f45c15726375f4fa78644d75dbc2eef2e3528d87";
+ keys[6] = "c3811e9bf08a209569d1c335e576e8b4ac0df0bc";
+ keys[7] = "358855bc94a0fbb2a7fd5ac574099cc574a63932";
+ keys[8] = "73173512d654109b54694292fc87fdf51998d408";
+ keys[9] = "2225a47a67142a25643ac926bd6e79a0b6250dd1";
+ keys[10] = "1601aaa431917f4e3ba62c7a3e1ba6afd3190d10";
+ keys[11] = "b6e21a7ded427e2d9bf0f080edadb7cd0e0f31fe";
+ keys[12] = "31b8d6989ec914b4b36f72eff8df6ac20ee37263";
+ keys[13] = "5eb99cbc7756802962c891b7843f4f6160037c31";
+ keys[14] = "8d442ee0195e3e5343a97e1d4c2b478bd91b8899";
+ keys[15] = "02e3f0ab5fd43d85178ed501e9917227c24a1be7";
+ keys[16] = "e4f32b39fa8ca88ef12f488895797695a9c6f902";
+ keys[17] = "a10efc9508044afdbd3415341e1dfb12ef27fc92";
+ keys[18] = "f809b9795a4761084f22e07b339e293c5556d308";
+ keys[19] = "b0470e00a30d6e9fbf94716baca9c4174d2b9f12";
+ keys[20] = "e7f90d7158b2168479ae944af07f56bad6bc2247";
+ keys[21] = "88bd58ed3df9d77eff384092e62d0788363242cd";
+ keys[22] = "3254e7332a9262a475236563b9984eb8b18fc97e";
+ keys[23] = "4af5c0b33f491ea00499b8dfae827c8eaa3ed128";
+ keys[24] = "15a9800bd191aad9bce2e65b7040eee74c36a110";
+ keys[25] = "51c2e1771680357dbd34bb8352bda62a6b661316";
+ keys[26] = "e21a8e0e597e5b9d3278e9d361080909481d20c9";
+ keys[27] = "371ac1ba6dc6b6809a4c2bb15bc37b78879d2714";
+ keys[27] = "81d9efc046063f6508acaf5052083c2188605817";
+ keys[28] = "b44c8b62149e0d27edc51e7df2feddd397178145";
+ keys[29] = "7431ba2ecc7703222a09dd1d86642837af5697eb";
+ keys[30] = "f80d64cc89038a918f42e95c02b6f7fe6b9308c1";
+ keys[31] = "ca3a625d7c429ace4b8033d4cc9ec3b08acaa878";
+ keys[32] = "eaab19db33996b5404a9be6ba00bc5e8c3851561";
+ keys[33] = "97b981de050c18582c39e37f9340f4c57e2e1500";
+ keys[34] = "084385760b5c9b55f8c8c91b26d5195ba3bfa6dd";
+
+
+ keyIndex = 0;
+ alchemyAPI = AlchemyAPI.GetInstanceFromString(keys[keyIndex]);
+ requestCount = 0;
+ }
+
+ //Either returns a valid AlchemyClassification, or it returns an empty one. Never null...
+ public static AlchemyClassification classifyURL(String url) {
+ updateRequestCount();
+ try {
+ org.w3c.dom.Document d = alchemyAPI.URLGetCategory(url);
+ String category = d.getChildNodes().item(0).getChildNodes().item(9).getTextContent();
+ double score = Double.parseDouble(d.getChildNodes().item(0).getChildNodes().item(11).getTextContent());
+ if(category.isEmpty() || category == null) {
+ return new AlchemyClassification();
+ } else {
+ return new AlchemyClassification(category,score);
+ }
+ } catch(NullPointerException e) {
+ System.err.println("NullPointerException trying to classify a URL:");
+ System.err.println(url);
+ } catch (IllegalArgumentException e) {
+ //URL not well formatted - might be a truncated retweet. This is fine.
+ } catch (XPathExpressionException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ //Page not HTML - image or something. Or - daily limit reached!!!
+ if(e.getMessage().contains("daily-transaction-limit-exceeded")) {
+ System.err.println("Transaction limit alert");
+ /*
+ if(updateKey()) {
+ return classifyText(text);
+ } else {
+ return null;
+ }
+ */
+ return null;
+ }
+ } catch (SAXException e) {
+ e.printStackTrace();
+ } catch (ParserConfigurationException e) {
+ e.printStackTrace();
+ }
+ return new AlchemyClassification();
+ }
+
+ public static AlchemyClassification classifyText(String text) {
+ updateRequestCount();
+ try {
+ org.w3c.dom.Document d = alchemyAPI.TextGetCategory(text);
+ //Tools.prettyPrintDocument(d);
+ String category = d.getChildNodes().item(0).getChildNodes().item(9).getTextContent();
+ double score = Double.parseDouble(d.getChildNodes().item(0).getChildNodes().item(11).getTextContent());
+ if(category == null || category.isEmpty()) {
+ return new AlchemyClassification();
+ } else {
+ return new AlchemyClassification(category,score);
+ }
+ } catch(NullPointerException e) {
+ System.err.println("NullPointerException trying to classify some text:");
+ System.err.println(text);
+ } catch (IllegalArgumentException e) {
+ //text not well formatted - might be an empty tweet? This is (probably) fine.
+ } catch (XPathExpressionException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ //Page not HTML - image or something. Or - daily limit reached!!!
+ if(e.getMessage().contains("daily-transaction-limit-exceeded")) {
+ System.err.println("Transaction limit alert");
+ if(updateKey()) {
+ return classifyText(text);
+ } else {
+ return null;
+ }
+ }
+ } catch (SAXException e) {
+ e.printStackTrace();
+ } catch (ParserConfigurationException e) {
+ e.printStackTrace();
+ }
+ return new AlchemyClassification();
+ }
+
+ private static void updateRequestCount() {
+ requestCount++;
+ if(requestCount % 50 == 0) {
+ //System.out.println("Alchemy Requests Made: "+requestCount);
+
+
+ //Experimental key-change -it works !!!
+ if(requestCount % 29900 == 0) {
+ updateKey();
+ }
+ }
+ }
+
+ private static boolean updateKey() {
+ if(keyIndex < keys.length - 1) {
+ System.out.println("Trying another alchemy key...");
+ keyIndex++;
+ System.out.println("Using key "+keyIndex);
+ alchemyAPI = AlchemyAPI.GetInstanceFromString(keys[keyIndex]);
+ return true;
+ } else {
+ System.out.println("Out of alchemy keys to try");
+ keyIndex=0;
+ return true;
+ //return false;
+ }
+ }
+
+}
View
30 classify/.svn/text-base/CalaisClassification.java.svn-base
@@ -0,0 +1,30 @@
+package uk.ac.cam.ha293.tweetlabel.classify;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+public class CalaisClassification {
+
+ private Map<String,Double> categoryScores;
+
+ public CalaisClassification() {
+ categoryScores = new HashMap<String,Double>();
+ }
+
+ public Set<String> getCategories() {
+ return categoryScores.keySet();
+ }
+
+ public Map<String,Double> getCategoryScores() {
+ return categoryScores;
+ }
+
+ public double lookupScore(String category) {
+ return categoryScores.get(category);
+ }
+
+ public void add(String category, double score) {
+ categoryScores.put(category, score);
+ }
+}
View
148 classify/.svn/text-base/CalaisClassifier.java.svn-base
@@ -0,0 +1,148 @@
+package uk.ac.cam.ha293.tweetlabel.classify;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import com.clearforest.calais.common.CalaisJavaIf;
+import com.clearforest.calais.common.StringUtils;
+
+public class CalaisClassifier {
+
+ private static String[] keys;
+ private static int keyIndex;
+ private static int requestCount;
+ private static CalaisJavaIf calais;
+
+ public CalaisClassifier() {
+ init(); //NOTE: needs to be called from elsewhere too
+ }
+
+ public static void init() {
+ keys = new String[13];
+ keys[0] = "3xw5bayrkykgeseyydwxw4re";
+ keys[1] = "amn8753bkhfhxf3mth3yv6ev";
+ keys[2] = "sg2dn8yfy4uavasmnzehaffg";
+ keys[3] = "v7kg9y8p388q3wjd4arsgtw2";
+ keys[4] = "bkuf9tnx9xsb6hgjf4f2x845";
+ keys[5] = "tnvpbfb94mxgvg9ycaygkqak";
+ keys[6] = "6umcpjh6ufp6v3th6zw6yfgs";
+ keys[7] = "zcuu786ap89euazm88b77kzm";
+ keys[8] = "54dr8xsuy6z72m5p3x6dt5nq";
+ keys[9] = "hk3nwmjzf5pg8qt79sac9yth";
+ keys[10] = "jt5pyvr9f428f3qr4rsyh6dg";
+ keys[11] = "sg87qnkr8c5jjw233ezejqwh";
+ keys[12] = "fkrrhm7w8asp3dchddzvzzs3";
+ keyIndex = 0;
+ requestCount = 8600; //roughly where it left off
+ calais = new CalaisJavaIf(keys[keyIndex]);
+ }
+
+ //Discards a lot of tweets - maybe better to amalgamate some?
+ public static CalaisClassification classifyText(String text) {
+
+ if(text.length() < 100) {
+ //System.err.println("Tweet is <100 chars - too short for Calais");
+ return null;
+ }
+
+ updateRequestCount();
+
+ //Hacky key rotation scheme to hopefully bypass the time and rate limits...
+ keyIndex++;
+ if(keyIndex >= keys.length) keyIndex = 0;
+ calais = new CalaisJavaIf(keys[keyIndex]);
+
+ CalaisClassification result = new CalaisClassification();
+ String xmlResponse = StringUtils.unescapeHTML(calais.callEnlighten(text));
+
+ DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+ DocumentBuilder documentBuilder = null;
+ org.w3c.dom.Document parsedXML = null;
+ try {
+ documentBuilder = dbf.newDocumentBuilder();
+
+ StringReader reader = new StringReader(xmlResponse);
+ InputSource inputSource = new InputSource(reader);
+ parsedXML = documentBuilder.parse(inputSource);
+
+ //System.out.println(text);
+
+ for(int i=0; i<3; i++) {
+ String pairedString = null;;
+ try {
+ pairedString = parsedXML.getChildNodes().item(0).getChildNodes().item(2).getChildNodes().item(3+i).getTextContent();
+ } catch(NullPointerException e) {
+ //System.err.println("NullPointerException found, scores cannot be present in the XML");
+ break;
+ }
+ if(!pairedString.startsWith("Calais")) {
+ break;
+ }
+ pairedString = pairedString.substring(6);
+ int scoreIndex = pairedString.indexOf("0.");
+ if(pairedString.equals("Other")) {
+ result.add("Other", 1.0);
+ break;
+ }
+
+ //"Foolproof" check to make sure we do actually have a category String...
+ if(scoreIndex == -1) {
+ break;
+ }
+
+ String category = pairedString.substring(0, scoreIndex);
+ String score = pairedString.substring(scoreIndex);
+ try {
+ result.add(category, Double.parseDouble(score));
+ } catch(NumberFormatException e) {
+ System.err.println("Got an NFE - "+category+", "+score);
+ }
+ }
+
+ return result;
+
+ } catch (IOException e) {
+ System.err.println("Couldn't parse the XML response - IOException");
+ } catch (ParserConfigurationException e) {
+ System.err.println("Couldn't parse the XML response - Parser Config Error");
+ } catch (SAXException e) {
+ System.err.println("Couldn't parse the XML response - SAX Error. Retrying...");
+
+ //experimental retry
+ return classifyText(text);
+ }
+ return null;
+ }
+
+ private static void updateRequestCount() {
+ requestCount++;
+ if(requestCount % 50 == 0) {
+ System.out.println("Calais Requests Made: "+requestCount);
+
+ if(requestCount % 49900 == 0) {
+ updateKey();
+ }
+ }
+ }
+
+ private static boolean updateKey() {
+ if(keyIndex < keys.length - 1) {
+ System.out.println("Trying another Calais key...");
+ keyIndex++;
+ calais = new CalaisJavaIf(keys[keyIndex]);
+ System.out.println("Using key "+keyIndex);
+ return true;
+ } else {
+ System.out.println("Out of Calais keys to try");
+ return false;
+ }
+ }
+
+}
View
139 classify/.svn/text-base/FullAlchemyClassification.java.svn-base
@@ -0,0 +1,139 @@
+package uk.ac.cam.ha293.tweetlabel.classify;
+
+import java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import uk.ac.cam.ha293.tweetlabel.topics.FullLDAClassification;
+import uk.ac.cam.ha293.tweetlabel.util.Tools;
+
+public class FullAlchemyClassification {
+
+ private long userID;
+ private Map<String,Double> classifications;
+
+ public FullAlchemyClassification(long userID) {
+ this.userID = userID;
+ classifications = new HashMap<String,Double>();
+
+ try {
+ FileInputStream fstream = new FileInputStream("classifications/alchemy/"+userID+".csv");
+ DataInputStream in = new DataInputStream(fstream);
+ BufferedReader br = new BufferedReader(new InputStreamReader(in));
+ String strLine;
+ br.readLine(); //Skp the CSV descriptor
+ while ((strLine = br.readLine()) != null) {
+ String[] split = strLine.split(",");
+ String cat = split[0].substring(1,split[0].length()-1);
+
+ double score = 0.0;
+ //tweet scores
+ if(Integer.parseInt(split[2]) != 0) {
+ score += Double.parseDouble(split[1]) / Double.parseDouble(split[2]);
+ }
+ //url scores
+ if(Integer.parseInt(split[4]) != 0) {
+ score += Double.parseDouble(split[3]) / Double.parseDouble(split[4]);
+ }
+
+ classifications.put(cat,score);
+ }
+ in.close();
+ } catch (Exception e) {
+ System.err.println("Error: " + e.getMessage());
+ }
+
+ //need to normalise the alchemy classifcations, durr
+ double scoreSum = 0.0;
+ for(String cat : classifications.keySet()) {
+ scoreSum += classifications.get(cat);
+ }
+ for(String cat : classifications.keySet()) {
+ classifications.put(cat, classifications.get(cat)/scoreSum);
+ }
+
+ classifications = Tools.sortMapByValueDesc(classifications);
+ }
+
+ public void print() {
+ for(String cat : classifications.keySet()) {
+ System.out.println(cat+": "+classifications.get(cat));
+ }
+ }
+
+ public Set<String> getCategorySet() {
+ return classifications.keySet();
+ }
+
+ public boolean hasCategory(String cat) {
+ if(classifications.keySet().contains(cat)) return true;
+ else return false;
+ }
+
+ public double getScore(String cat) {
+ return classifications.get(cat);
+ }
+
+ public double magnitude() {
+ double sum = 0.0;
+ for(String cat : classifications.keySet()) {
+ sum += (classifications.get(cat)*classifications.get(cat));
+ }
+ sum = Math.sqrt(sum);
+ return sum;
+ }
+
+ public double cosineSimilarity(FullAlchemyClassification c) {
+ Set<String> catSet = new HashSet<String>();
+ catSet.addAll(classifications.keySet());
+ catSet.addAll(c.getCategorySet());
+
+ double score = 0.0;
+ for(String cat : catSet) {
+ if(this.hasCategory(cat) && c.hasCategory(cat)) {
+ score += (this.getScore(cat) * c.getScore(cat));
+ }
+ }
+
+ //normalise by magnitudes
+ Double magnitudes = (this.magnitude() * c.magnitude());
+ score /= magnitudes;
+ if(Double.isNaN(score)) {
+ return 0.0; //NaN caused by zero vectors ie no classifications!
+ }
+ else return score;
+ }
+
+ public double jsDivergence(FullAlchemyClassification c) {
+ Set<String> catSet = new HashSet<String>();
+ catSet.addAll(classifications.keySet());
+ catSet.addAll(c.getCategorySet());
+ Map<String,Double> M = new HashMap<String,Double>();
+ for(String cat : catSet) {
+ if(this.hasCategory(cat) && c.hasCategory(cat)) {
+ M.put(cat, (this.getScore(cat)+c.getScore(cat))/2.0);
+ }
+ }
+ double d1 = 0.0;
+ for(String cat : M.keySet()) {
+ if(this.getCategorySet().contains(cat) ) {
+ d1 += this.getScore(cat) * Math.log(this.getScore(cat)/M.get(cat));
+ }
+ }
+ double d2 = 0.0;
+ for(String cat : M.keySet()) {
+ if(c.getCategorySet().contains(cat)) {
+ d1 += c.getScore(cat) * Math.log(c.getScore(cat)/M.get(cat));
+ }
+ }
+ double score = d1/2.0 + d2/2.0;
+ return score;
+ }
+
+
+}
View
130 classify/.svn/text-base/FullCalaisClassification.java.svn-base
@@ -0,0 +1,130 @@
+package uk.ac.cam.ha293.tweetlabel.classify;
+
+import java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import uk.ac.cam.ha293.tweetlabel.topics.FullLDAClassification;
+import uk.ac.cam.ha293.tweetlabel.util.Tools;
+
+public class FullCalaisClassification {
+
+ private long userID;
+ private Map<String,Double> classifications;
+
+ public FullCalaisClassification(long userID) {
+ this.userID = userID;
+ classifications = new HashMap<String,Double>();
+
+ try {
+ FileInputStream fstream = new FileInputStream("classifications/calais/"+userID+".csv");
+ DataInputStream in = new DataInputStream(fstream);
+ BufferedReader br = new BufferedReader(new InputStreamReader(in));
+ String strLine;
+ br.readLine(); //Skp the CSV descriptor
+ while ((strLine = br.readLine()) != null) {
+ String[] split = strLine.split(",");
+ String cat = split[0].substring(1,split[0].length()-1);
+
+ double score = Double.parseDouble(split[1]) / Double.parseDouble(split[2]);
+
+ classifications.put(cat,score);
+ }
+ in.close();
+ } catch (Exception e) {
+ System.err.println("Error: " + e.getMessage());
+ }
+
+ //need to normalise the classifcations, durr
+ double scoreSum = 0.0;
+ for(String cat : classifications.keySet()) {
+ scoreSum += classifications.get(cat);
+ }
+ for(String cat : classifications.keySet()) {
+ classifications.put(cat, classifications.get(cat)/scoreSum);
+ }
+
+ classifications = Tools.sortMapByValueDesc(classifications);
+ }
+
+ public void print() {
+ for(String cat : classifications.keySet()) {
+ System.out.println(cat+": "+classifications.get(cat));
+ }
+ }
+
+ public Set<String> getCategorySet() {
+ return classifications.keySet();
+ }
+
+ public boolean hasCategory(String cat) {
+ if(classifications.keySet().contains(cat)) return true;
+ else return false;
+ }
+
+ public double getScore(String cat) {
+ return classifications.get(cat);
+ }
+
+ public double magnitude() {
+ double sum = 0.0;
+ for(String cat : classifications.keySet()) {
+ sum += (classifications.get(cat)*classifications.get(cat));
+ }
+ sum = Math.sqrt(sum);
+ return sum;
+ }
+
+ public double cosineSimilarity(FullCalaisClassification c) {
+ Set<String> catSet = new HashSet<String>();
+ catSet.addAll(classifications.keySet());
+ catSet.addAll(c.getCategorySet());
+
+ double score = 0.0;
+ for(String cat : catSet) {
+ if(this.hasCategory(cat) && c.hasCategory(cat)) {
+ score += (this.getScore(cat) * c.getScore(cat));
+ }
+ }
+
+ //normalise by magnitudes
+ Double magnitudes = (this.magnitude() * c.magnitude());
+ score /= magnitudes;
+ if(Double.isNaN(score)) {
+ return 0.0; //NaN caused by zero vectors ie no classifications!
+ }
+ else return score;
+ }
+
+ public double jsDivergence(FullCalaisClassification c) {
+ Set<String> catSet = new HashSet<String>();
+ catSet.addAll(classifications.keySet());
+ catSet.addAll(c.getCategorySet());
+ Map<String,Double> M = new HashMap<String,Double>();
+ for(String cat : catSet) {
+ if(this.hasCategory(cat) && c.hasCategory(cat)) {
+ M.put(cat, (this.getScore(cat)+c.getScore(cat))/2.0);
+ }
+ }
+ double d1 = 0.0;
+ for(String cat : M.keySet()) {
+ if(this.getCategorySet().contains(cat) ) {
+ d1 += this.getScore(cat) * Math.log(this.getScore(cat)/M.get(cat));
+ }
+ }
+ double d2 = 0.0;
+ for(String cat : M.keySet()) {
+ if(c.getCategorySet().contains(cat)) {
+ d1 += c.getScore(cat) * Math.log(c.getScore(cat)/M.get(cat));
+ }
+ }
+ double score = d1/2.0 + d2/2.0;
+ return score;
+ }
+
+}
View
148 classify/.svn/text-base/FullTextwiseClassification.java.svn-base
@@ -0,0 +1,148 @@
+package uk.ac.cam.ha293.tweetlabel.classify;
+
+import java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import uk.ac.cam.ha293.tweetlabel.topics.FullLDAClassification;
+import uk.ac.cam.ha293.tweetlabel.util.Tools;
+
+public class FullTextwiseClassification {
+
+ private long userID;
+ private Map<String,Double> classifications;
+
+ public FullTextwiseClassification(long userID, boolean proper) {
+ this.userID = userID;
+ classifications = new HashMap<String,Double>();
+
+ try {
+ FileInputStream fstream;
+ if(proper) fstream = new FileInputStream("classifications/textwiseproper/"+userID+".csv");
+ else fstream = new FileInputStream("classifications/textwise/"+userID+".csv");
+ DataInputStream in = new DataInputStream(fstream);
+ BufferedReader br = new BufferedReader(new InputStreamReader(in));
+ String strLine;
+ br.readLine(); //Skp the CSV descriptor
+ while ((strLine = br.readLine()) != null) {
+ //typical, some categories have commas in them!
+ String[] initSplit = strLine.split("\"");
+ String cat = initSplit[1];
+ String[] split = initSplit[2].split(",");
+
+ double score = 0.0;
+ if(!proper) {
+ //tweet scores
+ if(Integer.parseInt(split[2]) != 0) {
+ score += Double.parseDouble(split[1]) / Double.parseDouble(split[2]);
+ }
+ //url scores
+ if(Integer.parseInt(split[4]) != 0) {
+ score += Double.parseDouble(split[3]) / Double.parseDouble(split[4]);
+ }
+ } else {
+ score = Double.parseDouble(split[1])/Double.parseDouble(split[2]);
+ }
+
+ if(proper) cat = cat.substring(0,cat.indexOf("/")); //take only the textwise root
+ classifications.put(cat,score);
+ }
+ in.close();
+ } catch (Exception e) {
+ System.err.println("Error: " + e.getMessage());
+ System.err.println(userID);
+ }
+
+ //need to normalise the classifcations, durr
+ double scoreSum = 0.0;
+ for(String cat : classifications.keySet()) {
+ scoreSum += classifications.get(cat);
+ }
+ for(String cat : classifications.keySet()) {
+ classifications.put(cat, classifications.get(cat)/scoreSum);
+ }
+
+ classifications = Tools.sortMapByValueDesc(classifications);
+ }
+
+ public void print() {
+ for(String cat : classifications.keySet()) {
+ System.out.println(cat+": "+classifications.get(cat));
+ }
+ }
+
+ public Set<String> getCategorySet() {
+ return classifications.keySet();
+ }
+
+ public boolean hasCategory(String cat) {
+ if(classifications.keySet().contains(cat)) return true;
+ else return false;
+ }
+
+ public double getScore(String cat) {
+ return classifications.get(cat);
+ }
+
+ public double magnitude() {
+ double sum = 0.0;
+ for(String cat : classifications.keySet()) {
+ sum += (classifications.get(cat)*classifications.get(cat));
+ }
+ sum = Math.sqrt(sum);
+ return sum;
+ }
+
+ public double cosineSimilarity(FullTextwiseClassification c) {
+ Set<String> catSet = new HashSet<String>();
+ catSet.addAll(classifications.keySet());
+ catSet.addAll(c.getCategorySet());
+
+ double score = 0.0;
+ for(String cat : catSet) {
+ if(this.hasCategory(cat) && c.hasCategory(cat)) {
+ score += (this.getScore(cat) * c.getScore(cat));
+ }
+ }
+
+ //normalise by magnitudes
+ Double magnitudes = (this.magnitude() * c.magnitude());
+ score /= magnitudes;
+ if(Double.isNaN(score)) {
+ return 0.0; //NaN caused by zero vectors ie no classifications!
+ }
+ else return score;
+ }
+
+ public double jsDivergence(FullTextwiseClassification c) {
+ Set<String> catSet = new HashSet<String>();
+ catSet.addAll(classifications.keySet());
+ catSet.addAll(c.getCategorySet());
+ Map<String,Double> M = new HashMap<String,Double>();
+ for(String cat : catSet) {
+ if(this.hasCategory(cat) && c.hasCategory(cat)) {
+ M.put(cat, (this.getScore(cat)+c.getScore(cat))/2.0);
+ }
+ }
+ double d1 = 0.0;
+ for(String cat : M.keySet()) {
+ if(this.getCategorySet().contains(cat) ) {
+ d1 += this.getScore(cat) * Math.log(this.getScore(cat)/M.get(cat));
+ }
+ }
+ double d2 = 0.0;
+ for(String cat : M.keySet()) {
+ if(c.getCategorySet().contains(cat)) {
+ d1 += c.getScore(cat) * Math.log(c.getScore(cat)/M.get(cat));
+ }
+ }
+ double score = d1/2.0 + d2/2.0;
+ return score;
+ }
+
+}
View
196 classify/.svn/text-base/NaiveBayes.java.svn-base
@@ -0,0 +1,196 @@
+package uk.ac.cam.ha293.tweetlabel.classify;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import uk.ac.cam.ha293.tweetlabel.liwc.LIWCDictionary;
+import uk.ac.cam.ha293.tweetlabel.types.Category;
+import uk.ac.cam.ha293.tweetlabel.types.CategoryScore;
+import uk.ac.cam.ha293.tweetlabel.util.Tools;
+
+//NOTE: This class gets hairy as it performs naive bayesian classification compatible with the LIWC's ___* based word format
+//NOTE: Use LIWCDictionary.lookupLIWCVersion(word) to get LIWC* form!
+public class NaiveBayes implements Serializable {
+
+ private static final long serialVersionUID = -6242147857055689677L;
+
+ Map<Category,Map<String,Integer>> categories;
+ Map<Category,Integer> frequencyCounts;
+ Map<String,Map<Category,Integer>> words;
+ double totalNumberOfWordsTrained;
+ LIWCDictionary liwc;
+
+ public NaiveBayes(LIWCDictionary liwc) {
+ categories = new HashMap<Category,Map<String,Integer>>();
+ frequencyCounts = new HashMap<Category,Integer>();
+ words = new HashMap<String,Map<Category,Integer>>();
+ totalNumberOfWordsTrained = 0.0;
+ this.liwc = liwc;
+ }
+
+ public void addCategory(Category category) {
+ categories.put(category, new HashMap<String,Integer>());
+ frequencyCounts.put(category,new Integer(0));
+ }
+
+ //Note - since we're getting LIWC* form, we can train using LIWC* form! simple naive bayes works now. thank god.
+ //Also - each category is only going to have at most a count of 1 for any given word... how does this affect the maths?
+ //Maybe NB and training from a dictionary don't mix so well. Will test TODO
+ //Also, I guess we could use this to train a NB model from a twitter profile, which would make use of all the frequency stuff...
+ public void trainLIWC(String document, Category category) {
+ String stripped = Tools.LIWCStripTweet(document);
+ String[] split = stripped.split("\\s+");
+ if(!categories.containsKey(category)) {
+ categories.put(category, new HashMap<String,Integer>());
+ }
+ if(!frequencyCounts.containsKey(category)) {
+ frequencyCounts.put(category, new Integer(0));
+ }
+ Map<String,Integer> wordMapping = categories.get(category);
+ for(String word : split) {
+ String liwcVersion = liwc.LIWCVersionLookup(word);
+ if(liwcVersion == null) continue;
+
+ if(!words.containsKey(liwcVersion)) {
+ words.put(liwcVersion, new HashMap<Category,Integer>());
+ }
+ Map<Category,Integer> categoryMapping = words.get(liwcVersion);
+
+ //If we already have the word stored for this category, increment its count
+ //Otherwise, add it in as 1
+ if(wordMapping.containsKey(liwcVersion)) {
+ wordMapping.put(liwcVersion, new Integer(wordMapping.get(liwcVersion)+1));
+ } else {
+ wordMapping.put(liwcVersion, new Integer(1));
+ }
+
+ //If we already have the category stored for this word, increment its count
+ //Otherwise, add it in as 1
+ if(categoryMapping.containsKey(category)) {
+ categoryMapping.put(category, new Integer(categoryMapping.get(category)+1));
+ } else {
+ categoryMapping.put(category, new Integer(1));
+ }
+
+ //For easy probability calculation later on - also store individual frequencies for each category
+ //To avoid having to sum later on every time
+ totalNumberOfWordsTrained++;
+ frequencyCounts.put(category,new Integer(frequencyCounts.get(category)+1));
+ }
+ }
+
+ public List<CategoryScore> logClassify(String document) {
+ List<CategoryScore> categoryScores = new ArrayList<CategoryScore>();
+ for(Category category : categories.keySet()) {
+ double logP = logPOfCategoryGivenDocument(category, Tools.LIWCStripTweet(document));
+ categoryScores.add(new CategoryScore(category,logP));
+ }
+ return categoryScores;
+ }
+
+ public List<CategoryScore> classify(String document) {
+ List<CategoryScore> categoryScores = new ArrayList<CategoryScore>();
+ for(Category category : categories.keySet()) {
+ double p = pOfCategoryGivenDocument(category, Tools.LIWCStripTweet(document));
+ categoryScores.add(new CategoryScore(category,p));
+ }
+ return categoryScores;
+ }
+
+ public double logPOfCategoryGivenDocument(Category category, String document) {
+ double p = 0.0;
+ String[] split = document.split("\\s+");
+ for(String token : split) {
+ //We add because we're dealing with logs - this would be a multiple product, normally
+ String liwcVersion = liwc.LIWCVersionLookup(token);
+ if(liwcVersion == null) continue;
+ p += logPOfWordGivenCategory(liwcVersion, category);
+ }
+ p += logPOfCategory(category);
+ return p;
+ }
+
+ public double pOfCategoryGivenDocument(Category category, String document) {
+ double p = 1.0;
+ String[] split = document.split("\\s+");
+ for(String token : split) {
+ String liwcVersion = liwc.LIWCVersionLookup(token);
+ if(liwcVersion == null) continue;
+ p *= pOfWordGivenCategory(liwcVersion, category);
+ }
+ p *= pOfCategory(category);
+ return p;
+ }
+
+ public double logPOfCategory(Category category) {
+ double p = Math.log((double)(frequencyCounts.get(category))/totalNumberOfWordsTrained);
+ return p;
+ }
+
+ public double pOfCategory(Category category) {
+ double p = ((double)(frequencyCounts.get(category))/totalNumberOfWordsTrained);
+ return p;
+ }
+
+ public double logPOfWordGivenCategory(String word, Category category) {
+ double tiny = Math.log(0.0000000001);
+
+ if(!words.containsKey(word) || !words.get(word).containsKey(category)) {
+ return tiny; //Maybe we want to output 0 in the cases where we have not seen the word before...
+ }
+
+ double p = Math.log((double)(words.get(word).get(category)) / (double)frequencyCounts.get(category));
+ if(p == 0.0) return tiny;
+ else return p;
+ }
+
+ public double pOfWordGivenCategory(String word, Category category) {
+ double tiny = 0.0000000001;
+
+ if(!words.containsKey(word) || !words.get(word).containsKey(category)) {
+ return tiny; //Maybe we want to output 0 in the cases where we have not seen the word before...
+ }
+
+ double p = (double)(words.get(word).get(category)) / (double)frequencyCounts.get(category);
+ if(p == 0.0) return tiny;
+ else return p;
+ }
+
+ public void print() {
+ for(Category category : categories.keySet()) {
+ System.out.println("Category: "+category.getTitle());
+ for(String word : categories.get(category).keySet()) {
+ System.out.print(word+" ");
+ }
+ System.out.println();
+ }
+ }
+
+ public void tests() {
+ System.out.println("Running tests on the Naive Bayesian Classifier");
+
+ double categorySum = 0.0;
+ for(Category category : categories.keySet()) {
+ double prob = pOfCategory(category);
+ double logProb = logPOfCategory(category);
+ categorySum += prob;
+ System.out.println("P("+category.getTitle()+") = "+prob+", log(P("+category.getTitle()+")) = "+logProb);
+ }
+ System.out.println("Sum = "+categorySum);
+
+ for(Category category : categories.keySet()) {
+ double wordSum = 0.0;
+ for(String word : categories.get(category).keySet()) {
+ double prob = pOfWordGivenCategory(word,category);
+ double logProb = logPOfWordGivenCategory(word,category);
+ wordSum += prob;
+ System.out.println("P("+word+"|"+category.getTitle()+") = "+prob+", log(P("+word+"|"+category.getTitle()+")) = "+logProb);
+ }
+ System.out.println("Sum = "+wordSum);
+ }
+ }
+}
View
36 classify/.svn/text-base/TextwiseClassification.java.svn-base
@@ -0,0 +1,36 @@
+package uk.ac.cam.ha293.tweetlabel.classify;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
/**
 * A single Textwise classification result: a mutable mapping from category
 * label to the weight Textwise assigned it.
 */
public class TextwiseClassification {

    private Map<String, Double> categoryScores;

    /** Creates an empty classification with no category scores. */
    public TextwiseClassification() {
        categoryScores = new HashMap<String, Double>();
    }

    /** Returns the labels of all categories recorded so far. */
    public Set<String> getCategories() {
        return categoryScores.keySet();
    }

    /** Returns the live label-to-weight map backing this classification. */
    public Map<String, Double> getCategoryScores() {
        return categoryScores;
    }

    /**
     * Looks up the weight stored for a category.
     * NOTE(review): unboxing throws NullPointerException for unknown labels.
     */
    public double lookupScore(String category) {
        return categoryScores.get(category);
    }

    /** Records (or overwrites) the weight for a category. */
    public void add(String category, double score) {
        categoryScores.put(category, score);
    }

    /** Dumps every category/score pair to stdout, one per line. */
    public void print() {
        for (Map.Entry<String, Double> entry : categoryScores.entrySet()) {
            System.out.println(entry.getKey() + ": " + entry.getValue());
        }
    }
}
View
149 classify/.svn/text-base/TextwiseClassifier.java.svn-base
@@ -0,0 +1,149 @@
+package uk.ac.cam.ha293.tweetlabel.classify;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLEncoder;
+
+import org.w3c.dom.NodeList;
+
+import uk.ac.cam.ha293.tweetlabel.util.Tools;
+
/**
 * Static client for the Textwise SemanticHacker category API. Rotates through
 * a small pool of API tokens to dodge per-key rate limits.
 * NOTE(review): API tokens are hard-coded in source; consider moving them to
 * configuration.
 */
public class TextwiseClassifier {

    private static String[] keys;       // pool of Textwise API tokens
    private static int keyIndex;        // index of the token currently in use
    private static int requestCount;    // requests made so far (for logging)
    private static String requestURL;   // base request URL built for the current key

    public TextwiseClassifier() {
        init(); //NOTE: needs to be called from elsewhere too
    }

    /** Resets the token pool, counters, and the base request URL. */
    public static void init() {
        keys = new String[7];
        keys[0] = "t6sqta0i";
        keys[1] = "kjk1r0lh";
        keys[2] = "i3x9rte7";
        keys[3] = "fb4dt0gk";
        keys[4] = "gfgkiiv5";
        keys[5] = "w1jv7fli";
        keys[6] = "p3ssii6p";
        // NOTE(review): starts at 1, so keys[0] is only used after a wrap-around
        // in updateKey() — confirm this is intentional.
        keyIndex = 1;
        requestCount = 0;
        updateRequestURL();
    }

    //NOTE: Either need to add content=URL+ENCODED+STRING or uri=someURI to this for a valid request
    public static void updateRequestURL() {
        requestURL = "http://api.semantichacker.com/" + keys[keyIndex] + "/category?filter=text&format=xml&showLabels=true&nCategories=1&useShortLabels=false&";
        //requestURL = "http://api.semantichacker.com/" + keys[keyIndex] + "/category?filter=text&format=xml&showLabels=true&nCategories=1&useShortLabels=true&";
    }

    /**
     * Classifies either raw text or a URL via the Textwise API.
     * Returns an empty TextwiseClassification on most failures; on rate-limit
     * or connection errors it rotates the key and retries.
     * NOTE(review): the retries recurse with no depth bound — a persistently
     * failing request could overflow the stack.
     * @param text the content to classify, or a URI when {@code isURL} is true
     * @param isURL selects the "uri=" form of the request over "content="
     */
    public static TextwiseClassification classify(String text, boolean isURL) {
        updateRequestCount();
        String requestURLSnapshot;
        try {
            if(isURL) {
                requestURLSnapshot = new String(requestURL) + "uri=" + URLEncoder.encode(text, "UTF-8");
            } else {
                requestURLSnapshot = new String(requestURL) + "content=" + URLEncoder.encode(text, "UTF-8");
                // Over-long content requests are abandoned rather than sent.
                if(requestURLSnapshot.length()>1000) return new TextwiseClassification();
            }
            URL url = new URL(requestURLSnapshot);

            // Read the whole response byte-by-byte into a string.
            URLConnection urlConnection = url.openConnection();
            BufferedInputStream buffer = new BufferedInputStream(urlConnection.getInputStream());
            StringBuilder builder = new StringBuilder();
            int byteRead;
            while((byteRead = buffer.read()) != -1) {
                builder.append((char)byteRead);
            }
            buffer.close();
            String xmlString = builder.toString();
            org.w3c.dom.Document doc = Tools.xmlStringToDocument(xmlString);

            //Check for errors in the XML
            String message;
            try {
                // A <message string="..."> element signals an API error.
                message = doc.getElementsByTagName("message").item(0).getAttributes().getNamedItem("string").getTextContent();
                if(message.equals("Over hour limit")) {
                    System.err.println("Hourly limit reached");
                    updateKey();
                    return classify(text,isURL);
                } else if(message.equals("Over minute limit")) {
                    System.err.println("Minute(!?) limit reached");
                    updateKey();
                    return classify(text,isURL);
                } else if(message.equals("Over load limit")) {
                    System.err.println("Load limit reached");
                    updateKey();
                    Thread.sleep(500);
                    return classify(text,isURL);
                } else if(message.equals("Invalid token")) {
                    System.err.println("Invalid token used");
                    updateKey();
                    return classify(text,isURL);
                } else if(message.equals("Failure Fetching Content")) {
                    System.err.println("Couldn't fetch content, skipping");
                    updateKey();
                    return new TextwiseClassification();
                } else if(message.equals("Invalid URI")) {
                    System.err.println(text+" is an invalid URI");
                    return new TextwiseClassification();
                } else {
                    System.err.println("Unknown error: "+message);
                    updateKey();
                    return classify(text,isURL);
                }
            } catch(NullPointerException e) {
                //This is fine - just means there's no message.
            } catch (InterruptedException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }


            // No error message: collect every <category label=... weight=...>
            NodeList categories = doc.getElementsByTagName("category");

            TextwiseClassification result = new TextwiseClassification();

            for(int i=0; i<categories.getLength(); i++) {
                String category = categories.item(i).getAttributes().getNamedItem("label").getTextContent();
                String score = categories.item(i).getAttributes().getNamedItem("weight").getTextContent();
                result.add(category, Double.parseDouble(score));
            }
            return result;
        } catch (UnsupportedEncodingException e) {
            System.err.println("Couldn't URL-Encode the text");
        } catch (MalformedURLException e) {
            System.err.println("URL-Encoded request is badly formed");
        } catch (IOException e) {
            // Connection failure: rotate the key and retry (unbounded).
            System.err.println("Couldn't connect to URL");
            updateKey();
            return classify(text,isURL);
        } catch (NumberFormatException e) {
            System.err.println("Couldn't format the score as a Double");
        }
        return new TextwiseClassification();
    }

    /** Bumps the request counter and logs every 50th request. */
    private static void updateRequestCount() {
        requestCount++;
        if(requestCount % 50 == 0) {
            System.out.println("Textwise Requests Made: "+requestCount);
        }
    }

    /** Advances to the next token (wrapping) and rebuilds the request URL. */
    private static boolean updateKey() {
        System.out.println("Trying another Textwise key...");
        keyIndex++;
        if(keyIndex >= keys.length) keyIndex = 0;
        updateRequestURL();
        System.out.println("Using key "+keyIndex);
        return true;
    }
}
View
30 classify/AlchemyClassification.java
@@ -0,0 +1,30 @@
+package uk.ac.cam.ha293.tweetlabel.classify;
+
+public class AlchemyClassification {
+
+ private String category;
+ private double score;
+
+ public AlchemyClassification() {
+ category = null;
+ score = -1.0;
+ }
+
+ public AlchemyClassification(String category, double score) {
+ this.category = category;
+ this.score = score;
+ }
+
+ public String getCategory() {
+ return category;
+ }
+
+ public double getScore() {
+ return score;
+ }
+
+ public void print() {
+ System.out.println(category+": "+score);
+ }
+
+}
View
174 classify/AlchemyClassifier.java
@@ -0,0 +1,174 @@
+package uk.ac.cam.ha293.tweetlabel.classify;
+
+import java.io.IOException;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.xpath.XPathExpressionException;
+import org.xml.sax.SAXException;
+
+import uk.ac.cam.ha293.tweetlabel.util.Tools;
+
+
+import com.alchemyapi.api.*;
+
+public class AlchemyClassifier {
+
+ private static String[] keys;
+ private static int keyIndex;
+ private static AlchemyAPI alchemyAPI;
+ private static int requestCount;
+
+ public AlchemyClassifier() {
+ init(); //NOTE: needs to be called from elsewhere too
+ }
+
+ public static void init() {
+ keys = new String[35];
+ keys[0] = "c497c0fc6dd00dd6fb13a7cb29c3311709d8c344";
+ keys[1] = "9dcb52bec38f96f4d455467453858e44cbeccdaf";
+ keys[2] = "5821b26a00557cf60289b03cb1be2257dc3cf6eb";
+ keys[3] = "8aa54e2962747f2653cd1d848a63d2a463e4d41f";
+ keys[4] = "c0b97edd81b8f4bef113f8d78b4c2755768746b6";
+ keys[5] = "f45c15726375f4fa78644d75dbc2eef2e3528d87";
+ keys[6] = "c3811e9bf08a209569d1c335e576e8b4ac0df0bc";
+ keys[7] = "358855bc94a0fbb2a7fd5ac574099cc574a63932";
+ keys[8] = "73173512d654109b54694292fc87fdf51998d408";
+ keys[9] = "2225a47a67142a25643ac926bd6e79a0b6250dd1";
+ keys[10] = "1601aaa431917f4e3ba62c7a3e1ba6afd3190d10";
+ keys[11] = "b6e21a7ded427e2d9bf0f080edadb7cd0e0f31fe";
+ keys[12] = "31b8d6989ec914b4b36f72eff8df6ac20ee37263";
+ keys[13] = "5eb99cbc7756802962c891b7843f4f6160037c31";
+ keys[14] = "8d442ee0195e3e5343a97e1d4c2b478bd91b8899";
+ keys[15] = "02e3f0ab5fd43d85178ed501e9917227c24a1be7";
+ keys[16] = "e4f32b39fa8ca88ef12f488895797695a9c6f902";
+ keys[17] = "a10efc9508044afdbd3415341e1dfb12ef27fc92";
+ keys[18] = "f809b9795a4761084f22e07b339e293c5556d308";
+ keys[19] = "b0470e00a30d6e9fbf94716baca9c4174d2b9f12";
+ keys[20] = "e7f90d7158b2168479ae944af07f56bad6bc2247";
+ keys[21] = "88bd58ed3df9d77eff384092e62d0788363242cd";
+ keys[22] = "3254e7332a9262a475236563b9984eb8b18fc97e";
+ keys[23] = "4af5c0b33f491ea00499b8dfae827c8eaa3ed128";
+ keys[24] = "15a9800bd191aad9bce2e65b7040eee74c36a110";
+ keys[25] = "51c2e1771680357dbd34bb8352bda62a6b661316";
+ keys[26] = "e21a8e0e597e5b9d3278e9d361080909481d20c9";
+ keys[27] = "371ac1ba6dc6b6809a4c2bb15bc37b78879d2714";
+ keys[27] = "81d9efc046063f6508acaf5052083c2188605817";
+ keys[28] = "b44c8b62149e0d27edc51e7df2feddd397178145";
+ keys[29] = "7431ba2ecc7703222a09dd1d86642837af5697eb";
+ keys[30] = "f80d64cc89038a918f42e95c02b6f7fe6b9308c1";
+ keys[31] = "ca3a625d7c429ace4b8033d4cc9ec3b08acaa878";
+ keys[32] = "eaab19db33996b5404a9be6ba00bc5e8c3851561";
+ keys[33] = "97b981de050c18582c39e37f9340f4c57e2e1500";
+ keys[34] = "084385760b5c9b55f8c8c91b26d5195ba3bfa6dd";
+
+
+ keyIndex = 0;
+ alchemyAPI = AlchemyAPI.GetInstanceFromString(keys[keyIndex]);
+ requestCount = 0;
+ }
+
+ //Either returns a valid AlchemyClassification, or it returns an empty one. Never null...
+ public static AlchemyClassification classifyURL(String url) {
+ updateRequestCount();
+ try {
+ org.w3c.dom.Document d = alchemyAPI.URLGetCategory(url);
+ String category = d.getChildNodes().item(0).getChildNodes().item(9).getTextContent();
+ double score = Double.parseDouble(d.getChildNodes().item(0).getChildNodes().item(11).getTextContent());
+ if(category.isEmpty() || category == null) {
+ return new AlchemyClassification();
+ } else {
+ return new AlchemyClassification(category,score);
+ }
+ } catch(NullPointerException e) {
+ System.err.println("NullPointerException trying to classify a URL:");
+ System.err.println(url);
+ } catch (IllegalArgumentException e) {
+ //URL not well formatted - might be a truncated retweet. This is fine.
+ } catch (XPathExpressionException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ //Page not HTML - image or something. Or - daily limit reached!!!
+ if(e.getMessage().contains("daily-transaction-limit-exceeded")) {
+ System.err.println("Transaction limit alert");
+ /*
+ if(updateKey()) {
+ return classifyText(text);
+ } else {
+ return null;
+ }
+ */
+ return null;
+ }
+ } catch (SAXException e) {
+ e.printStackTrace();
+ } catch (ParserConfigurationException e) {
+ e.printStackTrace();
+ }
+ return new AlchemyClassification();
+ }
+
+ public static AlchemyClassification classifyText(String text) {
+ updateRequestCount();
+ try {
+ org.w3c.dom.Document d = alchemyAPI.TextGetCategory(text);
+ //Tools.prettyPrintDocument(d);
+ String category = d.getChildNodes().item(0).getChildNodes().item(9).getTextContent();
+ double score = Double.parseDouble(d.getChildNodes().item(0).getChildNodes().item(11).getTextContent());
+ if(category == null || category.isEmpty()) {
+ return new AlchemyClassification();
+ } else {
+ return new AlchemyClassification(category,score);
+ }
+ } catch(NullPointerException e) {
+ System.err.println("NullPointerException trying to classify some text:");
+ System.err.println(text);
+ } catch (IllegalArgumentException e) {
+ //text not well formatted - might be an empty tweet? This is (probably) fine.
+ } catch (XPathExpressionException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ //Page not HTML - image or something. Or - daily limit reached!!!
+ if(e.getMessage().contains("daily-transaction-limit-exceeded")) {
+ System.err.println("Transaction limit alert");
+ if(updateKey()) {
+ return classifyText(text);
+ } else {
+ return null;
+ }
+ }
+ } catch (SAXException e) {
+ e.printStackTrace();
+ } catch (ParserConfigurationException e) {
+ e.printStackTrace();
+ }
+ return new AlchemyClassification();
+ }
+
+ private static void updateRequestCount() {
+ requestCount++;
+ if(requestCount % 50 == 0) {
+ //System.out.println("Alchemy Requests Made: "+requestCount);
+
+
+ //Experimental key-change -it works !!!
+ if(requestCount % 29900 == 0) {
+ updateKey();
+ }
+ }
+ }
+
+ private static boolean updateKey() {
+ if(keyIndex < keys.length - 1) {
+ System.out.println("Trying another alchemy key...");
+ keyIndex++;
+ System.out.println("Using key "+keyIndex);
+ alchemyAPI = AlchemyAPI.GetInstanceFromString(keys[keyIndex]);
+ return true;
+ } else {
+ System.out.println("Out of alchemy keys to try");
+ keyIndex=0;
+ return true;
+ //return false;
+ }
+ }
+
+}
View
30 classify/CalaisClassification.java
@@ -0,0 +1,30 @@
+package uk.ac.cam.ha293.tweetlabel.classify;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
/**
 * A single OpenCalais classification result: a mutable mapping from category
 * label to relevance score.
 */
public class CalaisClassification {

    private Map<String, Double> categoryScores;

    /** Creates an empty classification with no category scores. */
    public CalaisClassification() {
        categoryScores = new HashMap<String, Double>();
    }

    /** Returns the labels of all categories recorded so far. */
    public Set<String> getCategories() {
        return categoryScores.keySet();
    }

    /** Returns the live label-to-score map backing this classification. */
    public Map<String, Double> getCategoryScores() {
        return categoryScores;
    }

    /**
     * Looks up the score stored for a category.
     * NOTE(review): unboxing throws NullPointerException for unknown labels.
     */
    public double lookupScore(String category) {
        return categoryScores.get(category);
    }

    /** Records (or overwrites) the score for a category. */
    public void add(String category, double score) {
        categoryScores.put(category, score);
    }
}
View
148 classify/CalaisClassifier.java
@@ -0,0 +1,148 @@
+package uk.ac.cam.ha293.tweetlabel.classify;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import com.clearforest.calais.common.CalaisJavaIf;
+import com.clearforest.calais.common.StringUtils;
+
+public class CalaisClassifier {
+
+ private static String[] keys;
+ private static int keyIndex;
+ private static int requestCount;
+ private static CalaisJavaIf calais;
+
+ public CalaisClassifier() {
+ init(); //NOTE: needs to be called from elsewhere too
+ }
+
+ public static void init() {
+ keys = new String[13];
+ keys[0] = "3xw5bayrkykgeseyydwxw4re";
+ keys[1] = "amn8753bkhfhxf3mth3yv6ev";
+ keys[2] = "sg2dn8yfy4uavasmnzehaffg";
+ keys[3] = "v7kg9y8p388q3wjd4arsgtw2";
+ keys[4] = "bkuf9tnx9xsb6hgjf4f2x845";
+ keys[5] = "tnvpbfb94mxgvg9ycaygkqak";
+ keys[6] = "6umcpjh6ufp6v3th6zw6yfgs";
+ keys[7] = "zcuu786ap89euazm88b77kzm";
+ keys[8] = "54dr8xsuy6z72m5p3x6dt5nq";
+ keys[9] = "hk3nwmjzf5pg8qt79sac9yth";
+ keys[10] = "jt5pyvr9f428f3qr4rsyh6dg";
+ keys[11] = "sg87qnkr8c5jjw233ezejqwh";
+ keys[12] = "fkrrhm7w8asp3dchddzvzzs3";
+ keyIndex = 0;
+ requestCount = 8600; //roughly where it left off
+ calais = new CalaisJavaIf(keys[keyIndex]);
+ }
+
+ //Discards a lot of tweets - maybe better to amalgamate some?
+ public static CalaisClassification classifyText(String text) {
+
+ if(text.length() < 100) {
+ //System.err.println("Tweet is <100 chars - too short for Calais");
+ return null;
+ }
+
+ updateRequestCount();
+
+ //Hacky key rotation scheme to hopefully bypass the time and rate limits...
+ keyIndex++;
+ if(keyIndex >= keys.length) keyIndex = 0;
+ calais = new CalaisJavaIf(keys[keyIndex]);
+
+ CalaisClassification result = new CalaisClassification();
+ String xmlResponse = StringUtils.unescapeHTML(calais.callEnlighten(text));
+
+ DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+ DocumentBuilder documentBuilder = null;
+ org.w3c.dom.Document parsedXML = null;
+ try {
+ documentBuilder = dbf.newDocumentBuilder();
+
+ StringReader reader = new StringReader(xmlResponse);
+ InputSource inputSource = new InputSource(reader);
+ parsedXML = documentBuilder.parse(inputSource);
+
+ //System.out.println(text);
+
+ for(int i=0; i<3; i++) {
+ String pairedString = null;;
+ try {
+ pairedString = parsedXML.getChildNodes().item(0).getChildNodes().item(2).getChildNodes().item(3+i).getTextContent();
+ } catch(NullPointerException e) {
+ //System.err.println("NullPointerException found, scores cannot be present in the XML");
+ break;
+ }
+ if(!pairedString.startsWith("Calais")) {
+ break;
+ }
+ pairedString = pairedString.substring(6);
+ int scoreIndex = pairedString.indexOf("0.");
+ if(pairedString.equals("Other")) {
+ result.add("Other", 1.0);
+ break;
+ }
+
+ //"Foolproof" check to make sure we do actually have a category String...
+ if(scoreIndex == -1) {
+ break;
+ }
+
+ String category = pairedString.substring(0, scoreIndex);
+ String score = pairedString.substring(scoreIndex);
+ try {
+ result.add(category, Double.parseDouble(score));
+ } catch(NumberFormatException e) {
+ System.err.println("Got an NFE - "+category+", "+score);
+ }
+ }
+
+ return result;
+
+ } catch (IOException e) {
+ System.err.println("Couldn't parse the XML response - IOException");
+ } catch (ParserConfigurationException e) {
+ System.err.println("Couldn't parse the XML response - Parser Config Error");
+ } catch (SAXException e) {
+ System.err.println("Couldn't parse the XML response - SAX Error. Retrying...");
+
+ //experimental retry
+ return classifyText(text);
+ }
+ return null;
+ }
+
+ private static void updateRequestCount() {
+ requestCount++;
+ if(requestCount % 50 == 0) {
+ System.out.println("Calais Requests Made: "+requestCount);
+
+ if(requestCount % 49900 == 0) {
+ updateKey();
+ }
+ }
+ }
+
+ private static boolean updateKey() {
+ if(keyIndex < keys.length - 1) {
+ System.out.println("Trying another Calais key...");
+ keyIndex++;
+ calais = new CalaisJavaIf(keys[keyIndex]);
+ System.out.println("Using key "+keyIndex);
+ return true;
+ } else {
+ System.out.println("Out of Calais keys to try");
+ return false;
+ }
+ }
+
+}
View
139 classify/FullAlchemyClassification.java
@@ -0,0 +1,139 @@
+package uk.ac.cam.ha293.tweetlabel.classify;
+
+import java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import uk.ac.cam.ha293.tweetlabel.topics.FullLDAClassification;
+import uk.ac.cam.ha293.tweetlabel.util.Tools;
+
+public class FullAlchemyClassification {
+
+ private long userID;
+ private Map<String,Double> classifications;
+
+ public FullAlchemyClassification(long userID) {
+ this.userID = userID;
+ classifications = new HashMap<String,Double>();
+
+ try {
+ FileInputStream fstream = new FileInputStream("classifications/alchemy/"+userID+".csv");
+ DataInputStream in = new DataInputStream(fstream);
+ BufferedReader br = new BufferedReader(new InputStreamReader(in));
+ String strLine;
+ br.readLine(); //Skp the CSV descriptor
+ while ((strLine = br.readLine()) != null) {
+ String[] split = strLine.split(",");
+ String cat = split[0].substring(1,split[0].length()-1);
+
+ double score = 0.0;
+ //tweet scores
+ if(Integer.parseInt(split[2]) != 0) {
+ score += Double.parseDouble(split[1]) / Double.parseDouble(split[2]);
+ }
+ //url scores
+ if(Integer.parseInt(split[4]) != 0) {
+ score += Double.parseDouble(split[3]) / Double.parseDouble(split[4]);
+ }
+
+ classifications.put(cat,score);
+ }
+ in.close();
+ } catch (Exception e) {
+ System.err.println("Error: " + e.getMessage());
+ }
+
+ //need to normalise the alchemy classifcations, durr
+ double scoreSum = 0.0;
+ for(String cat : classifications.keySet()) {
+ scoreSum += classifications.get(cat);
+ }
+ for(String cat : classifications.keySet()) {
+ classifications.put(cat, classifications.get(cat)/scoreSum);
+ }
+
+ classifications = Tools.sortMapByValueDesc(classifications);
+ }
+
+ public void print() {
+ for(String cat : classifications.keySet()) {
+ System.out.println(cat+": "+classifications.get(cat));
+ }
+ }
+
+ public Set<String> getCategorySet() {
+ return classifications.keySet();
+ }
+
+ public boolean hasCategory(String cat) {
+ if(classifications.keySet().contains(cat)) return true;
+ else return false;
+ }
+
+ public double getScore(String cat) {
+ return classifications.get(cat);
+ }
+
+ public double magnitude() {
+ double sum = 0.0;
+ for(String cat : classifications.keySet()) {
+ sum += (classifications.get(cat)*classifications.get(cat));
+ }
+ sum = Math.sqrt(sum);
+ return sum;
+ }
+
+ public double cosineSimilarity(FullAlchemyClassification c) {
+ Set<String> catSet = new HashSet<String>();
+ catSet.addAll(classifications.keySet());
+ catSet.addAll(c.getCategorySet());
+
+ double score = 0.0;
+ for(String cat : catSet) {
+ if(this.hasCategory(cat) && c.hasCategory(cat)) {
+ score += (this.getScore(cat) * c.getScore(cat));
+ }
+ }
+
+ //normalise by magnitudes
+ Double magnitudes = (this.magnitude() * c.magnitude());
+ score /= magnitudes;
+ if(Double.isNaN(score)) {
+ return 0.0; //NaN caused by zero vectors ie no classifications!
+ }
+ else return score;
+ }
+
+ public double jsDivergence(FullAlchemyClassification c) {
+ Set<String> catSet = new HashSet<String>();
+ catSet.addAll(classifications.keySet());
+ catSet.addAll(c.getCategorySet());
+ Map<String,Double> M = new HashMap<String,Double>();
+ for(String cat : catSet) {
+ if(this.hasCategory(cat) && c.hasCategory(cat)) {
+ M.put(cat, (this.getScore(cat)+c.getScore(cat))/2.0);
+ }
+ }
+ double d1 = 0.0;
+ for(String cat : M.keySet()) {
+ if(this.getCategorySet().contains(cat) ) {
+ d1 += this.getScore(cat) * Math.log(this.getScore(cat)/M.get(cat));
+ }
+ }
+ double d2 = 0.0;
+ for(String cat : M.keySet()) {
+ if(c.getCategorySet().contains(cat)) {
+ d1 += c.getScore(cat) * Math.log(c.getScore(cat)/M.get(cat));
+ }
+ }
+ double score = d1/2.0 + d2/2.0;
+ return score;
+ }
+
+
+}
View
130 classify/FullCalaisClassification.java
@@ -0,0 +1,130 @@
+package uk.ac.cam.ha293.tweetlabel.classify;
+
+import java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import uk.ac.cam.ha293.tweetlabel.topics.FullLDAClassification;
+import uk.ac.cam.ha293.tweetlabel.util.Tools;
+
+public class FullCalaisClassification {
+
+ private long userID;
+ private Map<String,Double> classifications;
+
+ public FullCalaisClassification(long userID) {
+ this.userID = userID;
+ classifications = new HashMap<String,Double>();
+
+ try {
+ FileInputStream fstream = new FileInputStream("classifications/calais/"+userID+".csv");
+ DataInputStream in = new DataInputStream(fstream);
+ BufferedReader br = new BufferedReader(new InputStreamReader(in));
+ String strLine;
+ br.readLine(); //Skp the CSV descriptor
+ while ((strLine = br.readLine()) != null) {
+ String[] split = strLine.split(",");
+ String cat = split[0].substring(1,split[0].length()-1);
+
+ double score = Double.parseDouble(split[1]) / Double.parseDouble(split[2]);
+
+ classifications.put(cat,score);
+ }
+ in.close();
+ } catch (Exception e) {
+ System.err.println("Error: " + e.getMessage());
+ }
+
+ //need to normalise the classifcations, durr
+ double scoreSum = 0.0;
+ for(String cat : classifications.keySet()) {
+ scoreSum += classifications.get(cat);
+ }
+ for(String cat : classifications.keySet()) {
+ classifications.put(cat, classifications.get(cat)/scoreSum);
+ }
+
+ classifications = Tools.sortMapByValueDesc(classifications);
+ }
+
+ public void print() {
+ for(String cat : classifications.keySet()) {
+ System.out.println(cat+": "+classifications.get(cat));
+ }
+ }
+
+ public Set<String> getCategorySet() {
+ return classifications.keySet();
+ }
+
+ public boolean hasCategory(String cat) {
+ if(classifications.keySet().contains(cat)) return true;
+ else return false;
+ }
+
+ public double getScore(String cat) {
+ return classifications.get(cat);
+ }
+
+ public double magnitude() {
+ double sum = 0.0;
+ for(String cat : classifications.keySet()) {
+ sum += (classifications.get(cat)*classifications.get(cat));
+ }
+ sum = Math.sqrt(sum);
+ return sum;
+ }
+
+ public double cosineSimilarity(FullCalaisClassification c) {
+ Set<String> catSet = new HashSet<String>();
+ catSet.addAll(classifications.keySet());
+ catSet.addAll(c.getCategorySet());
+
+ double score = 0.0;
+ for(String cat : catSet) {
+ if(this.hasCategory(cat) && c.hasCategory(cat)) {
+ score += (this.getScore(cat) * c.getScore(cat));
+ }
+ }
+
+ //normalise by magnitudes
+ Double magnitudes = (this.magnitude() * c.magnitude());
+ score /= magnitudes;
+ if(Double.isNaN(score)) {
+ return 0.0; //NaN caused by zero vectors ie no classifications!
+ }
+ else return score;
+ }
+
+ public double jsDivergence(FullCalaisClassification c) {
+ Set<String> catSet = new HashSet<String>();
+ catSet.addAll(classifications.keySet());
+ catSet.addAll(c.getCategorySet());
+ Map<String,Double> M = new HashMap<String,Double>();
+ for(String cat : catSet) {
+ if(this.hasCategory(cat) && c.hasCategory(cat)) {
+ M.put(cat, (this.getScore(cat)+c.getScore(cat))/2.0);
+ }
+ }
+ double d1 = 0.0;
+ for(String cat : M.keySet()) {
+ if(this.getCategorySet().contains(cat) ) {
+ d1 += this.getScore(cat) * Math.log(this.getScore(cat)/M.get(cat));
+ }
+ }
+ double d2 = 0.0;
+ for(String cat : M.keySet()) {
+ if(c.getCategorySet().contains(cat)) {
+ d1 += c.getScore(cat) * Math.log(c.getScore(cat)/M.get(cat));
+ }
+ }
+ double score = d1/2.0 + d2/2.0;
+ return score;
+ }
+
+}
View
148 classify/FullTextwiseClassification.java
@@ -0,0 +1,148 @@
+package uk.ac.cam.ha293.tweetlabel.classify;
+
+import java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import uk.ac.cam.ha293.tweetlabel.topics.FullLDAClassification;
+import uk.ac.cam.ha293.tweetlabel.util.Tools;
+
+public class FullTextwiseClassification {
+
+ private long userID;
+ private Map<String,Double> classifications;
+
+ public FullTextwiseClassification(long userID, boolean proper) {
+ this.userID = userID;
+ classifications = new HashMap<String,Double>();
+
+ try {
+ FileInputStream fstream;
+ if(proper) fstream = new FileInputStream("classifications/textwiseproper/"+userID+".csv");
+ else fstream = new FileInputStream("classifications/textwise/"+userID+".csv");
+ DataInputStream in = new DataInputStream(fstream);
+ BufferedReader br = new BufferedReader(new InputStreamReader(in));
+ String strLine;
+ br.readLine(); //Skp the CSV descriptor
+ while ((strLine = br.readLine()) != null) {
+ //typical, some categories have commas in them!
+ String[] initSplit = strLine.split("\"");
+ String cat = initSplit[1];
+ String[] split = initSplit[2].split(",");
+
+ double score = 0.0;
+ if(!proper) {
+ //tweet scores
+ if(Integer.parseInt(split[2]) != 0) {
+ score += Double.parseDouble(split[1]) / Double.parseDouble(split[2]);
+ }
+ //url scores
+ if(Integer.parseInt(split[4]) != 0) {
+ score += Double.parseDouble(split[3]) / Double.parseDouble(split[4]);
+ }
+ } else {
+ score = Double.parseDouble(split[1])/Double.parseDouble(split[2]);
+ }
+
+ if(proper) cat = cat.substring(0,cat.indexOf("/")); //take only the textwise root
+ classifications.put(cat,score);
+ }
+ in.close();
+ } catch (Exception e) {
+ System.err.println("Error: " + e.getMessage());
+ System.err.println(userID);
+ }
+
+ //need to normalise the classifcations, durr
+ double scoreSum = 0.0;
+ for(String cat : classifications.keySet()) {
+ scoreSum += classifications.get(cat);
+ }
+ for(String cat : classifications.keySet()) {
+ classifications.put(cat, classifications.get(cat)/scoreSum);
+ }
+
+ classifications = Tools.sortMapByValueDesc(classifications);
+ }
+
+ public void print() {
+ for(String cat : classifications.keySet()) {
+ System.out.println(cat+": "+classifications.get(cat));
+ }
+ }
+
+ public Set<String> getCategorySet() {
+ return classifications.keySet();
+ }
+
+ public boolean hasCategory(String cat) {