Permalink
Browse files

sorting files

  • Loading branch information...
j3nnn1 committed Jul 22, 2013
1 parent 2665697 commit 3b831bbdffd75a92197682039ebbaf3fa638d1a9
Showing with 229,111 additions and 33 deletions.
  1. +5 −0 .gitignore
  2. +24 −0 DBformatCsvTsv/csvToJson.rb
  3. +0 −23 dotcloud-test/debug.txt
  4. +5 −0 textmining/README.txt
  5. +68 −0 textmining/bigram.pl
  6. +8 −0 textmining/cap2textmining.pl
  7. +11 −0 textmining/cap3/3.10_textmining.pl
  8. +131 −0 textmining/cap3/3.3_textmining.pl
  9. +67 −0 textmining/cap3/3.4_textmining.pl
  10. +155 −0 textmining/cap3/3.5_textmining.pl
  11. +72 −0 textmining/cap3/3.6_textmining.pl
  12. +80 −0 textmining/cap3/3.7_textmining.pl
  13. +39 −0 textmining/cap3/3.8_textmining.pl
  14. +55 −0 textmining/cap3/3.9_textmining.pl
  15. 0 textmining/cap3/3.pl
  16. +113,809 −0 textmining/cap3/CROSSWD.txt
  17. +113,809 −0 textmining/cap3/CROSSWD.txt.fic
  18. +31 −0 textmining/cap3/exercise3.1textmining.pl
  19. +28 −0 textmining/cap4/4.1_textmining.pl
  20. +13 −0 textmining/cap4/4.2_textminig.pl
  21. +8 −0 textmining/cap4/4.3_textminig.pl
  22. +4 −0 textmining/cap4/4.4_textmining.pl
  23. +18 −0 textmining/cap4/4.5_textmining.pl
  24. +10 −0 textmining/cap4/4.6_textmining.pl
  25. +47 −0 textmining/cap4/4.7_textmining.pl
  26. +12 −0 textmining/cap4/4.8_textmining.pl
  27. +62 −0 textmining/cap5/cap.5.4_sample5.5_cosene.pl
  28. +6 −0 textmining/cap5/cap5-countword-pl.txt
  29. +43 −0 textmining/cap5/cap5.4_productotwovector.pl
  30. +17 −0 textmining/cap5/length_vector.pl
  31. +23 −0 textmining/cap5/producttwovector.pl
  32. +12 −0 textmining/chapter9_acme_umlautify.pl
  33. +55 −0 textmining/count_characters_text.pl
  34. +27 −0 textmining/fall_word_richness.pl
  35. +23 −0 textmining/flip.pl
  36. +45 −0 textmining/perl/ADVANCED/Data-BT-PhoneBill-Call.pm
  37. +36 −0 textmining/perl/ADVANCED/GLOB.PL
  38. +25 −0 textmining/perl/ADVANCED/Some-Module.pm
  39. +20 −0 textmining/perl/ADVANCED/export.pl
  40. +67 −0 textmining/probabilityconditional.pl
  41. +27 −0 textmining/textminingtwitter.pl
  42. +58 −0 textmining/thebagofwords.pl
  43. +46 −0 textmining/twitterperl.pl
  44. +2 −2 twitter/perl/date-calc.pl
  45. +2 −2 twitter/perl/diad.pl
  46. +2 −2 twitter/perl/directtoupdate.pl
  47. +2 −2 twitter/perl/search.pl
  48. +2 −2 twitter/perl/twitter.pl
View
@@ -0,0 +1,5 @@
+
+*.Rout
+*~
+*.csv
+*.nosubir
@@ -0,0 +1,24 @@
+require 'csv'
+require 'json'
+
+options = Hash.new
+options[:col_sep] = ';'
+options[:headers] = false
+
+
+tweet = CSV.read('data/manyeye.csv', options).to_a
+
+#tweetheader = tweet.headers
+tweetheader = ['source','location','followers_count']
+tweets = Array.new
+
+
+tweet.to_a.each { |row|
+ tweetHash = Hash.new
+ tweetHash['source'] = row[0]
+ tweetHash['location'] = row[1]
+ tweetHash['followers_count'] = row[2]
+ tweets.push(tweetHash)
+}
+puts tweets.to_json
+
View
@@ -1,23 +0,0 @@
-dotcloud logs kintrini.www
-
-create account
-sudo easy_install pip && sudo pip install dotcloud
-dotcloud setup
-dotcloud create kintriniapp
-
-create local directory dotcloudtest
-on dotcloudtest put dotcloud.yml
-and a directory helloperl
-dotcloudtest
-|---dotcloud.yml
-|---helloperl
-|---|---app.psgi
-|---|---Makefile.PL
-|---|---myapp.pl
-|---|---static
-
-dotcloud push kintrini .
-or
-dotcloud push kintrini dotcloudtest
-
-
View
@@ -0,0 +1,5 @@
+
+
+Apuntes de Textmining with Perl
+Book: Practical Text Mining with Perl
+http://www.amazon.com/Practical-Mining-Series-Methods-Applications/dp/0470176431
View
@@ -0,0 +1,68 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+# Count letter bigrams (pairs of adjacent letters inside a word) in a text
+# file and print the ten most frequent ones followed by the total.
+#
+# Usage: bigram.pl FILE
+#
+# Steps applied to every input line:
+#   1. every non-letter is replaced by a space
+#   2. the line is lowercased
+#   3. the line is split into words on whitespace
+#   4. every pair of adjacent letters of a word is counted as one bigram
+
+my %freq;         # bigram => number of occurrences
+my $count = 0;    # total number of bigrams seen
+
+my $path = $ARGV[0];
+defined $path or die "uso: $0 ARCHIVO\n";
+
+# Three-argument open with a lexical handle: the file name can never be
+# interpreted as an open mode.
+open(my $fh, '<', $path) or die "no existe archivo '$path': $!\n";
+
+while (my $linea = <$fh>) {
+
+    # Anything that is not a letter (digits, punctuation, hyphens, the
+    # newline) becomes a word separator.
+    $linea =~ s/[^a-zA-Z]/ /g;
+
+    # lc returns the lowercased copy; it has to be assigned back.
+    $linea = lc $linea;
+
+    # split ' ' collapses runs of whitespace and drops the empty leading
+    # field that split /\s/ would produce.
+    foreach my $word (split ' ', $linea) {
+
+        # A word of n letters has n-1 bigrams; a one-letter word has none
+        # because the range 0 .. -1 is empty.
+        foreach my $i (0 .. length($word) - 2) {
+            ++$freq{ substr($word, $i, 2) };
+            ++$count;
+        }
+    }
+}
+close $fh;
+
+# Print the ten most frequent bigrams.
+my $j = 0;
+foreach my $bigram (sort byReverseValues keys %freq) {
+    print "$bigram: $freq{$bigram}\n";
+    last if ++$j == 10;
+}
+
+# Total.
+print "Total de numero de bigramas: $count \n";
+
+# Sort comparator: highest frequency first, ties broken alphabetically.
+# Reads the file-level %freq and the sort variables $a and $b.
+sub byReverseValues {
+    return $freq{$b} <=> $freq{$a} || $a cmp $b;
+}
+
+
@@ -0,0 +1,8 @@
+#!/usr/bin/perl -w
+use strict;
+use common::sense;
+
+#jcmm986 - 0.0.0 -
+
+# Precompiled pattern matching the word "cat" with one space on each side.
+# The variable is declared with my because an undeclared $expr does not
+# compile under strict. qr// stores the pattern itself; a bare / cat /
+# would instead run a match against $_ and store its true/false result.
+my $expr = qr/ cat /;
+
@@ -0,0 +1,11 @@
+#As noted in problem 2.9.b, at present, there are plenty of DNA sequences available to the public
+#at the National Center for Biotechnology Information (NCBI) at its Web page: http://www.ncbi.nlm.nih.gov/ [81].
+
+#Since DNA is text (though its words and grammar are mostly unknown), it makes sense to use Perl and
+#regexes to analyze DNA, which is exactly the point made in Perl for Exploring DNA by Mark LeBlanc
+#and Betsey Dexter Dyer [70]. Note that these authors also enjoy word games, and they introduce the
+# idea of text patterns in DNA by analyzing letter patterns in English words.
+# Even without a background in biology, the book is quite readable, and I recommend it.
+
+#For this problem, get a copy of LeBlanc and Dyer’s book and see how they use Perl for DNA pattern finding.
+# With the data of the NCBI, perhaps you can discover something notable.
@@ -0,0 +1,131 @@
+#Discusses word ANAGRAMS,
+
+#which are two (or more) words using the same group of letters, but with different orders.
+#For example, algorithm is an anagram of logarithm since the former is a permutation of the latter.
+#Instead of allowing all permutations,
+#one challenge is finding word anagrams limited to only certain types of permutations,
+#A - The easiest permutation to check is reversing the letters because there is already a Perl function to do this, reverse.
+#examples of which are given below. Examples can be found by using a word list, for example, Grady Ward’s CROSSWD.TXT from the Moby Word Lists [123].
+#Find all the words that are also words when read backwards.
+#Hint: One way to do this is to create a hash, say %list, where its keys are the words in the word list.
+#Then loop through the keys checking to see if the reversal of the word is also a key, as done in code sample 3.36.
+#Note that this also finds palindromes, that is, words that are the same backwards as forwards, for example, deified.
+#B - Another simple permutation is taking the last letter and putting it first (sometimes called a rotation).
+#Find words for which this rotation is also a word. For example, rotating trumpets produces strumpet,
+#or rotating elects produces select.
+#Hint: Use the function rotate in code sample 3.37 instead of reverse in code sample 3.36.
+
+ #sub rotate {
+ # my $word = $_[0];
+ # my @letters = split(//, $word);
+ # unshift(@letters, pop(@letters));
+ # return join('', @letters);
+ #}
+
+# C - Create a function that is a rotation in the opposite sense of rotate in code sample 3.37.
+#Then find all words that are still words under this new rotation. For example, rotating swear
+#in this way produces wears. Question: how does this list compare with the list from problem 3.3.b?
+
+#______________________________SPANISH_____________________________________
+
+#ANAGRAMAS,
+#Los anagramas son dos o más palabras usando el mismo grupo de letras, pero con diferente orden.
+#Por ejemplo, algoritmo es un anagrama de logaritmo. En lugar de permitir todas las permutaciones,
+#un reto es encontrar anagramas limitados a ciertos tipos de permutaciones.
+#Un ejemplo puede ser encontrado usando una lista de palabras,
+#como Grady Ward’s CROSSWD.TXT de la lista de Moby.
+
+#A - la forma más fácil de revisar la permutación es revertiendo las letras, porque ya existe una función en perl
+#que hace esto, reverse. Encuentra todas las palabras que son tambien palabras cuando son leidas al reves.
+#Pista: Una forma para hacer esto es crear un hash, llamado %list, donde sus llaves son las palabras en la lista
+#de palabras. Entonces recorre las llaves chequeando para ver si la palabra al revés es entonces una llave,
+#Como en el codigo de ejemplo 3.36.
+#Note que esto tambien encuentra PALINDROMES,
+#que son palabras que tienen el mismo significado escritas al revés y de forma normal, por ejemplo deified.
+
+ #foreach $x (sort %list) {
+ # if ( exists($list{reverse($x)}) ) {
+ # print "$x\n";
+ # }
+ #}
+
+#B - Otra simple permutacion es tomando la ultima letra y colocandola de primero (algunas veces llamada rotación).
+#encuentra palabras en las cuales esta rotación es tambien una palabra por ejemplo, rotando trumpets se produce strumpet,
+#o rotando elects se produce select.
+#Pista: Usa la función rotate del código de ejemplo 3.37 en lugar de reverse del código de ejemplo 3.36.
+#Code Sample 3.37: A function to move the last letter of a word to the front for problem 3.3.b.
+
+#sub rotate {
+# my $word = $_[0];
+# my @letters = split(//, $word);
+# unshift(@letters, pop(@letters));
+# return join('', @letters);
+#}
+
+#C - Crea una funcion que se llame rotacion, en opuesto sentido de rotate en el ejemplo de codigo del 3.37
+#Encuentra todas las que todavia son palabras en esta nueva rotación, por ejemplo rotando swear
+#En esta manera produce wears. Pregunta: Como esta lista compara con la lista del problema 3.3b?
+
+#leyendo CROSSWD.txt
+
+use strict;
+use warnings;
+
+# Exercises 3.3.a-c: find the words of CROSSWD.txt that are still words of
+# the list after (A) reversing the letters, (B) moving the last letter to
+# the front, (C) moving the first letter to the end.
+
+my %list;          # every word of the word list, stored as hash keys
+my %palindrome;    # words whose reversal is also in the list (exercise A)
+my %rotate;        # words whose rotate() image is in the list (exercise B)
+my %rotacion;      # words whose rotacion() image is in the list (exercise C)
+
+my $word_file = 'CROSSWD.txt';
+open(my $word_fh, '<', $word_file)
+    or die "No se pudo abrir $word_file: $!\n";
+
+while (my $line = <$word_fh>) {
+    # Strip the line ending, tolerating CRLF files.
+    $line =~ s/\r?\n\z//;
+    # A blank line is not a word.
+    next if $line eq '';
+    $list{$line} = undef;
+}
+
+close $word_fh;
+
+# Move the last letter of a word to the front: "trumpets" -> "strumpet".
+# Words shorter than two letters are returned unchanged.
+sub rotate {
+    my $word = $_[0];
+    return $word if length($word) < 2;
+    return substr($word, -1) . substr($word, 0, -1);
+}
+
+# Move the first letter of a word to the end: "swear" -> "wears".
+# This is the inverse of rotate(). Words shorter than two letters are
+# returned unchanged.
+sub rotacion {
+    my $word = shift;
+    return $word if length($word) < 2;
+    return substr($word, 1) . substr($word, 0, 1);
+}
+
+# Print "word : image" for every word of %$words whose image under
+# $transform is itself a key of %$words, and record the word as a key of
+# %$found.
+#   $words     - hash reference whose keys are the word list
+#   $transform - code reference taking a word and returning its image
+#   $found     - hash reference that receives the matching words
+sub report_matches {
+    my ($words, $transform, $found) = @_;
+
+    # sort keys, not sort %hash: flattening the hash would also iterate
+    # over its undef values.
+    for my $word (sort keys %{$words}) {
+        my $image = $transform->($word);
+        next unless exists $words->{$image};
+        print "$word : $image\n";
+        $found->{$word} = undef;
+    }
+    return;
+}
+
+print "Ejercicio A \n";
+# scalar forces reverse to reverse the characters of the string instead of
+# the order of a one-element list.
+report_matches(\%list, sub { return scalar reverse $_[0] }, \%palindrome);
+print "Total palindromes: ". scalar (keys (%palindrome ))."\n";
+print "Total palabras CROSSWD.txt: " . scalar (keys (%list))."\n";
+
+print "Ejercicio B \n";
+report_matches(\%list, \&rotate, \%rotate);
+print "Total rotate: ". scalar (keys (%rotate ))."\n";
+print "Total palabras CROSSWD.txt: " . scalar (keys (%list))."\n";
+
+print "Ejercicio C \n";
+report_matches(\%list, \&rotacion, \%rotacion);
+print "Total palindromes: ". scalar (keys (%palindrome ))."\n";
+print "Total rotate: ". scalar (keys (%rotate ))."\n";
+print "Total rotacion: ". scalar (keys (%rotacion ))."\n";
+print "Total palabras CROSSWD.txt: " . scalar (keys (%list))."\n";
@@ -0,0 +1,67 @@
+#Using a word list, for example, Grady Ward’s CROSSWD.TXT from the Moby Word Lists [123],
+#find all the words where every letter appears exactly twice using a regex. For example,
+#this is true of the word hotshots.
+#This property is called an isogram. For more information on these, see section 29 and figure
+# 29c of Ross Eckler’s Making the Alphabet Dance
+
+#First hint: Sort the letters of each word into alphabetical order,
+#then try to create a regex that matches pairs of letters.
+#Note that /^((\w)\1)+$/ seems promising, but does not work.
+#Second hint: define a pair of letters regex using qr// as shown below.
+$pattern = qr/(\w)\1/;
+
+#Then use the regex /^$pattern+$/. This regex allows false positives (describe them).
+#Is there a simple way to correct this?
+
+
+#____________SPANISH______________
+
+#USANDO una lista de palabras, por ejemplo Grady ward's CROSSWD.TXT de la lista de palabras Moby,
+#Encuentra todas las palabras donde cada letra aparece exactamente 2 veces usando una expresión regular,
+#por Ejemplo, esto es verdadero en la palabra hotshots
+#Esta propiedad es llamada Isograma, para mas información de esto, ver la seccion 29 y figura 29c
+
+#PRIMERA PISTA: Organiza las letras de cada palabra en orden alfabetico,
+#Entonces intenta crear una expresion regular que extraiga solo par de letras,
+#Note que /^((\w)\1)+$/ parece prometer, pero esto no funciona.
+#SEGUNDA PISTA: define un par de letras en regex usando qr// como se muestra a continuacion.
+#$pattern = qr/(\w)\1/;
+#Entonces usa la expresion regular /^$pattern+$/. esta expresion regular permite falsos positivos (descríbelos)
+#Existe una manera simple de corregir esto?
+
+use strict;
+use warnings;
+
+# Exercise 3.4: list the words of CROSSWD.txt in which every letter occurs
+# exactly twice (for example "hotshots").
+
+my %list;        # every word of the word list, stored as hash keys
+my %isograma;    # words in which every letter occurs exactly twice
+
+my $word_file = 'CROSSWD.txt';
+open(my $word_fh, '<', $word_file)
+    or die "No se pudo abrir $word_file: $!\n";
+
+while (my $line = <$word_fh>) {
+    # Strip the line ending, tolerating CRLF files.
+    $line =~ s/\r?\n\z//;
+    # A blank line is not a word.
+    next if $line eq '';
+    $list{$line} = undef;
+}
+
+close $word_fh;
+
+# Return $word when every one of its characters occurs exactly twice,
+# otherwise return undef (an empty list in list context).
+sub isogram {
+    my $word = shift;
+    return unless defined $word;
+
+    # Sorting the letters puts equal letters next to each other, so the
+    # word qualifies when the sorted string is a sequence of doubled
+    # letters.
+    my @letters = split //, $word;
+    my $sorted  = join '', sort @letters;
+
+    # (\w)\1 matches one doubled letter. The negative lookahead (?!\1)
+    # rejects a pair followed by the same letter again; without it a letter
+    # occurring four or six times would be accepted as several pairs.
+    return $word if $sorted =~ /\A(?:(\w)\1(?!\1))+\z/;
+    return;
+}
+
+# sort keys, not sort %hash: flattening the hash would also iterate over
+# its undef values.
+foreach my $word (sort keys %list) {
+    my $match = isogram($word);
+    # Test the result directly; using it as a hash key would look up the
+    # empty-string key for every word that is not an isogram.
+    next unless defined $match;
+    print "$word : ".$match."\n";
+    $isograma{$word} = undef;
+}
+
+print "Total isogramas: ". scalar (keys (%isograma ))."\n";
+print "Total palabras CROSSWD.txt: " . scalar (keys (%list))."\n";
Oops, something went wrong.

0 comments on commit 3b831bb

Please sign in to comment.