Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

sorting files

  • Loading branch information...
commit 3b831bbdffd75a92197682039ebbaf3fa638d1a9 1 parent 2665697
@j3nnn1 authored
Showing with 229,111 additions and 33 deletions.
  1. +5 −0 .gitignore
  2. +24 −0 DBformatCsvTsv/csvToJson.rb
  3. +0 −23 dotcloud-test/debug.txt
  4. +5 −0 textmining/README.txt
  5. +68 −0 textmining/bigram.pl
  6. +8 −0 textmining/cap2textmining.pl
  7. +11 −0 textmining/cap3/3.10_textmining.pl
  8. +131 −0 textmining/cap3/3.3_textmining.pl
  9. +67 −0 textmining/cap3/3.4_textmining.pl
  10. +155 −0 textmining/cap3/3.5_textmining.pl
  11. +72 −0 textmining/cap3/3.6_textmining.pl
  12. +80 −0 textmining/cap3/3.7_textmining.pl
  13. +39 −0 textmining/cap3/3.8_textmining.pl
  14. +55 −0 textmining/cap3/3.9_textmining.pl
  15. 0  textmining/cap3/3.pl
  16. +113,809 −0 textmining/cap3/CROSSWD.txt
  17. +113,809 −0 textmining/cap3/CROSSWD.txt.fic
  18. +31 −0 textmining/cap3/exercise3.1textmining.pl
  19. +28 −0 textmining/cap4/4.1_textmining.pl
  20. +13 −0 textmining/cap4/4.2_textminig.pl
  21. +8 −0 textmining/cap4/4.3_textminig.pl
  22. +4 −0 textmining/cap4/4.4_textmining.pl
  23. +18 −0 textmining/cap4/4.5_textmining.pl
  24. +10 −0 textmining/cap4/4.6_textmining.pl
  25. +47 −0 textmining/cap4/4.7_textmining.pl
  26. +12 −0 textmining/cap4/4.8_textmining.pl
  27. +62 −0 textmining/cap5/cap.5.4_sample5.5_cosene.pl
  28. +6 −0 textmining/cap5/cap5-countword-pl.txt
  29. +43 −0 textmining/cap5/cap5.4_productotwovector.pl
  30. +17 −0 textmining/cap5/length_vector.pl
  31. +23 −0 textmining/cap5/producttwovector.pl
  32. +12 −0 textmining/chapter9_acme_umlautify.pl
  33. +55 −0 textmining/count_characters_text.pl
  34. +27 −0 textmining/fall_word_richness.pl
  35. +23 −0 textmining/flip.pl
  36. +45 −0 textmining/perl/ADVANCED/Data-BT-PhoneBill-Call.pm
  37. +36 −0 textmining/perl/ADVANCED/GLOB.PL
  38. +25 −0 textmining/perl/ADVANCED/Some-Module.pm
  39. +20 −0 textmining/perl/ADVANCED/export.pl
  40. +67 −0 textmining/probabilityconditional.pl
  41. +27 −0 textmining/textminingtwitter.pl
  42. +58 −0 textmining/thebagofwords.pl
  43. +46 −0 textmining/twitterperl.pl
  44. +2 −2 twitter/perl/date-calc.pl
  45. +2 −2 twitter/perl/diad.pl
  46. +2 −2 twitter/perl/directtoupdate.pl
  47. +2 −2 twitter/perl/search.pl
  48. +2 −2 twitter/perl/twitter.pl
View
5 .gitignore
@@ -0,0 +1,5 @@
+
+*.Rout
+*~
+*.csv
+*.nosubir
View
24 DBformatCsvTsv/csvToJson.rb
@@ -0,0 +1,24 @@
+require 'csv'
+require 'json'
+
+options = Hash.new
+options[:col_sep] = ';'
+options[:headers] = false
+
+
+tweet = CSV.read('data/manyeye.csv', options).to_a
+
+#tweetheader = tweet.headers
+tweetheader = ['source','location','followers_count']
+tweets = Array.new
+
+
+tweet.to_a.each { |row|
+ tweetHash = Hash.new
+ tweetHash['source'] = row[0]
+ tweetHash['location'] = row[1]
+ tweetHash['followers_count'] = row[2]
+ tweets.push(tweetHash)
+}
+puts tweets.to_json
+
View
23 dotcloud-test/debug.txt
@@ -1,23 +0,0 @@
-dotcloud logs kintrini.www
-
-create account
-sudo easy_install pip && sudo pip install dotcloud
-dotcloud setup
-dotcloud create kintriniapp
-
-create local directory dotcloudtest
-on dotcloudtest put dotcloud.yml
-and a directory helloperl
-dotcloudtest
-|---dotcloud.yml
-|---helloperl
-|---|---app.psgi
-|---|---Makefile.PL
-|---|---myapp.pl
-|---|---static
-
-dotcloud push kintrini .
-or
-dotcloud push kintrini dotcloudtest
-
-
View
5 textmining/README.txt
@@ -0,0 +1,5 @@
+
+
+Apuntes de Textmining with Perl
+Book: Practical Text Mining with Perl
+http://www.amazon.com/Practical-Mining-Series-Methods-Applications/dp/0470176431
View
68 textmining/bigram.pl
@@ -0,0 +1,68 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+# Counts letter bigrams (pairs of adjacent letters inside a word) in the
+# text file named by the first command-line argument, then prints the ten
+# most frequent bigrams followed by the total number of bigrams counted.
+#
+# Steps applied to every input line:
+# 1. replace every non-letter (hyphens included) with a space
+# 2. collapse runs of whitespace into one space
+# 3. convert upper-case letters to lower case
+# 4. split the text into words
+# 5. split each word into letters
+# 6. join every pair of adjacent letters into a bigram and count it
+
+my %freq;         # bigram => number of occurrences
+my $count = 0;    # total number of bigrams seen
+
+# The input file is received as the first command-line argument. The
+# argument is checked first because a three-argument open with an undefined
+# name would open an anonymous temporary file instead of failing.
+my $file = $ARGV[0];
+defined $file or die "no existe archivo \n";
+open(my $fh, '<', $file) or die "no existe archivo \n";
+
+while (my $linea = <$fh>) {
+    $linea =~ s/[^a-zA-Z]/ /g;
+    $linea =~ s/\s+/ /g;
+    # lc returns the converted copy, so it has to be assigned back.
+    $linea = lc $linea;
+
+    # split ' ' also discards the leading space that step 1 can create.
+    foreach my $word (split ' ', $linea) {
+        my @letters = split //, $word;
+
+        # One bigram starts at every letter except the last one.
+        foreach my $i (0 .. $#letters - 1) {
+            my $bigram = $letters[$i] . $letters[$i + 1];
+            ++$freq{$bigram};
+            ++$count;
+        }
+    }
+}
+close $fh;
+
+# Print the ten most frequent bigrams.
+my $j = 0;
+foreach my $bigram (sort byReverseValues keys %freq) {
+    print "$bigram: $freq{$bigram}\n";
+    $j++;
+    last if $j == 10;
+}
+
+# Total.
+print "Total de numero de bigramas: $count \n";
+
+# Sort comparator: the most frequent bigram comes first; bigrams with the
+# same count are ordered alphabetically.
+sub byReverseValues {
+    return $freq{$b} <=> $freq{$a} || $a cmp $b;
+}
+
View
8 textmining/cap2textmining.pl
@@ -0,0 +1,8 @@
+#!/usr/bin/perl -w
+use strict;
+use common::sense;
+
+#jcmm986 - 0.0.0 -
+
+# Compiled pattern for the word "cat" surrounded by single spaces. It is
+# declared with my because "use strict" rejects undeclared variables, and
+# built with qr// so that the pattern itself is stored instead of being
+# matched against $_ right away.
+my $expr = qr/ cat /;
+
View
11 textmining/cap3/3.10_textmining.pl
@@ -0,0 +1,11 @@
+#As noted in problem 2.9.b, at present, there are plenty of DNA sequences available to the public
+#at the National Center for Biotechnology Information (NCBI) at its Web page: http://www.ncbi.nlm.nih.gov/ [81].
+
+#Since DNA is text (though its words and grammar are mostly unknown), it makes sense to use Perl and
+#regexes to analyze DNA, which is exactly the point made in Perl for Exploring DNA by Mark LeBlanc
+#and Betsey Dexter Dyer [70]. Note that these authors also enjoy word games, and they introduce the
+# idea of text patterns in DNA by analyzing letter patterns in English words.
+# Even without a background in biology, the book is quite readable, and I recommend it.
+
+#For this problem, get a copy of LeBlanc and Dyer’s book and see how they use Perl for DNA pattern finding.
+# With the data of the NCBI, perhaps you can discover something notable.
View
131 textmining/cap3/3.3_textmining.pl
@@ -0,0 +1,131 @@
+#Discusses word ANAGRAMS,
+
+#which are two (or more) words using the same group of letters, but with different orders.
+#For example, algorithm is an anagram of logarithm since the former is a permutation of the latter.
+#Instead of allowing all permutations,
+#one challenge is finding word anagrams limited to only certain types of permutations,
+#A - The easiest permutation to check is reversing the letters because there is already a Perl function to do this, reverse.
+#examples of which are given below. Examples can be found by using a word list, for example, Grady Ward’s CROSSWD.TXT from the Moby Word Lists [123].
+#Find all the words that are also words when read backwards.
+#Hint: One way to do this is to create a hash, say %list, where its keys are the words in the word list.
+#Then loop through the keys checking to see if the reversal of the word is also a key, as done in code sample 3.36.
+#Note that this also finds palindromes, that is, words that are the same backwards as forwards, for example, deified.
+#B - Another simple permutation is taking the last letter and putting it first (sometimes called a rotation).
+#Find words for which this rotation is also a word. For example, rotating trumpets produces strumpet,
+#or rotating elects produces select.
+#Hint: Use the function rotate in code sample 3.37 instead of reverse in code sample 3.36.
+
+ #sub rotate {
+ # my $word = $_[0];
+ # my @letters = split(//, $word);
+ # unshift(@letters, pop(@letters));
+ # return join('', @letters);
+ #}
+
+# C - Create a function that is a rotation in the opposite sense of rotate in code sample 3.37.
+#Then find all words that are still words under this new rotation. For example, rotating swear
+#in this way produces wears. Question: how does this list compare with the list from problem 3.3.b?
+
+#______________________________SPANISH_____________________________________
+
+#ANAGRAMAS,
+#Los anagramas son dos o más palabras usando el mismo grupo de letras, pero con diferente orden.
+#Por ejemplo, algoritmo es un anagrama de logaritmo. A pesar de permitir todas las permutaciones,
+#una forma es encontrar palabras anagramas limitar por ciertos tipos de permutaciones,
+#Un ejemplo puede ser encontrado usando una lista de palabras,
+#como Grady Ward’s CROSSWD.TXT de la lista de Moby.
+
+#A - la forma más fácil de revisar la permutación es revertiendo las letras, porque ya existe una función en perl
+#que hace esto, reverse. Encuentra todas las palabras que son tambien palabras cuando son leidas al reves.
+#Pista: Una forma para hacer esto es crear un hash, llamado %list, donde sus llaves son las palabras en la lista
+#de palabras. Entonces recorre las llaves chequeando para ver si la palabra al revés es entonces una llave,
+#Como en el codigo de ejemplo 3.36.
+#Note que esto tambien encuentra PALINDROMES,
+#que son palabras que se leen igual al revés que de forma normal, por ejemplo deified.
+
+ #foreach $x (sort %list) {
+ # if ( exists($list{reverse($x)}) ) {
+ # print "$x\n";
+ # }
+ #}
+
+#B - Otra simple permutacion es tomando la ultima letra y colocandola de primero (algunas veces llamada rotación).
+#encuentra palabras en las cuales esta rotación es tambien una palabra por ejemplo, rotando trumpets se produce strumpet,
+#o rotando elects se produce select.
+#Pista: Usa la función rotate en el codigo de ejemplo 3.37 así como tambien de reverse en el codigo de ejemplo 3.36
+#Code Sample 3.37: A function to move the last letter of a word to the front for problem 3.3.b.
+
+#sub rotate {
+# my $word = $_[0];
+# my @letters = split(//, $word);
+# unshift(@letters, pop(@letters));
+# return join('', @letters);
+#}
+
+#C - Crea una funcion que se llame rotacion, en opuesto sentido de rotate en el ejemplo de codigo del 3.37
+#Encuentra todas las que todavia son palabras en esta nueva rotación, por ejemplo rotando swear
+#En esta manera produce wears. Pregunta: Como esta lista compara con la lista del problema 3.3b?
+
+#leyendo CROSSWD.txt
+
+use strict;
+use warnings;
+
+# Solves exercises 3.3 A, B and C with the word list CROSSWD.txt: it prints
+# the words that are still words when reversed (A), when the last letter is
+# moved to the front (B) and when the first letter is moved to the end (C).
+
+# Word sets. Only the keys matter; every value is undef.
+my %list;          # every word read from CROSSWD.txt
+my %palindrome;    # words whose reversal is also in the list
+my %rotate;        # words that are still words after rotate()
+my %rotacion;      # words that are still words after rotacion()
+
+# Read CROSSWD.txt, one word per line.
+open(my $words_fh, '<', 'CROSSWD.txt') or die "CROSSWD.txt: $!\n";
+
+while (my $line = <$words_fh>) {
+    chomp $line;
+    $list{$line} = undef;
+}
+
+close $words_fh;
+
+print "Ejercicio A \n";
+# Iterate over the keys only: sorting %list itself would also visit the
+# undef values.
+foreach my $word (sort keys %list) {
+    # scalar forces reverse to reverse the characters of the word.
+    my $reversed = scalar reverse $word;
+    if (exists $list{$reversed}) {
+        print "$word : $reversed\n";
+        $palindrome{$word} = undef;
+    }
+}
+print "Total palindromes: ". scalar (keys (%palindrome ))."\n";
+print "Total palabras CROSSWD.txt: " . scalar (keys (%list))."\n";
+
+print "Ejercicio B \n";
+
+# Moves the last letter of a word to the front, e.g. "trumpets" becomes
+# "strumpet". Takes the word as its only argument and returns the rotated
+# string; a word shorter than two letters is returned unchanged.
+sub rotate {
+    my ($word) = @_;
+    return $word if length($word) < 2;
+    return substr($word, -1) . substr($word, 0, -1);
+}
+
+foreach my $word (sort keys %list) {
+    my $rotated = rotate($word);
+    if (exists $list{$rotated}) {
+        print "$word : $rotated\n";
+        $rotate{$word} = undef;
+    }
+}
+print "Total rotate: ". scalar (keys (%rotate ))."\n";
+print "Total palabras CROSSWD.txt: " . scalar (keys (%list))."\n";
+
+print "Ejercicio C \n";
+
+# Opposite of rotate(): moves the first letter of a word to the end, e.g.
+# "swear" becomes "wears". Takes the word as its only argument and returns
+# the rotated string; a word shorter than two letters is returned unchanged.
+sub rotacion {
+    my ($word) = @_;
+    return $word if length($word) < 2;
+    return substr($word, 1) . substr($word, 0, 1);
+}
+
+foreach my $word (sort keys %list) {
+    my $rotated = rotacion($word);
+    if (exists $list{$rotated}) {
+        print "$word : $rotated\n";
+        $rotacion{$word} = undef;
+    }
+}
+print "Total palindromes: ". scalar (keys (%palindrome ))."\n";
+print "Total rotate: ". scalar (keys (%rotate ))."\n";
+print "Total rotacion: ". scalar (keys (%rotacion ))."\n";
+print "Total palabras CROSSWD.txt: " . scalar (keys (%list))."\n";
View
67 textmining/cap3/3.4_textmining.pl
@@ -0,0 +1,67 @@
+#Using a word list, for example, Grady Ward’s CROSSWD.TXT from the Moby Word Lists [123],
+#find all the words where every letter appears exactly twice using a regex. For example,
+#this is true of the word hotshots.
+#This property is called an isogram. For more information on these, see section 29 and figure
+# 29c of Ross Eckler’s Making the Alphabet Dance
+
+#First hint: Sort the letters of each word into alphabetical order,
+#then try to create a regex that matches pairs of letters.
+#Note that /^((\w)\1)+$/ seems promising, but does not work.
+#Second hint: define a pair of letters regex using qr// as shown below.
+$pattern = qr/(\w)\1/;
+
+#Then use the regex /^$pattern+$/. This regex allows false positives (describe them).
+#Is there a simple way to correct this?
+
+
+#____________SPANISH______________
+
+#USANDO una lista de palabras, por ejemplo Grady ward's CROSSWD.TXT de la lista de palabras Moby,
+#Encuentra todas las palabras donde cada letra aparece exactamente 2 veces usando una expresión regular,
+#por Ejemplo, esto es verdadero en la palabra hotshots
+#Esta propiedad es llamada Isograma, para mas información de esto, ver la seccion 29 y figura 29c
+
+#PRIMERA PISTA: Organiza las letras de cada palabra en orden alfabetico,
+#Entonces intenta crear una expresion regular que extraiga solo par de letras,
+#Note que /^((\w)\1)+$/ parece prometer, pero esto no funciona.
+#SEGUNDA PISTA: define un par de letras en regex usando qr// como se muestra a continuacion.
+#$pattern = qr/(\w)\1/;
+#Entonces usa la expresion regular /^$pattern+$/. esta expresion regular permite falsos positivos (descríbelos)
+#Existe una manera simple de corregir esto?
+
+use strict;
+use warnings;
+
+# Prints every word of CROSSWD.txt in which each letter appears exactly
+# twice, followed by the number of such words and the size of the list.
+
+my %list;        # every word read from CROSSWD.txt (the values are unused)
+my %isograma;    # words in which every letter appears exactly twice
+
+open(my $words_fh, '<', 'CROSSWD.txt') or die "CROSSWD.txt: $!\n";
+
+while (my $line = <$words_fh>) {
+    chomp $line;
+    $list{$line} = undef;
+}
+
+close $words_fh;
+
+# Returns the word when every one of its letters appears exactly twice
+# (e.g. "hotshots"); returns undef otherwise.
+sub isogram {
+    my $word = shift;
+
+    # Sorting the letters puts equal letters next to each other.
+    my $sorted = join '', sort(split(//, $word));
+
+    # One pair of equal letters that is NOT followed by the same letter
+    # again. Without the look-ahead a letter occurring four, six, ... times
+    # (sorted "aaaa") would be accepted as well.
+    my $pair = qr/(\w)\1(?!\1)/;
+
+    return $sorted =~ /\A$pair+\z/ ? $word : undef;
+}
+
+# Iterate over the keys only: sorting %list itself would also visit the
+# undef values.
+foreach my $word (sort keys %list) {
+    # Test the result directly; using it as a hash key would turn undef
+    # into the empty string.
+    my $found = isogram($word);
+    next unless defined $found;
+    print "$word : $found\n";
+    $isograma{$word} = undef;
+}
+
+print "Total isogramas: ". scalar (keys (%isograma ))."\n";
+print "Total palabras CROSSWD.txt: " . scalar (keys (%list))."\n";
View
155 textmining/cap3/3.5_textmining.pl
@@ -0,0 +1,155 @@
+#One way to associate a numeric value to a word is as follows.
+# Let A=1, B=2, C=3, …, and Z=26, then for a given word,
+#sum up its letter values, for example, cat produces 3+1+20,
+#or 24. This method is sometimes used in word puzzles, for example,
+#see section 59 of Ross Eckler’s Making the Alphabet Dance
+# the goal is to write a function that takes a word and returns its number.
+
+#Code sample 3.38 shows one way to do this for all the numerical values at once using a hash
+# of hashes. To figure out how the code works, refer back to section 3.8.2.
+#The function ord changes an ASCII character into a number,
+# which makes it easy to convert a to 1, b to 2, and so forth.
+# The function map applies a function defined with $_ as its argument
+# to every entry in an array. For more information on this command,
+# try looking it up online at http://perldoc.perl.org/ [3].
+# Finally, note that using an array of hashes is another approach to this task.
+
+#Code Sample 3.38: Assuming that WORDS is a filehandle to word list,
+# this code finds all words having the same numerical value using the procedure given in problem 3.5.
+
+# $baseline = ord('a')-1;
+
+# while (<WORDS>) {
+ # chomp;
+ # @letters = split(//);
+ # @values = map(ord($_)-$baseline, @letters);
+ # $total = 0; foreach $x (@values) { $total += $x; }
+ # push( @{$list{$total}}, $_);
+
+# }
+
+# foreach $value (sort {$a <=> $b} keys %list) {
+ # print "$value\n";
+ # foreach $word ( @{$list{$value}} ) {
+ # print "$word ";
+ # }
+ # print "\n\n";
+# }
+
+#A - Perhaps this problem can be a start of a new type of pseudoscience.
+#For your name, find out its value, then examine the words that share this
+#value to discover possible clues to your personality (or love life, or career paths,…).
+#For example, the name Roger has the value 63, which is shared by acetone, catnip, and quiche.
+#Not surprisingly, these words describe me quite well
+
+#B - Another numerology angle arises by concatenating the letter values together
+#to form a string. For example, Roger becomes 18157518. It can happen that some numbers are
+#associated with more than one word. For example, abode and lode both have the number 121545.
+#For this problem write a Perl program that finds all such words. See the article
+#Concatenating Letter Ranks [13] for more information.
+#____________SPANISH______________
+
+#Una manera para asociar un valor numerico a una palabra es como sigue
+#asignemos A=1, B=2, C=3, …, and Z=26, entonces para una palabra dada
+#suma los valores de las letras, por ejemplo cat produce 3+1+20,
+#o 24, Este metodo a veces es usado en los rompecabezas, por ejemplo,
+#ver seccion 59 de Ross Eckler’s Making the Alphabet Dance
+#La meta es escribir una funcion que tome una palabra y retorne su numero asociado.
+
+#muestra la manera de hacer esto para todos los valores numericos usando un hash de hashes
+#para ver como el codigo funciona, ver seccion 3.8.2
+#La funcion ord cambia a caracteres ASCII en un numero,
+#lo cual hace fácil convertir A á 1, B a 2, y así sucesivamente.
+#La funcion map aplica una funcion definida con $_ como su argumento,
+#a cada elemento de un arreglo. Para mas información de este comando,
+#intenta buscar en linea en http://perldoc.perl.org/ [3].
+#finalmente, note que usando un arreglo de hashes es otra aproximacion a esta tarea.
+
+#Codigo Ejemplo 3.38: Asumiendo que WORDS es un manejador de archivo con la lista de palabras,
+# este codigo encuentra todas las palabras que tienen el mismo
+#valor numerico usando el procedimiento dado en el problema 3.5
+
+#$baseline = ord('a')-1;
+
+#while (<WORDS>) {
+# chomp;
+# @letters = split(//);
+# @values = map(ord($_)-$baseline, @letters);
+# $total = 0; foreach $x (@values) { $total += $x; }
+# push( @{$list{$total}}, $_);
+#}
+
+#foreach $value (sort {$a <=> $b} keys %list) {
+# print "$value\n";
+# foreach $word ( @{$list{$value}} ) {
+# print "$word ";
+# }
+# print "\n\n";
+#}
+
+#A - Quizas este problema puede iniciar un nuevo tipo de pseudociencia.
+#Para tu nombre, encuentra estos valores, entonces examina las palabras que comparten estos
+#valores para descubrir posibles pistas de tu personalidad (o vida amorosa, o carrera profesional, ...)
+#Por ejemplo, el nombre Roger tiene el valor 63, el cual es compartido por acetone, catnip, y quiche.
+#No sorpresivamente, estas palabras me describen muy bien.
+
+#B - Otro punto de vista numerologico surge por la concatenacion de los valores de las letras juntos.
+#para formar una cadena. Por ejemplo, Roger se convierte en 18157518. Puede suceder que algunos numeros esten
+#asociados con mas de una palabra. Por ejemplo abode y lode ambos tienen el numero 121545.
+#Para este problema escribe un programa de perl que encuentre todas estas palabras. ver el articulo
+#Rangos de Concatenacion de letras para mas información.
+
+use strict;
+use warnings;
+
+# Groups the words of CROSSWD.txt by numeric value (a=1, b=2, ..., z=26,
+# summed over the letters) and prints every value followed by its words.
+
+# Letter sum => reference to the array of words that have that sum.
+my %list;
+my %isograma;    # not used yet
+
+open(my $words_fh, '<', 'CROSSWD.txt') or die "CROSSWD.txt: $!\n";
+
+while (my $line = <$words_fh>) {
+    chomp $line;
+    # Group the words by value, as code sample 3.38 does. The print loop
+    # below dereferences every value of %list as an array, so storing undef
+    # per word left it with nothing to print.
+    push @{ $list{ falsepositive($line) } }, $line;
+}
+
+close $words_fh;
+
+# Returns the word when its sorted letters consist only of pairs of equal
+# letters, undef otherwise. A letter occurring four, six, ... times is
+# accepted too, so the result can be a false positive for "every letter
+# appears exactly twice".
+sub isogram {
+    my $word = shift;
+    my $sorted = join '', sort(split(//, $word));
+    my $pattern = qr/(\w)\1/;
+    return $sorted =~ /^$pattern+$/ ? $word : undef;
+}
+
+# Returns the numeric value of a word: the sum of ord(letter) - 96 over its
+# letters, which gives a=1 ... z=26 for lower-case ASCII letters.
+sub falsepositive {
+    my $word = shift;
+    my $baseline = ord('a') - 1;    # 96
+    my $total = 0;
+    $total += ord($_) - $baseline for split(//, $word);
+    return $total;
+}
+
+# TODO: unfinished. So far it only splits the word into letters; see the
+# notes at the end of the file for the intended check.
+sub isogramatrue {
+    my $word = shift;
+    my $baseline = ord('a') - 1;    # 96
+    my @letters = split(//, $word);
+}
+
+# Print every numeric value, in ascending order, followed by its words.
+foreach my $value (sort { $a <=> $b } keys %list) {
+    print "$value\n";
+    foreach my $word ( @{ $list{$value} } ) {
+        print "$word ";
+    }
+    print "\n\n";
+}
+
+# TODO: clean up the listing of exercise 3.4 using the letter weights:
+# take the distinct letters of a word, add up their numeric values and
+# multiply the sum by 2. If the result equals the numeric value computed for
+# the whole word, the word is NOT a false positive and it is an isogram.
+# Two functions are planned: converttonumber, converttonumberletters.
View
72 textmining/cap3/3.6_textmining.pl
@@ -0,0 +1,72 @@
+#3.6 - Transaddition is the following process: take a word, add a letter such that all the letters can be
+#rearranged to form a new word. For example, adding the letter t to learn produces antler (or learnt or rental).
+# A transdeletion is the removal of a letter so that what remains can be rearranged into a word, for example,
+# removing l from learn produces earn (or near). For an extensive discussion on these two ideas, see sections 41
+# and 49 of Ross Eckler’s Making the Alphabet Dance
+
+#Code sample 3.39 shows how to take a word and find what words can be found by adding a letter and then rearranging all
+# of them. Starting with this code, try changing it so that it can find transdeletions instead. Assume that WORDS is
+# the filehandle for a word list.
+#Code Sample 3.39: Code to find all transadditions of a given word. For problem 3.6.
+
+#while (<WORDS>) {
+# chomp;
+# $key = join('',sort(split(//, $_)));
+# if ( exists($list{$key}) ) {
+# $list{$key} .= ",$_";
+# } else {
+# $list{$key} = $_;
+# }
+#}
+
+# Transaddition
+
+#$word = $ARGV[0];
+#@letters = split(//, $word);
+#foreach $x ('a' .. 'z') {
+# @temp = @letters;
+# push(@temp, $x);
+# $key = join('', sort(@temp));
+# if ( exists($list{$key}) ) {
+# print "$list{$key}\n";
+# }
+#}
+
+#___________________SPANISH_____________________________
+
+#3.6 - TRANSADICIÓN ES el siguiente proceso: toma una palabra, añade una letra como esta, todas las letras pueden ser
+#reorganizadas para formar una nueva palabra.
+# Por ejemplo, añadiendo la letra t a learn se produce antler (o learnt o rental).
+# UNA TRANSELIMINACION ES la eliminación de una letra entonces el resto resultante puede ser reorganizado en una nueva palabra,
+# Por ejemplo, eliminando l de learn produce earn (or near).
+# Para una extensa discusión de estas dos ideas, ver las secciones 41
+# and 49 of Ross Eckler’s Making the Alphabet Dance
+
+#Code de ejemplo 3.39 muestra como tomar una palabra y encontrar que palabras
+# pueden ser halladas añadiendo una letra a ellas y reorganizandolas todas
+# Iniciando con este codigo, intenta cambiarlo y encontrar en él una transeliminacion dentro de él.
+# Asume que WORDS is el manejador de archivos para una lista de palabras
+#Code Sample 3.39: Codigo para encontrar todas las transadiciones de una palabra dada. For problem 3.6.
+
+#while (<WORDS>) {
+# chomp;
+# $key = join('',sort(split(//, $_)));
+# if ( exists($list{$key}) ) {
+# $list{$key} .= ",$_";
+# } else {
+# $list{$key} = $_;
+# }
+#}
+
+# Transaddition
+
+#$word = $ARGV[0];
+#@letters = split(//, $word);
+#foreach $x ('a' .. 'z') {
+# @temp = @letters;
+# push(@temp, $x);
+# $key = join('', sort(@temp));
+# if ( exists($list{$key}) ) {
+# print "$list{$key}\n";
+# }
+#}
View
80 textmining/cap3/3.7_textmining.pl
@@ -0,0 +1,80 @@
+# 3.7 - Lewis Carroll created the game called Doublets, where the goal is to transform one word into another
+# (of the same length) by changing one letter at a time, and such that each intermediate step is itself a word.
+# For example, red can be transformed into hot as follows: red, rod, rot, hot.
+
+# One approach to this is to create a word network that shows all the one-letter-change linkages.
+# The programming task of creating and storing such a network in a (complex) data structure is
+# challenging because the network can be quite large (depending on the number of letters), and it is possible
+# to have loops in the network (the network is not a tree in the graph-theoretic sense).
+
+# This problem presents an easier task: given one word, find all other words that are only a one-letter
+# change from the given word. For example, the words deashed, leached, and leashes are all exactly one
+# letter different from leashed.
+
+# Here is one approach. Create a hash from a word list (using, for example, Grady Ward’s CROSSWD.TXT
+# from the Moby Word Lists [123]). Then take the given word, replace the first letter by each letter of
+# the alphabet. Check each of these potential words against the hash. Then do this for the second letter,
+# and the third, and so forth. See code sample 3.40 to get started.
+
+# Code Sample 3.40: Hint on how to find all words that are one letter different from a specified word. For problem 3.7.
+
+# $len = length of the word in $ARGV[0]
+# The keys of %list are from a word list
+
+# for ($i = 0; $i < $len; ++$i) {
+ # foreach $letter ( 'a' .. 'z' ) {
+ # $word = $ARGV[0];
+ # substr($word, $i, 1) = $letter;
+
+ # if ( exists($list{$word}) and $word ne $ARGV[0]) {
+ # print "$word\n";
+ # }
+ # }
+# }
+
+# Finally, for more information on Doublets, see chapter 22 of Tony Augarde’s The Oxford Guide to Word Games [5].
+# Moreover, sections 42 through 44 of Ross Eckler’s Making the Alphabet Dance [41] give examples of word networks.
+
+
+#______________________SPANISH___________________________
+
+# 3.7 - Lewis Carroll creó el juego llamado Doublets, donde la meta es transformar una palabra en otra
+# (del mismo tamaño) cambiando una letra a la vez, de modo que cada paso intermedio sea en sí mismo una palabra.
+# Por Ejemplo, red puede ser transfomado en hot como se muestra a continuación: red, rod, rot, hot.
+
+# Una aproximación para hacer esto es crear una red de palabras que muestre todo, el unico-cambio-de-letra es el enlace.
+# La tarea de programar de crear y almacenar como una red en una (compleja) estructura de data
+# es un reto porque la red puede ser muy grande (dependiendo del numero de letras), y es posible
+# tener ciclos en la red (la red no es un arbol en el sentido de teoria de grafos).
+
+# Este problema presenta una fácil tarea: dada una palabra, encuentra todas las otras
+# palabras que solo tienen una letra cambiada a partir de la palabra dada.
+# Por Ejemplo, las palabras deashed, leached, y leashes son todas exactamente una letra diferente
+# de leashed
+
+# Aquí una aproximación. Crea un hash a partir de una lista de palabras (usando, por ejemplo, Grady Ward’s CROSSWD.TXT
+# from the Moby Word Lists [123]). Entonces toma la palabra dada,
+# remplaza la primera letra por cada una letra del alfabeto
+# Verifica cada una de estas potenciales palabras contra el hash. Entonces haz esto para la segunda letra,
+# y la tercera, y así consecutivamente. Ver codigo de ejemplo 3.40 para iniciar.
+
+# Code Sample 3.40: Hint on how to find all words that are one letter different from a specified word.
+# For problem 3.7.
+
+# $len = length of the word in $ARGV[0]
+# The keys of %list are from a word list
+
+# for ($i = 0; $i < $len; ++$i) {
+ # foreach $letter ( 'a' .. 'z' ) {
+ # $word = $ARGV[0];
+ # substr($word, $i, 1) = $letter;
+
+ # if ( exists($list{$word}) and $word ne $ARGV[0]) {
+ # print "$word\n";
+ # }
+ # }
+# }
+
+# Finally, for more information on Doublets, see chapter 22 of Tony Augarde’s The Oxford Guide to Word Games [5].
+# Moreover, sections 42 through 44 of Ross Eckler’s Making the Alphabet Dance [41] give examples of word networks.
+
View
39 textmining/cap3/3.8_textmining.pl
@@ -0,0 +1,39 @@
+# 3.8 - With HTML, it is possible to encode a variety of information by modifying the font in various ways.
+# This problem considers one such example. Section 3.6.1 shows how to compute word frequencies.
+# Given these frequencies, the task here is to convert them into font sizes, which are then used to write
+# an HTML page
+# Code sample 3.41 assumes that the hash %size contains font sizes in points for each word in Poe’s “The Black Cat.”
+# The HTML is printed to the file BlackCat.html. These font sizes are based on word counts using all
+# of Poe’s short stories, and $size{$word} was set to the function below.
+
+# int(1.5*log($freq)+12.5)
+
+# In this case, the frequencies went from 1 to 24,401, so this function reduces this wide range
+# of counts to a range appropriate for font sizes. Output 3.33 has the beginning of the HTML that is produced
+# by this code.
+
+# For a text of your own choosing, create a word frequency list, and then modify the frequencies to create font sizes.
+
+#______________SPANISH__________________________
+
+# 3.8 - Con HTML, es posible codificar una variedad de informacion por
+# la modificacion de la fuente(letra font) en varias maneras.
+# Este problema considera uno como ejemplo. Section 3.6.1 muestra como procesar frecuencias de palabras.
+# dadas estas frecuencias, la tarea aqui es convertirlos en tamaños de fuentes, las cuales son usadas para escribir
+# una pagina HTML
+
+# Code sample 3.41 asume que el hash %size contiene el tamaño de las letras(fuentes)
+# en puntos para cada palabra en Poe’s “The Black Cat.”
+# El HTML es impreso a el archivo BlackCat.html. estos tamaños de fuentes estan basados
+# en el conteo de palabras usado en todo Poe’s short stories, y $size{$word}
+# fue configurado para la funcion a continuacion.
+
+# int(1.5*log($freq)+12.5)
+
+# En este caso, la frecuencia dada por 1 to 24,401, entonces esta funcion reduce este amplio rango
+# de contadores a un rango apropiado para el tamaño de las fuentes.
+# salida 3.33 muestra el HTML que se genera por este codigo
+
+# Para un texto de tu escogencia, crea una lista de frecuencia de palabra, y entonces modifica
+# las frequencias para crear el tamaño de la fuente(letra).
+
View
55 textmining/cap3/3.9_textmining.pl
@@ -0,0 +1,55 @@
+#3.9 - shows how to find distinct words that have the same letters, but in different orders,
+#which are called anagrams. The same idea is applicable to numerals. For example, are there
+#many square numbers with anagrams that are also square numbers?
+#Examples are the squares 16,384 (equals 128^2), 31,684 (178^2), 36,481 (191^2), 38,416 (196^2),
+# and 43,681 (209^2), and all five of these five-digit numbers use the same digits.
+#This is called an anasquare.
+
+#Code Sample 3.41: Code to vary font size in an HTML document. For problem 3.8.
+
+# open(STORY, "The_Black_Cat.txt");
+# open(OUT, ">BlackCat.html") or die;
+# print OUT "<html>\n<body>\n<marquee>\n";
+
+# while(<STORY>) {
+ # chomp;
+ # @words = split(/\s+/);
+
+ # if ( /^$/ ) {
+ # print OUT "\n</marquee><marquee>\n";
+ # } else {
+ # foreach $x (@words) {
+ # $x =~ /([\w-]+('s)?)/;
+ # if ( exists($size{lc($1)}) ) {
+ # print OUT "<span style=\"font-size:",
+
+ # "$size{lc($1)}pt\">$x</span>\n";
+ # } else { print "Missing Value for $1\n"; }
+ # }
+ # }
+# }
+# close(STORY);
+# print OUT "</marquee>\n</body>\n</html>\n";
+
+# Output 3.33: A few lines from code sample 3.41.
+
+# <html>
+# <body>
+# <marquee>
+# <span style="font-size:24pt">FOR</span>
+# <span style="font-size:27pt">the</span>
+# <span style="font-size:21pt">most</span>
+
+# <span style="font-size:19pt">wild,</span>
+# <span style="font-size:21pt">yet</span>
+# <span style="font-size:21pt">most</span>
+# <span style="font-size:12pt">homely</span>
+
+# <span style="font-size:16pt">narrative</span>
+# <span style="font-size:24pt">which</span>
+
+# Although word anagrams are not that common, this is not true for anasquares.
+# See “Anasquares: Square anagrams of squares” [14] for a discussion of this.
+
+#_________________________SPANISH______________________________
+
View
0  textmining/cap3/3.pl
No changes.
View
113,809 textmining/cap3/CROSSWD.txt
113,809 additions, 0 deletions not shown
View
113,809 textmining/cap3/CROSSWD.txt.fic
113,809 additions, 0 deletions not shown
View
31 textmining/cap3/exercise3.1textmining.pl
@@ -0,0 +1,31 @@
+#!/usr/bin/perl
+#
+# Exercise 3.1: find all the words containing interior apostrophes in
+# Dickens's A Christmas Carol, i.e. words with an alphanumeric character on
+# each side of the apostrophe.  Some are familiar to today's reader (it's,
+# I'll) and some are not (thank'ee, sha'n't).  Print each such word together
+# with its frequency in the novel.
+use strict;
+use warnings;
+
+my $file = 'christmasscarol.txt';
+my %hash;    # word with interior apostrophe(s) => number of occurrences
+
+open(my $fh, '<', $file) or die "Cannot open $file: $!";
+
+while (my $linea = <$fh>) {
+    chomp $linea;
+    $linea = lc $linea;
+    $linea =~ s/[.,:;?"!()]//g;    # strip punctuation
+    $linea =~ s/--/ /g;            # a dash separates words: do not glue them
+
+    foreach my $value (split ' ', $linea) {
+        # Test the match itself.  After a failed match $1 can still hold the
+        # capture of an earlier successful match, so checking defined($1)
+        # may count the previous word a second time.  The repeated group
+        # keeps words with several apostrophes (sha'n't) in one piece.
+        if ($value =~ /(\w+(?:'\w+)+)/) {
+            $hash{$1}++;
+        }
+    }
+}
+close $fh;
+
+# Sorted so that the report is the same on every run.
+foreach my $keys (sort keys %hash) {
+    print "$keys: $hash{$keys}\n";
+}
View
28 textmining/cap4/4.1_textmining.pl
@@ -0,0 +1,28 @@
+#A fair coin is flipped until either HHH or THH is obtained.
+#If HHH occurs first, then Player A wins, and if THH occurs first, then Player B wins.
+#For example, the sequence HTTTTHH is a win for Player B because THH occurs in the last three flips,
+#but HHH does not appear. Although both sequences are equally likely when flipping a coin three times,
+#one of the two players is a favorite to win. Write a Perl program to simulate this process,
+#find who wins, and then estimate Player A’s probability of winning.
+#This problem is just one case of a game described by Walter Penney.
+#See pages 59–63 of John Haigh’s Taking Chances:
+#Winning with Probability [50] for a description of Penney’s game and how it has counterintuitive properties
+#!/usr/bin/perl -w
+use strict;
+my @ocurrence;
+
+# Placeholder: the body is still empty, so calling it does nothing.
+sub generateevent{
+
+}
+
+# Placeholder: the body is still empty, so calling it does nothing.
+sub flipped {
+
+}
+
+# Print the flips it is given.  (No winner is determined yet.)
+# Takes the list of outcomes as its arguments; the original
+# "my @ocurrence = shift" kept only the first one and dropped the rest.
+sub confirmwinner {
+    my @ocurrence = @_;
+
+    print @ocurrence;
+}
View
13 textmining/cap4/4.2_textminig.pl
@@ -0,0 +1,13 @@
+#4.2 Suppose two people are betting the outcome of a fair coin where
+#Player A loses a dollar if the flip is tails and otherwise wins a dollar.
+# A running tally of Player A’s net winnings or losses is kept where the initial
+# value is $0. For example, if the game starts off with HTTHT, then Player
+# A has $1, $0, -$1, $0, -$1, respectively.
+#Write a Perl simulation of this game for 20 tosses, then compute the proportion of
+# the flips where Player A is ahead or even. Then repeat this simulation 10,000 times.
+# The result will be 10,000 proportions each based on 20 flips. One guesses that since
+# the coin is fair, that Player A should be ahead about 50% of the time, but this is not true.
+# Surprisingly, the most probable scenario is that Player A is either ahead or behind the entire game.
+# See pages 64–69 of John Haigh’s Taking Chances: Winning with Probability [50] for a discussion of this process.
+# For a mathematical exposition, see section III.4 of
+# An Introduction to Probability Theory and Its Applications by William Feller
View
8 textmining/cap4/4.3_textminig.pl
@@ -0,0 +1,8 @@
+# [Requires a statistical package] Output 4.2 gives the frequencies of the
+# letters appearing in the fictional works A Christmas Carol and “The Black Cat.”
+# For this problem, focus on the former. The ranks of the letters in A Christmas
+# Carol are easy to assign since these values are already in numerical order;
+# that is, the letter on the first line (e) has rank 1, the letter on the second line (t)
+# has rank 2, and so forth. Using a statistical package such as R,
+# make a plot of the Log(Rank) vs. Log(Frequency) for the 26 letters.
+# How does this compare to figure 3.1? In your opinion, how well does Zipf’s law hold for letters?
View
4 textmining/cap4/4.4_textmining.pl
@@ -0,0 +1,4 @@
+#Modify program 4.4 to make it enumerate trigrams. Hint:
+#In the foreach loop that iterates over the array @words,
+#modify the join statement so that it takes three instead of two letters in a row.
+# A modification of the for loop’s ending condition is needed, too.
View
18 textmining/cap4/4.5_textmining.pl
@@ -0,0 +1,18 @@
+#4.5 [Mathematical] For four-letter words, equation 4.8
+#suggests that the events “first letter is a q” and “second letter is an u”
+# are dependent as language intuition suggests. However, how strong is this evidence?
+# This problem gives a quantitative answer to this.
+#The problem of independence of events can be solved with contingency tables.
+# There are several ways to do this, and this problem applies Fisher’s exact test.
+# Equation 4.11 shows the computation needed, which gives the probability of seeing
+#the counts in table 4.3 if independence were true. Since this answer is about six
+#in a billion, the reasonable conclusion is that the two events are dependent
+
+ #1st is q #1st not q #Row Sums
+#2nd is u 10 368 378
+#2nd not u 2 3306 3308
+#Column Sums 12 3674 3686
+
+#For this problem, find a statistics text that shows how to analyze categorical data.
+#Then look up Fisher’s exact test to see why it works. For example, see section 3.5.1
+# of Alan Agresti’s Categorical Data Analysis [2].
View
10 textmining/cap4/4.6_textmining.pl
@@ -0,0 +1,10 @@
+# 4.6 In section 4.4, the proportions of the letter e in 68 Poe stories are given.
+# Here are some steps to compute these values. First, download the five volumes from the Web,
+# and get rid of the initial and ending text so that just the titles and stories are left.
+# Second, although the titles are easy for a person to read, it helps to make them completely unambiguous.
+# One common way to add information to a text is by XML tags.
+# These work the same way as HTML tags except that they can stand for anything,
+# not just how to display a Web page. Here we put the story titles in between two title tags,
+# for example, <TITLE>The Black Cat</TITLE>. Third, scan these five files line by line using a while loop.
+# Finally, use code sample 4.3 as a start for counting the total number of letters (in $count)
+# and the number of e’s (in $count_e).
View
47 textmining/cap4/4.7_textmining.pl
@@ -0,0 +1,47 @@
+#4.7 [Mathematical] For some distributions,
+# the sample data can be summarized by a few sufficient statistics without loss of any information
+# about the population parameters. However, this assumes that the data values are really generated
+#by the assumed population distribution, which is rarely exactly true when working with a real data set.
+#Hence, in practice, reducing the data to sufficient statistics can lose information about how well
+#the population distribution fits the observed data.
+
+#Code Sample 4.3: Code sample for problem 4.6.
+
+#if (/<TITLE>(.*)<\/TITLE>/) {
+#$count = 0;
+#$title = $1;
+#$count_e = 0;
+#}
+#else {
+#$_=lc;
+#s/[^a-z]//g;
+#$count_e += tr/e/e/;
+#$count += length;
+#}
+
+#Suppose we assume that this sequence is generated by a coin,
+# where 1 stands for heads and 0 for tails. Assume that the probability of heads is p,
+# which we wish to estimate. Assuming that this model is true, then the sufficient
+# statistic for p is estimated by the number of 1’s divided by the number of flips,
+# which gives 17/25=68%
+
+#However, this data set does not look like it comes from flipping a coin because
+# the 0’s and 1’s tend to repeat. For this problem, compute the probability of getting
+#the data in equation 4.12 assuming that a coin with p=0.68 is, in fact, used.
+
+#If this probability is low, then the assumption of a biased coin model is cast into doubt.
+# However, reducing the data set to the sufficient statistic for p makes it impossible
+#to decide on the validity of this coin model; that is, information is lost by ignoring
+#the original data in favor of the estimate p=0.68.
+
+#Hint: see section 9.4.1 for one approach of estimating the probability of observing
+# equation 4.12 if seventeen 1’s and eight 0’s can appear in any order with equal probability.
+
+#For more on sufficient statistics see chapter 10 of Lee Bain and Max Engelhardt’s
+# Introduction to Probability and Mathematical Statistics [7]. In addition, the point
+#that the data set does have more information than the sufficient statistic is made in
+# section 8.7 of John A.Rice’s Mathematical Statistics and Data Analysis
+
+#For the normal distribution, the sample mean and sample standard deviation are
+# sufficient for the population mean and population standard deviation.
+#See theorem 7.1.1 of James Press’s Applied Multivariate Analysis [103] for a proof.
View
12 textmining/cap4/4.8_textmining.pl
@@ -0,0 +1,12 @@
+#4.8 To randomize the words in a story requires two steps.
+#First, they must be identified. Second, they are stored and
+# then permuted. The task of identifying the words is discussed in
+# section 2.4 (and see program 2.6). So here we focus on rearranging
+# them. For each word, store it in a hash using a string generated by
+# the function rand as follows.
+
+#$permutation{rand()} = $word;
+
+#Then print out the hash %permutation by sorting on its keywords
+#(either a numerical or an alphabetical sort works). Since the keywords are
+#randomly generated, the sort randomly permutes the values of this hash.
View
62 textmining/cap5/cap.5.4_sample5.5_cosene.pl
@@ -0,0 +1,62 @@
+# Angles are in degrees
+# Requires subroutines cosine() and dot()
+# Requires the existence of the hash %freq
+
+use Math::Trig; # Load all trig functions and pi
+
+# Pronouns whose counts make up each story's frequency vector.
+@pronouns = qw(he she him her his hers himself herself);
+
+# %freq is expected to exist already: story name => { word => count }.
+# Take the key list once so the names printed first and the rows/columns of
+# the matrix below are in the same order.
+my @stories = keys %freq;
+
+foreach my $story (@stories) {    # Print out the story names
+    print "$story\n";
+}
+
+print "\nCOSINE ANGLES\n\n";
+foreach my $story1 (@stories) {
+    # Fixed: the original read $freq{$storyl} (letter "l", not digit "1"),
+    # an undefined variable, so the first vector of every pair was empty.
+    my @vector1 = @{ $freq{$story1} }{@pronouns};
+
+    foreach my $story2 (@stories) {
+        my @vector2 = @{ $freq{$story2} }{@pronouns};
+
+        my $cos = cosine(\@vector1, \@vector2);
+        # Rounding can push the cosine of (nearly) parallel vectors a hair
+        # outside [-1, 1], where Math::Trig's acos() stops returning a
+        # plain real number; clamp before converting to degrees.
+        $cos = 1  if $cos > 1;
+        $cos = -1 if $cos < -1;
+
+        my $angle = acos($cos)/pi*180;
+        printf " %.1f", $angle;
+    }
+    print "\n";
+
+}
+
+
+# Cosine of the angle between two numeric vectors given as array references:
+# dot(u, v) / (|u| * |v|).  Returns the string 'Error' when the vectors do
+# not have the same number of elements.  Relies on the subroutine dot().
+sub cosine {
+    my ($u_ref, $v_ref) = @_;
+    my @u = @$u_ref;
+    my @v = @$v_ref;
+
+    # Guard clause: the angle only makes sense for vectors of equal size.
+    return('Error') if scalar(@u) != scalar(@v);
+
+    my $norm_u = sqrt(dot(\@u, \@u));
+    my $norm_v = sqrt(dot(\@v, \@v));
+
+    return(dot(\@u, \@v) / ($norm_u * $norm_v));
+}
+
+# Dot product of two numeric vectors passed as array references.
+# Returns the sum of the element-wise products (0 for two empty vectors).
+# For vectors of different length the product is undefined, so nothing is
+# returned (undef / empty list) -- explicitly, instead of whatever value the
+# failed length test used to leave behind as the implicit return value.
+sub dot {
+    my ($vector_ref1, $vector_ref2) = @_;
+
+    my @vector1 = @{$vector_ref1};    # Dereference pointer
+    my @vector2 = @{$vector_ref2};    # Dereference pointer
+
+    return if @vector1 != @vector2;   # Ensure vectors have same length
+
+    my $sum_cross = 0;
+    for my $i (0 .. $#vector1) {
+        $sum_cross += $vector1[$i] * $vector2[$i];
+    }
+    return $sum_cross;
+}
View
6 textmining/cap5/cap5-countword-pl.txt
@@ -0,0 +1,6 @@
+#5.2.1 Counting Letters in Poe with Perl
+#Deben ser eliminados los puntos.
+#ejecutar un programa que determine los diferentes caracteres usados en estas 4 historias
+
+
+my %freq;
View
43 textmining/cap5/cap5.4_productotwovector.pl
@@ -0,0 +1,43 @@
+#producto de dos vectores
+#tomando en cuenta su ángulo.
+
+# Two sample vectors with eight components each.
+@x = (
+    19, 9, 7, 13, 22, 0, 1, 2,
+);
+@y = (
+    33, 0, 17, 3, 32, 0, 1, 0,
+);
+
+
+# Dot product of two numeric vectors passed as array references.
+# Returns the sum of the element-wise products (0 for two empty vectors).
+# For vectors of different length the product is undefined, so nothing is
+# returned (undef / empty list) -- explicitly, instead of whatever value the
+# failed length test used to leave behind as the implicit return value.
+sub dot {
+    my ($vector_ref1, $vector_ref2) = @_;
+
+    my @vector1 = @{$vector_ref1};    # Dereference pointer
+    my @vector2 = @{$vector_ref2};    # Dereference pointer
+
+    return if @vector1 != @vector2;   # Ensure vectors have same length
+
+    my $sum_cross = 0;
+    for my $i (0 .. $#vector1) {
+        $sum_cross += $vector1[$i] * $vector2[$i];
+    }
+    return $sum_cross;
+}
+
+# Cosine of the angle between the two sample vectors.
+$answer = cosine(\@x, \@y);
+print 'Cosine = ' . $answer . "\n";
+
+# Cosine of the angle between two numeric vectors given as array references:
+# dot(u, v) / (|u| * |v|).  Returns the string 'Error' when the vectors do
+# not have the same number of elements.  Relies on the subroutine dot().
+sub cosine {
+    my ($u_ref, $v_ref) = @_;
+    my @u = @$u_ref;
+    my @v = @$v_ref;
+
+    # Guard clause: the angle only makes sense for vectors of equal size.
+    return('Error') if scalar(@u) != scalar(@v);
+
+    my $norm_u = sqrt(dot(\@u, \@u));
+    my $norm_v = sqrt(dot(\@v, \@v));
+
+    return(dot(\@u, \@v) / ($norm_u * $norm_v));
+}
View
17 textmining/cap5/length_vector.pl
@@ -0,0 +1,17 @@
+#!/usr/bin/perl
+#subrutina coseno
+#textmining
+
+#subrutina que retorna el tamaño de un vector
+# Sample vector and its Euclidean length.
+@vector = (
+    19, 9, 7, 13, 22, 0, 1, 2,
+);
+$length = vector_length(@vector);
+print 'Length of vector = ' . $length . "\n";
+
+# Euclidean length of a vector whose components are passed as the argument
+# list: the square root of the sum of the squared components.
+sub vector_length {
+    my @components = @_;
+    my $sum_of_squares = 0;
+
+    foreach my $component (@components) {
+        $sum_of_squares += $component * $component;
+    }
+
+    return sqrt($sum_of_squares);
+}
View
23 textmining/cap5/producttwovector.pl
@@ -0,0 +1,23 @@
+#multiplicando dos vectores suponiendo que tienen un ángulo 0
+#
+# Two sample vectors and their dot product.
+@x = (
+    19, 9, 7, 13, 22, 0, 1, 2,
+);
+@y = (
+    33, 0, 17, 3, 32, 0, 1, 0,
+);
+$answer = dot(\@x, \@y);
+print 'Dot product = ' . $answer . "\n";
+
+# Dot product of two numeric vectors passed as array references.
+# Returns the sum of the element-wise products (0 for two empty vectors).
+# For vectors of different length the product is undefined, so nothing is
+# returned (undef / empty list) -- explicitly, instead of whatever value the
+# failed length test used to leave behind as the implicit return value.
+sub dot {
+    my ($vector_ref1, $vector_ref2) = @_;
+
+    my @vector1 = @{$vector_ref1};    # Dereference pointer
+    my @vector2 = @{$vector_ref2};    # Dereference pointer
+
+    return if @vector1 != @vector2;   # Ensure vectors have same length
+
+    my $sum_cross = 0;
+    for my $i (0 .. $#vector1) {
+        $sum_cross += $vector1[$i] * $vector2[$i];
+    }
+    return $sum_cross;
+}
+
View
12 textmining/chapter9_acme_umlautify.pl
@@ -0,0 +1,12 @@
+#!/usr/bin/perl -w
+use strict;
+use common::sense;
+
+#jcmm986 - 0.0.0 -
+
+use Acme::Umlautify;
+
+# Class-method call instead of the indirect-object form
+# "new Acme::Umlautify", which perl can misparse.
+my $au = Acme::Umlautify->new;
+
+print $au->do('"Motley Crue" could have had way more umlauts, dude.'."\n");
+
View
55 textmining/count_characters_text.pl
@@ -0,0 +1,55 @@
+#!/usr/bin/perl
+
+#USAGE > perl count_characters_text.pl filename.txt
+#this program is case insensitive
+#this program counts all characters, not just letters.
+
+# The file to analyse is the first command-line argument.
+@ARGV or die "USAGE > perl count_characters_text.pl filename.txt\n";
+open(my $in, '<', $ARGV[0]) or die("$ARGV[0] not found");
+
+# $/ is the INPUT record separator; the empty string selects paragraph
+# mode, so each read returns one whole paragraph.
+$/ = "";
+
+our %freq;    # character => number of occurrences (read by byReverseValues)
+
+# Count the characters of the file.
+while (<$in>) {
+    chomp;
+    s/\n/ /g;    # Replace EVERY newline inside the paragraph by a space
+                 # (without /g only the first one was replaced).
+    $_ = lc;     # Fold everything to lower case
+
+    foreach my $char (split //) {
+        ++$freq{$char};
+    }
+}
+close $in;
+
+# Print every character by decreasing frequency and add up the letters.
+my $count = 0;    # Total number of letters a-z
+
+foreach my $char (sort byReverseValues keys %freq) {
+    if ('a' le $char and $char le 'z') {
+        $count += $freq{$char};
+    }
+    print "$char: $freq{$char}\n";
+}
+
+print "\nTotal numero de letras $count\n";
+
+# Frequency table for a-z; a letter that never occurs is reported as 0
+# instead of an undefined value.
+foreach my $letter ('a' .. 'z') {
+    my $times = $freq{$letter} || 0;
+    print "$letter: $times\n";
+}
+
+# sort comparator: orders characters by decreasing frequency (looked up in
+# the global %freq) and breaks ties alphabetically.
+sub byReverseValues {
+    # The comparator is called once per comparison, so the leftover debug
+    # print ("a:$a b:$b") flooded the output before the results; removed.
+    return $freq{$b} <=> $freq{$a} || $a cmp $b;
+}
+
+
View
27 textmining/fall_word_richness.pl
@@ -0,0 +1,27 @@
+#!/usr/bin/perl
+
+# Writes one CSV row per non-empty input line:
+#   running token count, running type (distinct word) count, tokens/types.
+open(my $in,  '<', 'texto.txt')       or die 'no existe archivo texto.txt';
+open(my $out, '>', 'salidatexto.csv') or die 'No se pudo crear archivo';
+
+my $tokens = 0;    # word occurrences seen so far
+my %freq;          # distinct word => occurrences
+
+while (<$in>) {
+    chomp;
+    $_ = lc;
+    s/[.,:;?"!()]//g;
+    s/--/ /g;    # a dash separates two words: do not glue them together
+
+    # split ' ' skips leading whitespace, so no empty "word" is counted.
+    my @words = split ' ';
+    next unless @words;    # Ignore empty lines (also avoids dividing by 0)
+
+    foreach my $x (@words) {
+        ++$tokens;
+        ++$freq{$x};
+    }
+    my $types = scalar keys %freq;
+    my $ratio = $tokens/$types;
+    print {$out} "$tokens, $types, $ratio\n";
+}
+
+# The original closed "IN", a handle that was never opened, and left the
+# input handle IND open.
+close $in;
+close $out or die 'No se pudo cerrar salidatexto.csv';
View
23 textmining/flip.pl
@@ -0,0 +1,23 @@
+#!/usr/bin/perl
+
+#USAGE > perl flip.pl value
+#Este programa simula la probabilidad de lanzar una moneda
+# $ARGV[0] de veces
+
+
+# Number of flips requested on the command line; it must be a positive
+# integer, otherwise the proportion below would divide by zero.
+my $flips = $ARGV[0];
+(defined $flips and $flips =~ /^\d+$/ and $flips > 0)
+    or die "USAGE > perl flip.pl value\n";
+
+my $count = 0;    # number of heads; stays 0 if no head ever comes up
+
+for my $n (1 .. $flips) {
+    if (rand() > 0.5) {
+        print 'H';
+        ++$count;
+    }
+    else {
+        print 'T';
+    }
+}
+
+
+# Empirical probability: share of heads among the $flips tosses.
+my $proportion = $count/$flips;
+
+print "\n". 'Probabilidad de que salga cara en '.$ARGV[0].' intentos ='.$proportion;
View
45 textmining/perl/ADVANCED/Data-BT-PhoneBill-Call.pm
@@ -0,0 +1,45 @@
+package Data::BT::PhoneBill::_Call;
+use strict;
+use warnings;
+
+# Field names, in the order in which new() expects the values.
+our @fields = qw(type installation line chargecard _date time
+                 destination _number _duration rebate _cost);
+
+# Create one read-only accessor per field by installing a closure in the
+# package's symbol table; equivalent to writing
+#   sub type { shift->{type} }
+# by hand for every name in @fields.
+for my $f (@fields) {
+    no strict 'refs';
+    *$f = sub { shift->{$f} };
+}
+
+# Constructor: new(@data) pairs the values positionally with @fields and
+# returns the blessed hash.  Fields without a value are undef.
+sub new {
+    my ($class, @data) = @_;
+    return bless { map { ($fields[$_] => $data[$_]) } 0..$#fields } => $class;
+}
+
+# A module must end in a true value or "use"/"require" of it fails.
+1;
+
+ # sub type { shift->{type} }
+ # sub installation { shift->{installation} }
+ # sub line { shift->{line} }
+
+
+
+
+
+ # sub installation { shift->[0] }
+ # sub line { shift->[1] }
+
+ # {
+ # my $seq = 3;
+ # sub sequence { $seq += 3 }
+ # }
+
+ # print $seq; # out of scope
+
+ # print sequence; # prints 6
+ # print sequence; # prints 9
+
+
+ # sub type { shift->[0] }
+ # sub installation { shift->[1] }
+ # sub line { shift->[2] }
+
+
View
36 textmining/perl/ADVANCED/GLOB.PL
@@ -0,0 +1,36 @@
+
+
+
+ # Assigning a reference to a typeglob aliases just the matching slot.
+ @b = (1,2,3,4);
+ *a = \@b;          # @a is now another name for @b
+
+ push @b, 5;
+ print @a;          # 12345
+
+ # However, only the array slot was aliased; the scalars stay separate:
+ $a = "Bye";
+ $b = "Hello there!";
+ print $a;          # Bye
+ print $b;          # Hello there!
+
+ # The type of the reference decides which slot of *a is replaced.
+ *a = \"Hello";
+ *a = [ 1, 2, 3 ];
+ *a = { red => "rouge", blue => "bleu" };
+
+ print $a;          # Hello
+ print $a[1];       # 2
+ print $a{"red"};   # rouge
+
+ # Accessing the parts of a glob.  *a{THING} only READS a slot and yields
+ # a reference to it; it cannot be assigned to, which is what the original
+ # "*a{ARRAY} = \@a;" lines attempted.
+ $array_ref  = *a{ARRAY};
+ $hash_ref   = *a{HASH};
+ $io_ref     = *a{IO};
+ $code_ref   = *a{CODE};
+ $format_ref = *a{FORMAT};
+ $glob_ref   = *a{GLOB};
+
+
+
+
+
View
25 textmining/perl/ADVANCED/Some-Module.pm
@@ -0,0 +1,25 @@
+use Some::Module;
+our $useful = "Some handy string";
+
+print $Some::Module::useful;
+
+# What exporting "useful" to the caller would look like, as pseudo-code:
+#${caller( )."::useful"} = $useful;
+#@{caller( )."::useful"} = @useful;
+#&{caller( )."::useful"} = &useful;
+# The last line was live code, but a subroutine call cannot be assigned to.
+# A sub is exported by storing a code reference in the caller's typeglob,
+# which is what import() below does.
+
+
+sub useful { 42 }
+
+# Called by "use": installs useful() into the package that did the "use".
+sub import {
+    no strict 'refs';
+    *{caller( )."::useful"} = \&useful;
+}
+
+
+
+
+
+
+
+
+
+
View
20 textmining/perl/ADVANCED/export.pl
@@ -0,0 +1,20 @@
+#How work EXPORTS
+#$sym = useful
+#
+
+# Install every symbol named in @imports from package $pkg into package
+# $callpkg by assigning a reference to the matching typeglob.
+# $pkg, $callpkg and @imports are not set in this excerpt.
+foreach $sym (@imports) {
+ # shortcut for the common case of no type character
+ # (a name without a leading sigil is exported as a subroutine)
+ (*{"${callpkg}::$sym"} = \&{"${pkg}::$sym"}, next)
+ unless $sym =~ s/^(\W)//;
+
+ # The substitution removed the leading non-word character from $sym
+ # and captured it in $1.
+ $type = $1;
+ # Pick the kind of reference that matches the sigil; any other
+ # character is an error reported through Carp::croak.
+ *{"${callpkg}::$sym"} =
+ $type eq '&' ? \&{"${pkg}::$sym"} :
+ $type eq '$' ? \${"${pkg}::$sym"} :
+ $type eq '@' ? \@{"${pkg}::$sym"} :
+ $type eq '%' ? \%{"${pkg}::$sym"} :
+ $type eq '*' ? *{"${pkg}::$sym"} :
+ do { require Carp; Carp::croak("Can't export symbol:$type$sym") };
+ }
+
+
View
67 textmining/probabilityconditional.pl
@@ -0,0 +1,67 @@
+#!/usr/bin/perl -w
+use strict;
+
+#probabilidad condicional
+# P(B|C) => la probabilidad que el evento B ocurra dado que el evento C ya ocurrio.
+# la probabilidad de B dado C
+# n => el numero de posibilidades
+
+# P(E|F) = n (E and F) / n (F)
+# Probabilidad de que ocurran ambos
+# Probabilidad del que ocurrio.
+# P(C) => probabilidad que tenga una letra q al comienzo de la palabra
+# P(B && C) => probailidad que tenga la letra q al comienzo y siguiente la letra u
+# P(B) => Probabildiad que salga una letra u como segunda letra.
+# P(B|C) = P(B and C) / P(C)
+#
+
+open(my $fh, '<', $ARGV[0]) or die "ups.. No existe archivo, o problemas al abrir";
+
+# Counters over the four-letter words of the file.  They start at 0 so that
+# a count which never increases prints as 0 instead of triggering an
+# "uninitialized value" warning under -w.
+my $n          = 0;    # all four-letter words
+my $n_q_first  = 0;    # ... with q as first letter
+my $n_u_second = 0;    # ... with u as second letter
+my $n_q_then_u = 0;    # ... starting with qu
+my $n_nada     = 0;
+my $n_hice     = 0;
+my $n_hace     = 0;
+my $n_todo     = 0;
+
+while (my $linea = <$fh>) {
+
+    chomp $linea;
+    $linea = lc $linea;
+    # Everything that is not a letter (dashes included) becomes a space.
+    $linea =~ s/[^a-zA-Z]/ /g;
+
+    # Split the line into words.
+    foreach (split ' ', $linea) {
+
+        next unless length == 4;
+
+        ++$n;
+        print $_, "\n";
+        # The word is exactly four letters long, so the anchored patterns
+        # match the same words as the original q... / .u.. / qu.. ones.
+        if ( /^q/ )  { ++$n_q_first }
+        if ( /^.u/ ) { ++$n_u_second }
+        if ( /^qu/ ) { ++$n_q_then_u }
+        if ( $_ eq 'nada' ) { ++$n_nada; }
+        if ( $_ eq 'hice' ) { ++$n_hice; }
+        if ( $_ eq 'hace' ) { ++$n_hace; }
+        if ( $_ eq 'todo' ) { ++$n_todo; }
+    }
+
+}
+
+print "# 4 letter words = ".($n || 'Nada' )."\n";
+print "# 4 letter words with q first = ".($n_q_first || 'Nada')."\n";
+print "# 4 letter words with u second = ".($n_u_second|| 'nada' )."\n";
+print "# 4 letter words starting with qu =".( $n_q_then_u || 'nada' )."\n";
+print "Contando los nada: ".$n_nada. "\n";
+print "Contando los hice: ".$n_hice. "\n";
+print "Contando los hace: ".$n_hace. "\n";
+print "Contando los todo: ".$n_todo. "\n";
+
+close $fh;
View
27 textmining/textminingtwitter.pl
@@ -0,0 +1,27 @@
+ use Net::Twitter;
+ use Scalar::Util 'blessed';
+
+ # As of 13-Aug-2010, Twitter requires OAuth for authenticated requests
+ # NOTE(review): $consumer_key, $consumer_secret, $token, $token_secret and
+ # $high_water are not set anywhere in this script -- fill them in first.
+ my $nt = Net::Twitter->new(
+     traits => [qw/OAuth API::REST/],
+     consumer_key => $consumer_key,
+     consumer_secret => $consumer_secret,
+     access_token => $token,
+     access_token_secret => $token_secret,
+ );
+
+ my $result = $nt->update('Hello, world!');
+
+ # Print up to 100 statuses of the friends timeline newer than $high_water.
+ eval {
+     my $statuses = $nt->friends_timeline({ since_id => $high_water, count => 100 });
+     for my $status ( @$statuses ) {
+         print "$status->{created_at} <$status->{user}{screen_name}> $status->{text}\n";
+     }
+ };
+ if ( my $err = $@ ) {
+     # Re-throw the saved copy: $@ is a global and may have been reset by
+     # the time the condition below has been evaluated.
+     die $err unless blessed $err && $err->isa('Net::Twitter::Error');
+
+     warn "HTTP Response Code: ", $err->code, "\n",
+          "HTTP Message......: ", $err->message, "\n",
+          "Twitter error.....: ", $err->error, "\n";
+ }
View
58 textmining/thebagofwords.pl
@@ -0,0 +1,58 @@
+#!/usr/bin/perl -w
+use strict;
+
+# THE BAG-OF-WORDS MODEL
+# es una apropiada metadora tomar las palabras y agruparlas en una bolsa
+# sin tomar en cuenta el orden, y luego regresarlas a su sitio.
+# la bolsa de palabras: es el analisis de frecuencia de uso de una palabra
+# limitations
+# el orden de las palabras es irrelevante
+# pierdes importante información
+#
+
+my %freq;         # word => number of occurrences (read by byReverseValues)
+my $total = 0;    # number of words read from the whole file
+
+open(my $fh, '<', $ARGV[0]) or die "no existe archivo \n";
+
+
+while (my $linea = <$fh>) {
+
+    # Everything that is not a letter (dashes included) becomes a space.
+    $linea =~ s/[^a-zA-Z]/ /g;
+    $linea = lc $linea;
+
+    # Split the line into words.  split ' ' skips leading whitespace, so a
+    # line starting with a non-letter no longer adds an empty "word".
+    foreach my $word (split ' ', $linea) {
+        $freq{$word}++;
+        $total++;
+    }
+}
+close $fh;
+
+# Print the words by decreasing frequency.
+
+my $j = 0;
+
+foreach my $word ( sort byReverseValues keys %freq) {
+    print "$word: $freq{$word}\n";
+    $j++;
+
+    #if ($j==10) { last;}
+}
+
+# Total.  The original printed $#words+1, i.e. only the number of words on
+# the LAST line read; $total covers the whole file.  (The label still says
+# "bigramas" although single words are counted.)
+
+print "Total de numero de bigramas: ".$total." \n";
+
+
+# sort comparator: decreasing frequency (looked up in %freq) first,
+# alphabetical order as the tie-breaker.
+sub byReverseValues {
+    return ($freq{$b} <=> $freq{$a}) || ($a cmp $b);
+}
+
View
46 textmining/twitterperl.pl
@@ -0,0 +1,46 @@
+use Net::Twitter;
+
+# Create the Twitter client.  The consumer key and secret are empty strings
+# here and have to be filled in.
+my $nt = Net::Twitter->new(
+ traits => ['API::REST', 'OAuth'],
+ consumer_key => "",
+ consumer_secret => "",
+ );
+
+# You'll save the token and secret in cookie, config file or session database
+# restore_tokens() is called before FILE is opened for appending below.
+my($access_token, $access_token_secret) = restore_tokens();
+# FILE is the handle save_tokens() prints to.
+open (FILE, ">>accesstoken.txt") or die 'imposible obtener acceso a twitter';
+
+
+
+ # Hand previously saved credentials to the client, if both were found.
+ if ($access_token && $access_token_secret) {
+ $nt->access_token($access_token);
+ $nt->access_token_secret($access_token_secret);
+
+ }
+
+ unless ( $nt->authorized ) {
+ # The client is not yet authorized: Do it now
+ print "Authorize this app at ", $nt->get_authorization_url, " and enter the PIN#\n";
+
+ my $pin = <STDIN>; # wait for input
+ chomp $pin;
+
+ # These "my" variables are new ones that hide the outer pair inside this block.
+ my($access_token, $access_token_secret, $user_id, $screen_name) = $nt->request_access_token(verifier => $pin);
+ save_tokens($access_token, $access_token_secret); # if necessary
+ }
+close FILE;
+
+# Write one "token;secret" line to the FILE handle, which the caller must
+# already have opened.
+sub save_tokens{
+    my ($token, $secret) = @_;
+    print FILE join(';', $token, $secret) . "\n";
+}
+ # Everything's ready
+
+# Read the saved OAuth credentials back from accesstoken.txt.
+# Returns the ';'-separated fields of the first line of the file (access
+# token, access token secret), or an empty list when the file is missing
+# or empty.
+sub restore_tokens{
+    # Lexical handle: the global FILE handle is left alone, and the file
+    # is closed again before returning.
+    open(my $fh, '<', 'accesstoken.txt') or return;
+    my $linea = <$fh>;
+    close $fh;
+    return unless defined $linea;
+
+    chomp $linea;
+    # Fixed: split($linea) used the line itself as the PATTERN and split
+    # $_ with it; the line has to be split on the ';' separator.
+    return split /;/, $linea;
+}
View
4 twitter/perl/date-calc.pl
@@ -67,7 +67,7 @@
sub restore_tokens {
my $a = shift;
my $b = shift;
-open FILE, "tokenaccess.db";
+open FILE, "tokenaccess.db.nosubir";
my $linea = <FILE>;
chomp $linea;
my @result = split ';', $linea;
@@ -78,7 +78,7 @@ sub restore_tokens {
sub save_tokens {
my $a = shift;
my $b = shift;
-open FILE, ">>tokenaccess.db";
+open FILE, ">>tokenaccess.db.nosubir";
print FILE $a.';'.$b;
close FILE;
}
View
4 twitter/perl/diad.pl
@@ -84,7 +84,7 @@ sub existe {
sub restore_tokens {
my $a = shift;
my $b = shift;
-open FILE, "tokenaccess.db" or die ('No se pudo abrir el archivo tokenaccess.db');
+open FILE, "tokenaccess.db.nosubir" or die ('No se pudo abrir el archivo tokenaccess.db');
my $linea = <FILE>;
chomp $linea;
my @result = split ';', $linea;
@@ -95,7 +95,7 @@ sub restore_tokens {
sub save_tokens {
my $a = shift;
my $b = shift;
-open FILE, ">>tokenaccess.db";
+open FILE, ">>tokenaccess.db.nosubir";
print FILE $a.';'.$b;
close FILE;
}
View
4 twitter/perl/directtoupdate.pl
@@ -85,7 +85,7 @@
sub restore_tokens {
my $a = shift;
my $b = shift;
-open FILE, "tokenaccess.db";
+open FILE, "tokenaccess.db.nosubir";
my $linea = <FILE>;
chomp $linea;
my @result = split ';', $linea;
@@ -96,7 +96,7 @@ sub restore_tokens {
sub save_tokens {
my $a = shift;
my $b = shift;
-open FILE, ">>tokenaccess.db";
+open FILE, ">>tokenaccess.db.nosubir";
print FILE $a.';'.$b;
close FILE;
}
View
4 twitter/perl/search.pl
@@ -102,7 +102,7 @@ sub existe {
sub restore_tokens {
my $a = shift;
my $b = shift;
-open FILE, "tokenaccess.db";
+open FILE, "tokenaccess.db.nosubir";
my $linea = <FILE>;
chomp $linea;
my @result = split ';', $linea;
@@ -113,7 +113,7 @@ sub restore_tokens {
sub save_tokens {
my $a = shift;
my $b = shift;
-open FILE, ">>tokenaccess.db";
+open FILE, ">>tokenaccess.db.nosubir";
print FILE $a.';'.$b;
close FILE;
}
View
4 twitter/perl/twitter.pl
@@ -59,7 +59,7 @@
sub restore_tokens {
my $a = shift;
my $b = shift;
-open FILE, "tokenaccess.db";
+open FILE, "tokenaccess.db.nosubir";
my $linea = <FILE>;
chomp $linea;
my @result = split ';', $linea;
@@ -70,7 +70,7 @@ sub restore_tokens {
sub save_tokens {
my $a = shift;
my $b = shift;
-open FILE, ">>tokenaccess.db";
+open FILE, ">>tokenaccess.db.nosubir";
print FILE $a.';'.$b;
close FILE;
}
Please sign in to comment.
Something went wrong with that request. Please try again.