adding base python scripts

1 parent 098b361 commit 4a9f2cb3a173b025004929e9a00849b2ec37fd9a @hmason committed Feb 17, 2012
@@ -0,0 +1,35 @@
+import numpy as np
+
+# Create a random dataset
+rng = np.random.RandomState(1)
+X = np.sort(5 * rng.rand(80, 1), axis=0)  # 80 sorted points in [0, 5)
+y = np.sin(X).ravel()
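+# Add noise to every fifth target (80 samples / 5 = 16 noisy points)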
+y[::5] += 3 * (0.5 - rng.rand(16))
+
+# Fit regression model
+from sklearn.tree import DecisionTreeRegressor
+
+clf_1 = DecisionTreeRegressor(max_depth=2)
+clf_2 = DecisionTreeRegressor(max_depth=5)
+clf_1.fit(X, y)
+clf_2.fit(X, y)
+
+# Predict
+X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]  # 500 evenly spaced test points as a column vector
+y_1 = clf_1.predict(X_test)
+y_2 = clf_2.predict(X_test)
+
+# Plot the results
+import pylab as pl
+
+pl.figure()
+pl.scatter(X, y, c="k", label="data")
+pl.plot(X_test, y_1, c="g", label="max_depth=2", linewidth=2)
+pl.plot(X_test, y_2, c="r", label="max_depth=5", linewidth=2)
+pl.xlabel("data")
+pl.ylabel("target")
+pl.title("Decision Tree Regression")
+pl.legend()
+# pl.show()  # interactive display disabled; the figure is saved to disk below
+pl.savefig('decision_tree_regression.png', format='png')
@@ -0,0 +1,81 @@
+import csv
+from sklearn.datasets import fetch_20newsgroups
+from sklearn.feature_extraction.text import Vectorizer  # tf-idf vectorizer in this scikit-learn era (the modern name is TfidfVectorizer)
+from sklearn import metrics
+
+from sklearn.cluster import KMeans, MiniBatchKMeans
+
+import logging
+from optparse import OptionParser
+import sys
+from time import time
+
+import numpy as np
+
+
+# Display progress logs on stdout
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s %(levelname)s %(message)s')
+
+# parse commandline arguments
+op = OptionParser()
+op.add_option("--no-minibatch",
+              action="store_false", dest="minibatch", default=True,
+              help="Use ordinary k-means algorithm.")
+
+op.print_help()
+
+(opts, args) = op.parse_args()
+if len(args) > 0:
+    op.error("this script takes no arguments.")
+    sys.exit(1)  # unreachable in practice; op.error() already exits
+
+
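+# descriptions.csv is assumed to hold the label in column 0 and the description text in column 1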
+input_data = csv.reader(open('descriptions.csv','rb'))
+dataset_data = []
+dataset_target = []
+for row in input_data:
+    dataset_data.append(row[1])
+    dataset_target.append(row[0])
+
+labels = dataset_target
+true_k = np.unique(labels).shape[0]  # one cluster per distinct label
+
+print "Extracting features from the training dataset using a sparse vectorizer"
+t0 = time()
+vectorizer = Vectorizer(max_df=0.95, max_features=10000)
+X = vectorizer.fit_transform(dataset_data)
+# print X  # uncomment to inspect the sparse tf-idf matrix
+
+print "done in %fs" % (time() - t0)
+print "n_samples: %d, n_features: %d" % X.shape
+print
+
+
+###############################################################################
+# Do the actual clustering
+
+if opts.minibatch:
+    km = MiniBatchKMeans(k=true_k, init='k-means++', n_init=1,
+                         init_size=1000,
+                         batch_size=1000, verbose=1)
+else:
+    km = KMeans(k=true_k, init='random', max_iter=100, n_init=1, verbose=1)
+
+print "Clustering sparse data with %s" % km
+t0 = time()
+km.fit(X)
+print "done in %0.3fs" % (time() - t0)
+print
+
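+# evaluation: homogeneity/completeness/V-measure/ARI compare assignments to the CSV labels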
+print "Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_)
+print "Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_)
+print "V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)
+print "Adjusted Rand-Index: %.3f" % \
+ metrics.adjusted_rand_score(labels, km.labels_)
+print "Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
+ X, labels, sample_size=1000)
+
+print
@@ -0,0 +1,23 @@
+import sys, os
+import csv
+
+from sklearn import tree
+
+if __name__ == '__main__':
+    input_file = "thingiverse_liked_objects_1k.csv"
+    input_data = csv.reader(open(input_file, 'rb'))
+
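+    # assumed columns: user_id, num_likes, label (matching the export_graphviz feature names below)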
+    data_features = []
+    data_labels = []
+
+    for row in input_data:
+        data_features.append([float(row[0]), float(row[1])])  # cast CSV strings to numbers for the tree
+        data_labels.append(row[2])
+
+    dt = tree.DecisionTreeClassifier()
+    dt = dt.fit(data_features, data_labels)
+
+    # print dt.predict([12, 5])
+
+    # o = tree.export_graphviz(dt, out_file='thingiverse_tree.dot', feature_names=['user_id', 'num_likes'])
@@ -0,0 +1,98 @@
+#!/usr/bin/python
+#
+# Licensed to Cloudera, Inc. under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. Cloudera, Inc. licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+# Template for python Hadoop streaming. Fill in the map() and reduce()
+# functions, which should call emit(), as appropriate.
+#
+# Test your script with
+# cat input | python wordcount.py map | sort | python wordcount.py reduce
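+#
+# A sketch of a full streaming run (jar path and HDFS paths are assumptions; adjust per cluster):
+#   hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-streaming-*.jar \
+#     -input /user/me/input -output /user/me/wordcount-out \
+#     -mapper 'python wordcount.py map' -reducer 'python wordcount.py reduce' \
+#     -file wordcount.py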
+
+import sys
+import re
+try:
+    import simplejson as json
+except ImportError:
+    import json
+
+import __builtin__  # map() below shadows the builtin; reduce() needs the real one
+
+def map(line):
+    words = line.split()
+    for word in words:
+        emit(word, str(1))
+
+def reduce(key, values):
+    emit(key, str(sum(__builtin__.map(int, values))))
+
+# Common library code follows:
+
+def emit(key, value):
+    """
+    Emits a key->value pair. Key and value should be strings.
+    """
+    try:
+        print "\t".join((key, value))
+    except IOError:  # e.g., broken pipe when the downstream process exits early
+        pass
+
+def run_map():
+    """Calls map() for each input value."""
+    for line in sys.stdin:
+        line = line.rstrip()
+        map(line)
+
+def run_reduce():
+    """Gathers reduce() data in memory, and calls reduce()."""
+    prev_key = None
+    values = []
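+    # streaming input arrives sorted by key, so equal keys are adjacent lines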
+    for line in sys.stdin:
+        line = line.rstrip()
+        key, value = re.split("\t", line, 1)
+        if prev_key == key:
+            values.append(value)
+        else:
+            if prev_key is not None:
+                reduce(prev_key, values)
+            prev_key = key
+            values = [value]
+
+    if prev_key is not None:
+        reduce(prev_key, values)
+
+def main():
+    """Runs map or reduce code, per arguments."""
+    if len(sys.argv) != 2 or sys.argv[1] not in ("map", "reduce"):
+        print "Usage: %s <map|reduce>" % sys.argv[0]
+        sys.exit(1)
+    if sys.argv[1] == "map":
+        run_map()
+    elif sys.argv[1] == "reduce":
+        run_reduce()
+    else:
+        assert False
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,16 @@
+from hashes.simhash import simhash
+
+if __name__ == '__main__':
+    f = open('flat.txt', 'r')
+    data = [line.strip() for line in f.readlines()]
+    f.close()
+
+    # print data
+    all_hashes = dict([(d, simhash(d)) for d in data])
+
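+    # compare each description's simhash against the 'Flatpack Bunny' reference hash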
+    for k, h in all_hashes.items():
+        print "%s %s" % (k, h)
+        print all_hashes['Flatpack Bunny'].similarity(h)
+
+
