Commit
Showing 7 changed files with 1,001 additions and 1 deletion.
@@ -0,0 +1,139 @@
{
  "metadata" : {
    "org" : null,
    "language_info" : {
      "file_extension" : ".py",
      "nbconvert_exporter" : "python",
      "mimetype" : "text/x-python",
      "codemirror_mode" : {
        "version" : 3,
        "name" : "ipython"
      },
      "pygments_lexer" : "ipython3",
      "name" : "python",
      "version" : "3.5.2"
    },
    "kernelspec" : {
      "language" : "python",
      "name" : "python3",
      "display_name" : "Python 3"
    }
  },
  "nbformat_minor" : 0,
  "nbformat" : 4,
  "cells" : [
    {
      "metadata" : {},
      "source" : [
        "I should mention that this technique (latent semantic analysis,\notherwise known as LSA) is related to principal component analysis\n(PCA). Matt Osborne will be presenting much more on PCA during a\nspecial session.\n\n"
      ],
      "cell_type" : "markdown"
    },
    {
      "metadata" : {},
      "cell_type" : "markdown",
      "source" : [
        "## Load some real-world data\n\n"
      ]
    },
    {
      "metadata" : {},
      "cell_type" : "markdown",
      "source" : [
        "Let’s see how this might work by reprising our reddit data set.\n\n"
      ]
    },
    {
      "outputs" : [],
      "metadata" : {},
      "cell_type" : "code",
      "source" : [
        "import json\nimport bz2\ncomments = []\nwith bz2.open('/home/jim/downloads/RC_2010-10.bz2', 'r') as f:\n    for line in f:\n        comment = json.loads(line.strip().decode('utf-8'))\n        if comment['subreddit'] == 'politics':\n            if comment['body'] != '[deleted]':\n                comments.append( comment )\n\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nvectorizer = TfidfVectorizer()\ncorpus = [comment['body'] for comment in comments]\nX = vectorizer.fit_transform(corpus)"
      ],
      "execution_count" : 1
    },
    {
      "metadata" : {},
      "cell_type" : "markdown",
      "source" : [
        "At this point each row of `X` is a vector representing one “document,”\nwhich in this case is a reddit comment.\n\n"
      ]
    },
    {
      "metadata" : {},
      "source" : [
        "## Reduce dimension\n\n"
      ],
      "cell_type" : "markdown"
    },
    {
      "execution_count" : 1,
      "metadata" : {},
      "cell_type" : "code",
      "source" : [
        "from sklearn.decomposition import TruncatedSVD\ntsvd = TruncatedSVD(n_components=300)\ntsvd.fit(X)\nX2 = tsvd.transform(X)"
      ],
      "outputs" : []
    },
    {
      "metadata" : {},
      "source" : [
        "You might enjoy changing `n_components`. In this case, `300` is a\n“recommended number.”\n\n"
      ],
      "cell_type" : "markdown"
    },
    {
      "metadata" : {},
      "cell_type" : "markdown",
      "source" : [
        "## Then cluster\n\n"
      ]
    },
    {
      "metadata" : {},
      "cell_type" : "markdown",
      "source" : [
        "Why is it reasonable (or a good idea) to first perform SVD before\nclustering?\n\n"
      ]
    },
    {
      "execution_count" : 1,
      "outputs" : [],
      "cell_type" : "code",
      "source" : [
        "from sklearn.cluster import KMeans\nkmeans = KMeans(n_clusters=10).fit(X2)\nkmeans.labels_"
      ],
      "metadata" : {}
    },
    {
      "cell_type" : "markdown",
      "source" : [
        "## Explore the clusters\n\n"
      ],
      "metadata" : {}
    },
    {
      "execution_count" : 1,
      "metadata" : {},
      "source" : [
"for j in np.unique(kmeans.labels_):\n print(\"****************************************************************\")\n print(\"Cluster\",j)\n for i in np.random.choice( np.nonzero(kmeans.labels_ == j)[0], size=20, replace=False ):\n print(corpus[i][0:70])" | ||
      ],
      "cell_type" : "code",
      "outputs" : []
    },
    {
      "cell_type" : "markdown",
      "source" : [
        "## Homework\n\n"
      ],
      "metadata" : {}
    },
    {
      "cell_type" : "markdown",
      "source" : [
        "Given a document, find (and print) documents which are nearby in the\n“semantic space” computed by SVD.\n\n"
      ],
      "metadata" : {}
    }
  ]
}
@@ -0,0 +1,69 @@
#+TITLE: latent semantic analysis
#+AUTHOR: Jim Fowler

I should mention that this technique (latent semantic analysis,
otherwise known as LSA) is related to principal component analysis
(PCA). Matt Osborne will be presenting much more on PCA during a
special session.

* Load some real-world data

Let's see how this might work by reprising our reddit data set.

#+BEGIN_SRC ipython
import json
import bz2
comments = []
with bz2.open('/home/jim/downloads/RC_2010-10.bz2', 'r') as f:
    for line in f:
        comment = json.loads(line.strip().decode('utf-8'))
        if comment['subreddit'] == 'politics':
            if comment['body'] != '[deleted]':
                comments.append( comment )

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
corpus = [comment['body'] for comment in comments]
X = vectorizer.fit_transform(corpus)
#+END_SRC

At this point each row of ~X~ is a vector representing one "document,"
which in this case is a reddit comment.
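
It's worth peeking at the shape of ~X~ before going further. A quick
sanity check, as a sketch (the exact counts depend on the comments you
loaded):

#+BEGIN_SRC ipython
# rows are comments; columns are tf-idf weights, one per vocabulary term
print(X.shape)
# the vectorizer's vocabulary maps each term to its column index
print(len(vectorizer.vocabulary_))
#+END_SRC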

* Reduce dimension

#+BEGIN_SRC ipython
from sklearn.decomposition import TruncatedSVD
tsvd = TruncatedSVD(n_components=300)
tsvd.fit(X)
X2 = tsvd.transform(X)
#+END_SRC

You might enjoy changing ~n_components~. In this case, ~300~ is a
"recommended number."

* Then cluster

Why is it reasonable (or a good idea) to first perform SVD before
clustering?

#+BEGIN_SRC ipython
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10).fit(X2)
kmeans.labels_
#+END_SRC

* Explore the clusters

#+BEGIN_SRC ipython
import numpy as np

for j in np.unique(kmeans.labels_):
    print("****************************************************************")
    print("Cluster", j)
    members = np.nonzero(kmeans.labels_ == j)[0]
    # guard against clusters with fewer than 20 comments
    for i in np.random.choice(members, size=min(20, len(members)), replace=False):
        print(corpus[i][0:70])
#+END_SRC
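
To get a feel for what each cluster is "about," one can map the
cluster centers back from the semantic space to term space and print
the highest-weighted terms. A sketch, assuming a scikit-learn old
enough to have ~get_feature_names~ (newer releases call it
~get_feature_names_out~):

#+BEGIN_SRC ipython
terms = vectorizer.get_feature_names()
# inverse_transform carries the 300-dimensional centers back to tf-idf space
centers = tsvd.inverse_transform(kmeans.cluster_centers_)
for j, center in enumerate(centers):
    top = np.argsort(center)[::-1][:10]
    print("Cluster", j, ":", ", ".join(terms[i] for i in top))
#+END_SRC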

* Homework

Given a document, find (and print) documents which are nearby in the
"semantic space" computed by SVD.
@@ -0,0 +1,173 @@
{
  "nbformat_minor" : 0,
  "metadata" : {
    "org" : null,
    "language_info" : {
      "name" : "python",
      "codemirror_mode" : {
        "version" : 3,
        "name" : "ipython"
      },
      "version" : "3.5.2",
      "nbconvert_exporter" : "python",
      "mimetype" : "text/x-python",
      "pygments_lexer" : "ipython3",
      "file_extension" : ".py"
    },
    "kernelspec" : {
      "language" : "python",
      "name" : "python3",
      "display_name" : "Python 3"
    }
  },
  "cells" : [
    {
      "cell_type" : "markdown",
      "source" : [
        "## Load some faces\n\n"
      ],
      "metadata" : {}
    },
    {
      "metadata" : {},
      "source" : [
        "Download the data from [http://www.cad.zju.edu.cn/home/dengcai/Data/Yale/Yale_64x64.mat](http://www.cad.zju.edu.cn/home/dengcai/Data/Yale/Yale_64x64.mat), perhaps by using `wget` or `curl`.\n\nLet’s load the data. It’s in a MATLAB format, but `scipy` can read\nthese.\n\n"
      ],
      "cell_type" : "markdown"
    },
    {
      "cell_type" : "code",
      "outputs" : [],
      "execution_count" : 1,
      "metadata" : {},
      "source" : [
        "import scipy.io\nX = scipy.io.loadmat('Yale_64x64.mat')['fea']\ny = scipy.io.loadmat('Yale_64x64.mat')['gnd'].reshape(-1)"
      ]
    },
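    {
      "cell_type" : "markdown",
      "source" : [
        "A quick sanity check is worthwhile here. A sketch, assuming the usual layout of this dataset: one row per image, one column per pixel, and a subject label in `y` for each image.\n\n"
      ],
      "metadata" : {}
    },
    {
      "cell_type" : "code",
      "outputs" : [],
      "execution_count" : 1,
      "metadata" : {},
      "source" : [
        "import numpy as np\n# rows are images; columns are the 64*64 = 4096 pixel intensities\nprint(X.shape)\n# the distinct subject labels\nprint(np.unique(y))"
      ]
    },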
    {
      "cell_type" : "markdown",
      "metadata" : {},
      "source" : [
        "## Display a face\n\n"
      ]
    },
    {
      "metadata" : {},
      "source" : [
        "We’ll use `cmap='gray'` so we have a grayscale colormap. Then we can\ntake a look at one of the faces. We’ll `transpose` so the face is\nlooking at us.\n\n"
      ],
      "cell_type" : "markdown"
    },
    {
      "source" : [
        "face = X[17].reshape(64,64)\nimport matplotlib.pyplot as plt\nplt.imshow(face.transpose(), cmap='gray')\nplt.show()"
      ],
      "metadata" : {},
      "execution_count" : 1,
      "outputs" : [],
      "cell_type" : "code"
    },
    {
      "metadata" : {},
      "source" : [
        "## The mean face\n\n"
      ],
      "cell_type" : "markdown"
    },
    {
      "metadata" : {},
      "source" : [
        "These faces are aligned. We can see this by taking the “mean face.”\n\n"
      ],
      "cell_type" : "markdown"
    },
    {
      "outputs" : [],
      "cell_type" : "code",
      "source" : [
        "import numpy as np\nmean_face = np.mean(X, axis=0)\nimport matplotlib.pyplot as plt\nplt.imshow(mean_face.reshape(64,64).transpose(), cmap='gray')\nplt.show()"
      ],
      "execution_count" : 1,
      "metadata" : {}
    },
    {
      "cell_type" : "markdown",
      "metadata" : {},
      "source" : [
        "## Reduce dimension\n\n"
      ]
    },
    {
      "cell_type" : "markdown",
      "source" : [
        "There are many ways to reduce the dimension of this data set. One\n**very** aggressive thing to do is SVD.\n\n"
      ],
      "metadata" : {}
    },
    {
      "execution_count" : 1,
      "metadata" : {},
      "source" : [
        "import numpy as np\nU, s, V = np.linalg.svd(X - mean_face)\n\ns[3:] = 0\nS = np.zeros(X.shape)\nS[:len(s), :len(s)] = np.diag(s)\nUS = np.matmul(U,S)"
      ],
      "cell_type" : "code",
      "outputs" : []
    },
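    {
      "cell_type" : "markdown",
      "source" : [
        "How aggressive is keeping only three singular values? A sketch of one way to quantify it, recomputing the singular values since `s` was zeroed in place above: the squared singular values measure how much variance each component carries.\n\n"
      ],
      "metadata" : {}
    },
    {
      "cell_type" : "code",
      "outputs" : [],
      "execution_count" : 1,
      "metadata" : {},
      "source" : [
        "s_full = np.linalg.svd(X - mean_face, compute_uv=False)\n# fraction of the total variance carried by the first three components\nprint((s_full[:3]**2).sum() / (s_full**2).sum())"
      ]
    },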
    {
      "cell_type" : "markdown",
      "metadata" : {},
      "source" : [
        "This is throwing away a ton of data, and yet the faces are “still\nthere.” For proof, let’s reconstruct one.\n\n"
      ]
    },
    {
      "cell_type" : "code",
      "outputs" : [],
      "source" : [
        "Xp = np.matmul(np.matmul(U,S),V)\nface = (Xp[17] + mean_face).reshape(64,64)\nplt.imshow(face.transpose(), cmap='gray')\nplt.show()"
      ],
      "execution_count" : 1,
      "metadata" : {}
    },
    {
      "source" : [
        "We can interactively view these faces.\n\n"
      ],
      "metadata" : {},
      "cell_type" : "markdown"
    },
    {
      "cell_type" : "code",
      "outputs" : [],
      "source" : [
        "%matplotlib inline\nfrom ipywidgets import interactive\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nU, s, V = np.linalg.svd(X - mean_face)\n\ns[4:] = 0\nS = np.zeros(X.shape)\nS[:len(s), :len(s)] = np.diag(s)\nUS = np.matmul(U,S)\n\ndef f(a, b, c, d):\n    # np.linalg.svd returns the principal directions as the *rows* of V\n    face = (a*V[0] + b*V[1] + c*V[2] + d*V[3] + mean_face).reshape(64,64)\n    plt.imshow(face.transpose(), cmap='gray')\n    plt.show()\n\ninteractive_plot = interactive(f, a=(-5,5), b=(-5,5), c=(-5,5), d=(-5,5))\noutput = interactive_plot.children[-1]\noutput.layout.height = '350px'\ninteractive_plot"
      ],
      "metadata" : {},
      "execution_count" : 1
    },
    {
      "metadata" : {},
      "source" : [
        "## Plotting in 3D\n\n"
      ],
      "cell_type" : "markdown"
    },
    {
      "metadata" : {},
      "source" : [
        "Can we identify any clusters in this low-dimensional projection?\n\n"
      ],
      "cell_type" : "markdown"
    },
    {
      "source" : [
        "from mpl_toolkits import mplot3d\nfig = plt.figure()\nax = plt.axes(projection='3d')\n# color each point by its subject label; the bare `c` here looked like a typo for c=y\nax.scatter(US[:,0], US[:,1], US[:,2], c=y)\nplt.show()"
      ],
      "metadata" : {},
      "execution_count" : 1,
      "cell_type" : "code",
      "outputs" : []
    }
  ],
  "nbformat" : 4
}