From e68fa0e5ad0a77e62f27ecffba3169ab14994665 Mon Sep 17 00:00:00 2001 From: Kevin Markham Date: Thu, 22 Oct 2015 22:48:00 -0400 Subject: [PATCH] update class 20 materials --- README.md | 2 +- code/20_regex_exercise.py | 61 ++ notebooks/20_regularization.ipynb | 942 +++++++++++++++++++++++++++--- 3 files changed, 933 insertions(+), 72 deletions(-) create mode 100644 code/20_regex_exercise.py diff --git a/README.md b/README.md index f5a0fe5..97921bc 100644 --- a/README.md +++ b/README.md @@ -555,7 +555,7 @@ Tuesday | Thursday * [Baltimore homicide data](data/homicides.txt) * [Regular expressions 101](https://regex101.com/#python): real-time testing of regular expressions * [Reference guide](code/20_regex_reference.py) - * Exercise + * [Exercise](code/20_regex_exercise.py) **Homework:** * Your final project is due next week! diff --git a/code/20_regex_exercise.py b/code/20_regex_exercise.py new file mode 100644 index 0000000..f0e3067 --- /dev/null +++ b/code/20_regex_exercise.py @@ -0,0 +1,61 @@ +''' +EXERCISE: Regular Expressions +''' + +# open file and store each line as one list element +with open('homicides.txt', mode='rU') as f: + data = [row for row in f] + + +''' +Create a list of ages +''' + +import re + +ages = [] +for row in data: + match = re.search(r'\d+ years? old', row) + if match: + ages.append(match.group()) + else: + ages.append('0') + +# split the string on spaces, only keep the first element, and convert to int +ages = [int(element.split()[0]) for element in ages] + +# calculate average age +sum(ages) / float(len(ages)) + +# check that 'data' and 'ages' are the same length +assert(len(data)==len(ages)) + + +''' +Create a list of ages (using match groups) +''' + +ages = [] +for row in data: + match = re.search(r'(\d+)( years? old)', row) + if match: + ages.append(int(match.group(1))) + else: + ages.append(0) + + +''' +Create a list of causes +''' + +causes = [] +for row in data: + match = re.search(r'Cause: (.+?)<', row) + if match: + causes.append(match.group(1).lower()) + else: + causes.append('unknown') + +# tally the causes +from collections import Counter +Counter(causes) diff --git a/notebooks/20_regularization.ipynb b/notebooks/20_regularization.ipynb index a4e5c95..7c124ca 100644 --- a/notebooks/20_regularization.ipynb +++ b/notebooks/20_regularization.ipynb @@ -278,11 +278,191 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...118119120121122123124125126127
08NaNNaNLakewoodcity10.190.330.020.900.12...0.120.260.200.060.040.90.50.320.140.20
153NaNNaNTukwilacity10.000.160.120.740.45...0.020.120.45NaNNaNNaNNaN0.00NaN0.67
224NaNNaNAberdeentown10.000.420.490.560.17...0.010.210.02NaNNaNNaNNaN0.00NaN0.43
334581440Willingborotownship10.040.771.000.080.12...0.020.390.28NaNNaNNaNNaN0.00NaN0.12
442956096Bethlehemtownship10.010.550.020.950.09...0.040.090.02NaNNaNNaNNaN0.00NaN0.03
\n", + "

5 rows × 128 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 \\\n", + "0 8 NaN NaN Lakewoodcity 1 0.19 0.33 0.02 0.90 0.12 \n", + "1 53 NaN NaN Tukwilacity 1 0.00 0.16 0.12 0.74 0.45 \n", + "2 24 NaN NaN Aberdeentown 1 0.00 0.42 0.49 0.56 0.17 \n", + "3 34 5 81440 Willingborotownship 1 0.04 0.77 1.00 0.08 0.12 \n", + "4 42 95 6096 Bethlehemtownship 1 0.01 0.55 0.02 0.95 0.09 \n", + "\n", + " ... 118 119 120 121 122 123 124 125 126 127 \n", + "0 ... 0.12 0.26 0.20 0.06 0.04 0.9 0.5 0.32 0.14 0.20 \n", + "1 ... 0.02 0.12 0.45 NaN NaN NaN NaN 0.00 NaN 0.67 \n", + "2 ... 0.01 0.21 0.02 NaN NaN NaN NaN 0.00 NaN 0.43 \n", + "3 ... 0.02 0.39 0.28 NaN NaN NaN NaN 0.00 NaN 0.12 \n", + "4 ... 0.04 0.09 0.02 NaN NaN NaN NaN 0.00 NaN 0.03 \n", + "\n", + "[5 rows x 128 columns]" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# read in the dataset\n", "import pandas as pd\n", @@ -293,11 +473,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "count 1994.000000\n", + "mean 0.237979\n", + "std 0.232985\n", + "min 0.000000\n", + "25% 0.070000\n", + "50% 0.150000\n", + "75% 0.330000\n", + "max 1.000000\n", + "Name: 127, dtype: float64" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# examine the response variable\n", "crime[127].describe()" @@ -305,7 +504,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "collapsed": false }, @@ -317,7 +516,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "collapsed": true }, @@ -329,11 +528,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(319, 123)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# check the shape\n", "crime.shape" @@ -341,7 +551,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "collapsed": true }, @@ -354,7 +564,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "collapsed": true }, @@ -374,11 +584,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# build a linear regression model\n", "from sklearn.linear_model import LinearRegression\n", @@ -388,11 +609,49 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ -3.66188167e+00 6.98124465e-01 -2.61955467e-01 -2.85270027e-01\n", + " -1.64740837e-01 2.46972333e-01 -1.09290051e+00 -5.96857796e-01\n", + " 1.11200239e+00 -7.21968931e-01 4.27346598e+00 -2.28040268e-01\n", + " 8.04875769e-01 -2.57934732e-01 -2.63458023e-01 -1.04616958e+00\n", + " 6.07784197e-01 7.73552561e-01 5.96468029e-02 6.90215922e-01\n", + " 2.16759430e-02 -4.87802949e-01 -5.18858404e-01 1.39478815e-01\n", + " -1.24417942e-01 3.15003821e-01 -1.52633736e-01 -9.65003927e-01\n", + " 1.17142163e+00 -3.08546690e-02 -9.29085548e-01 1.24654586e-01\n", + " 1.98104506e-01 7.30804821e-01 -1.77337294e-01 8.32927588e-02\n", + " 3.46045601e-01 5.01837338e-01 1.57062958e+00 -4.13478807e-01\n", + " 1.39350802e+00 -3.49428114e+00 7.09577818e-01 -8.32141352e-01\n", + " -1.39984927e+00 1.02482840e+00 2.13855006e-01 -6.18937325e-01\n", + " 5.28954490e-01 7.98294890e-02 5.93688560e-02 -1.68582667e-01\n", + " 7.31264051e-01 -1.39635208e+00 2.38507704e-01 5.50621439e-01\n", + " -5.61447867e-01 6.18989764e-01 2.55517024e+00 -3.71769599e+00\n", + " 7.09191935e-01 3.82041439e-01 8.23752836e-01 -1.67703547e+00\n", + " -1.73150450e+00 9.90120171e-01 -5.72745697e-01 -1.45877295e+00\n", + " 8.68032144e-01 5.15959984e-01 3.14453207e-02 2.01869791e-01\n", + " 9.65291940e-02 2.13034099e+00 -6.95374423e-02 4.62477023e-02\n", + " -1.10565955e-02 -1.34313780e-02 -1.04515494e-01 -8.76985171e-01\n", + " 4.26781907e-01 -1.85405795e-01 -8.16215517e-01 -2.86596076e-01\n", + " -1.56110708e-01 1.76468580e+00 -5.70163730e-01 -7.54066704e-02\n", + " -1.74212697e-01 -8.89747220e-02 2.26336403e-01 1.38030073e+00\n", + " -3.37304744e-01 -2.57856611e-02 8.91299188e-02 3.49876793e-01\n", + " -1.22428557e+00 -3.67941205e+01 -6.95699750e-01 2.95269279e-01\n", + " -1.48590316e-03 2.34206416e-01 -7.09533984e-03 3.67152957e+01\n", + " -8.90665109e-02 3.79550678e-02 3.19375782e-01 4.60708905e-01\n", + " 1.41090069e-01 -6.67017320e-01 -2.59035245e-01 -4.60600755e-04\n", + " -1.51868232e-02 7.54768410e-02 -2.36105498e-03 -1.50328233e-01\n", + " 1.85575558e-01 6.31979224e-01 -1.50253625e-01 1.87638817e-02\n", + " -3.38095851e-02 -4.46104032e-01]\n" + ] + } + ], "source": [ "# examine the coefficients\n", "print linreg.coef_" @@ -400,7 +659,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "collapsed": false }, @@ -412,11 +671,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.233813676495\n" + ] + } + ], "source": [ "# calculate RMSE\n", "from sklearn import metrics\n", @@ -437,11 +704,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.233813676495\n" + ] + } + ], "source": [ "# alpha=0 is equivalent to linear regression\n", "from sklearn.linear_model import Ridge\n", @@ -453,11 +728,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.164279068049\n" + ] + } + ], "source": [ "# try alpha=0.1\n", "ridgereg = Ridge(alpha=0.1, normalize=True)\n", @@ -468,11 +751,49 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ -4.00298418e-03 3.51647445e-02 6.03535935e-02 -7.68532502e-02\n", + " -1.76099849e-02 4.53791433e-02 8.81586468e-03 -2.88885814e-02\n", + " -1.92143587e-02 3.36122201e-02 5.71590736e-04 -4.85438136e-02\n", + " 5.55725157e-02 -1.15934270e-01 -1.11880845e-01 -3.32742094e-01\n", + " -1.12302031e-02 9.63833243e-02 -8.92057732e-02 8.42691702e-02\n", + " -1.67246717e-02 7.42520308e-03 -1.21294025e-01 -6.70155789e-02\n", + " -1.74250249e-03 1.69446833e-01 3.18217654e-02 -1.00209834e-01\n", + " 3.97535644e-02 -1.19173054e-01 -1.04445267e-01 -5.14946676e-03\n", + " 1.10071013e-01 -3.22958955e-02 -1.40601627e-01 7.72658029e-02\n", + " 9.07962536e-02 -3.78878862e-03 4.61941793e-02 6.30299731e-02\n", + " -3.09236932e-02 1.02883578e-02 9.70425568e-02 -1.28936944e-01\n", + " -1.38268907e-01 -6.37169778e-02 -8.80160419e-02 -4.01991014e-02\n", + " 8.11064596e-02 -6.30663975e-02 1.29756859e-01 -6.25210624e-02\n", + " 1.60531213e-02 -1.39061824e-01 6.39822353e-02 4.87118744e-02\n", + " -7.68217532e-03 -1.53523412e-03 1.73028280e-02 -1.37258659e-03\n", + " -1.97381922e-02 4.47492477e-02 3.53941624e-03 -1.64126843e-02\n", + " -1.62363185e-02 7.10860268e-02 -1.34543849e-01 3.03401863e-02\n", + " 2.87012058e-02 2.62507811e-01 3.87946361e-02 4.16976393e-02\n", + " 2.45959130e-02 4.02803695e-02 -1.15568319e-02 1.82352709e-02\n", + " -1.11769965e-04 1.17220288e-02 -3.27960499e-02 -2.06336390e-02\n", + " -2.01424775e-02 -1.55746075e-02 -1.50471159e-01 5.00237268e-02\n", + " 1.67270388e-02 1.27989507e-01 -7.55437715e-02 -7.22756020e-02\n", + " -8.80283128e-02 6.42301728e-02 1.39781081e-01 4.71861289e-02\n", + " -6.42667056e-02 3.16227166e-02 -1.36066226e-02 5.16507328e-02\n", + " -4.60206271e-02 6.55072592e-04 3.51488294e-02 -1.68717518e-02\n", + " -7.00033520e-03 4.99335627e-02 8.40464679e-02 3.87553978e-03\n", + " -1.23632746e-01 -2.24505480e-02 -2.47960018e-03 4.13468551e-02\n", + " 8.26295505e-02 -4.84167513e-02 8.21329530e-03 1.57843967e-02\n", + " -1.94698620e-02 4.09120489e-02 -4.42911592e-02 -5.64373896e-02\n", + " 1.17841094e-01 7.34994342e-02 -2.78153968e-02 3.74136314e-02\n", + " -7.67878399e-02 -4.65440973e-02]\n" + ] + } + ], "source": [ "# examine the coefficients\n", "print ridgereg.coef_" @@ -488,11 +809,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 1.00000000e-02, 1.00000000e-01, 1.00000000e+00,\n", + " 1.00000000e+01, 1.00000000e+02])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# create an array of alpha values\n", "alpha_range = 10.**np.arange(-2, 3)\n", @@ -501,11 +834,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# select the best alpha with RidgeCV\n", "from sklearn.linear_model import RidgeCV\n", @@ -516,11 +860,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.163129782343\n" + ] + } + ], "source": [ "# predict method uses the best alpha value\n", "y_pred = ridgeregcv.predict(X_test)\n", @@ -540,11 +892,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0. 0. 0.00891952 -0.27423369 0. 0. 0.\n", + " -0. -0. 0. 0. 0. -0. -0.\n", + " -0. -0.19414627 0. 0. -0. -0. -0.\n", + " -0. -0. -0. -0. 0. 0. 0.\n", + " 0.04335664 -0. 0. -0. 0.03491474 -0.\n", + " -0.06685424 0. 0. -0. 0.10575313 0. 0.\n", + " 0.00890807 0. -0.1378172 -0.30954312 -0. -0. -0.\n", + " -0. 0. 0. 0. 0. -0. 0.\n", + " 0. 0. 0. 0. 0. -0. 0.\n", + " 0. 0. -0. 0. -0. -0. 0.\n", + " 0.05257892 -0. 0. -0. -0. 0. 0.\n", + " 0. 0. 0. -0. -0. -0. -0.\n", + " -0. -0. -0. 0. -0. -0. 0.\n", + " 0.13861081 0. -0. -0. 0. 0. 0.\n", + " 0. -0. 0. 0. 0. 0.03347908\n", + " 0. -0.01130055 -0. 0. 0. 0.00044205\n", + " 0. 0. 0. -0. 0. -0. -0.\n", + " 0.04153636 0. -0. 0.00719672 -0.000666 0. ]\n" + ] + } + ], "source": [ "# try alpha=0.001 and examine coefficients\n", "from sklearn.linear_model import Lasso\n", @@ -555,11 +932,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0. 0. 0. -0.03974695 0. 0. 0.\n", + " 0. 0. -0. 0. 0. -0. -0.\n", + " -0. -0. -0. 0. -0. -0. -0.\n", + " -0. -0. -0. -0. -0. -0. 0.\n", + " 0. 0. 0. -0. 0. -0. -0.\n", + " 0. 0. -0. 0. 0. 0. 0.\n", + " 0. -0. -0.27503063 -0. -0. -0. -0.\n", + " 0. 0. 0. 0. 0. 0. 0.\n", + " 0. 0. 0. 0. -0. 0. 0.\n", + " 0. 0. 0. 0. -0. 0. 0.\n", + " -0. 0. -0. -0. 0. 0. -0.\n", + " 0. 0. -0. -0. -0. -0. -0.\n", + " -0. -0. 0. 0. -0. 0. 0.\n", + " 0. 0. -0. 0. 0. 0. 0.\n", + " -0. 0. 0. 0. 0. 0. -0.\n", + " -0. 0. 0. 0. 0. 0. 0.\n", + " 0. 0. 0. 0. 0. 0. -0.\n", + " 0. -0. 0. ]\n" + ] + } + ], "source": [ "# try alpha=0.01 and examine coefficients\n", "lassoreg = Lasso(alpha=0.01, normalize=True)\n", @@ -569,11 +971,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.198165225429\n" + ] + } + ], "source": [ "# calculate RMSE (for alpha=0.01)\n", "y_pred = lassoreg.predict(X_test)\n", @@ -590,11 +1000,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Kevin\\Anaconda\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:444: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations\n", + " ConvergenceWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "0.0015161594598125873" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# select the best alpha with LassoCV\n", "from sklearn.linear_model import LassoCV\n", @@ -605,11 +1034,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0. 0. 0. -0.28113506 0. 0. 0.\n", + " 0. 0. 0. 0. 0. -0. -0.\n", + " -0. -0.15481092 0. 0. -0. -0. -0.\n", + " -0. -0. -0. -0. 0. -0. 0.\n", + " 0.06451487 0. 0. -0. 0. -0.\n", + " -0.01920421 0. 0. -0. 0.03386202 0. 0.\n", + " 0.08901243 0. -0.08759757 -0.36986917 -0. -0. -0.\n", + " -0. 0. 0. 0. 0. -0. 0.\n", + " 0. 0. 0. 0. 0. -0. 0.\n", + " 0. 0. -0. 0. 0. -0. 0.\n", + " 0.01740599 -0. 0. -0. -0. 0. 0.\n", + " 0. 0. 0. -0. -0. -0. -0.\n", + " -0. -0. -0. 0. -0. -0. 0.\n", + " 0.13471036 0. -0. -0. 0. -0. 0.\n", + " 0. -0. 0. 0. 0. 0.0054122 0.\n", + " -0. -0. 0. 0. 0. 0. 0.\n", + " 0. -0. 0. -0. 0. 0.02738796\n", + " 0. -0. 0. -0. 0. ]\n" + ] + } + ], "source": [ "# examine the coefficients\n", "print lassoregcv.coef_" @@ -617,11 +1071,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.160209558014\n" + ] + } + ], "source": [ "# predict method uses the best alpha value\n", "y_pred = lassoregcv.predict(X_test)\n", @@ -647,11 +1109,146 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012345678910111213
0114.231.712.4315.61272.803.060.282.295.641.043.921065
1113.201.782.1411.21002.652.760.261.284.381.053.401050
2113.162.362.6718.61012.803.240.302.815.681.033.171185
3114.371.952.5016.81133.853.490.242.187.800.863.451480
4113.242.592.8721.01182.802.690.391.824.321.042.93735
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 10 11 12 \\\n", + "0 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 \n", + "1 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 \n", + "2 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 \n", + "3 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 \n", + "4 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 \n", + "\n", + " 13 \n", + "0 1065 \n", + "1 1050 \n", + "2 1185 \n", + "3 1480 \n", + "4 735 " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# read in the dataset\n", "url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'\n", @@ -661,11 +1258,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "2 71\n", + "1 59\n", + "3 48\n", + "dtype: int64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# examine the response variable\n", "wine[0].value_counts()" @@ -673,7 +1284,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": { "collapsed": true }, @@ -686,7 +1297,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": { "collapsed": true }, @@ -706,11 +1317,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=1000000000.0, class_weight=None, dual=False,\n", + " fit_intercept=True, intercept_scaling=1, max_iter=100,\n", + " multi_class='ovr', penalty='l2', random_state=None,\n", + " solver='liblinear', tol=0.0001, verbose=0)" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# build a logistic regression model\n", "from sklearn.linear_model import LogisticRegression\n", @@ -720,11 +1345,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ -4.15397186e+00 4.90075538e+00 1.53790183e+01 -2.85235942e+00\n", + " 1.10505593e-01 -5.30164453e+00 1.05110893e+01 3.06125855e+00\n", + " -1.18150745e+01 -2.43836649e+00 -1.80124471e+00 7.09007536e+00\n", + " 6.93945792e-02]\n", + " [ 5.43369566e+00 -5.23569200e+00 -1.67764364e+01 1.64044307e+00\n", + " 5.76913231e-03 2.82443590e+00 4.90939605e+00 2.45082648e+00\n", + " 6.08259423e+00 -7.85245282e+00 3.48485003e+00 -7.70842670e+00\n", + " -4.90936762e-02]\n", + " [ -9.70207612e-01 2.08269720e+00 9.41526165e-01 2.38023989e-01\n", + " -2.49596817e-03 -9.80981243e-01 -6.54889317e+00 -4.83302817e-01\n", + " -2.65888456e+00 2.57458669e+00 -1.30417032e+00 -2.34300175e+00\n", + " 9.48521802e-03]]\n" + ] + } + ], "source": [ "# examine the coefficients\n", "print logreg.coef_" @@ -732,11 +1376,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 4.08866128e-10 3.95149488e-10 9.99999999e-01]\n", + " [ 4.06722935e-17 1.00000000e+00 4.32260610e-10]\n", + " [ 9.99999906e-01 3.46933978e-11 9.37658044e-08]\n", + " [ 4.64048343e-09 9.99999871e-01 1.24115833e-07]\n", + " [ 9.99730854e-01 3.16142539e-21 2.69146225e-04]\n", + " [ 1.42243183e-14 8.09034805e-07 9.99999191e-01]\n", + " [ 9.99829153e-01 1.47180023e-05 1.56129392e-04]\n", + " [ 9.99999720e-01 1.24359831e-23 2.80079015e-07]\n", + " [ 1.62930410e-17 3.49895213e-13 1.00000000e+00]\n", + " [ 1.97059220e-16 9.99997008e-01 2.99222229e-06]\n", + " [ 9.99967664e-01 2.23795926e-12 3.23360924e-05]\n", + " [ 9.73457858e-01 2.65419232e-02 2.19064985e-07]\n", + " [ 2.84224171e-20 1.00000000e+00 1.12562075e-10]\n", + " [ 9.99999998e-01 1.32844859e-13 1.72600662e-09]\n", + " [ 3.69546813e-09 9.99999996e-01 3.47691845e-10]\n", + " [ 4.22805011e-15 9.99999959e-01 4.13233076e-08]\n", + " [ 3.35466649e-29 4.79623565e-01 5.20376435e-01]\n", + " [ 9.99960400e-01 3.95740340e-05 2.64337565e-08]\n", + " [ 3.29043791e-14 9.99999930e-01 7.00568466e-08]\n", + " [ 9.99778348e-01 4.17075187e-21 2.21651755e-04]\n", + " [ 9.99999731e-01 2.62889100e-15 2.69237974e-07]\n", + " [ 6.19900193e-16 1.00000000e+00 2.86953165e-11]\n", + " [ 2.77940112e-09 9.69506740e-01 3.04932569e-02]\n", + " [ 5.28754692e-05 9.99947121e-01 3.47404069e-09]\n", + " [ 9.99947715e-01 5.22837028e-05 1.10248529e-09]\n", + " [ 5.82332195e-13 2.38782109e-08 9.99999976e-01]\n", + " [ 9.99999788e-01 2.08506990e-07 3.11782661e-09]\n", + " [ 9.99999791e-01 6.12671899e-08 1.47460916e-07]\n", + " [ 9.99986850e-01 8.10206146e-14 1.31497566e-05]\n", + " [ 9.54043555e-12 3.72425979e-19 1.00000000e+00]\n", + " [ 3.04071238e-11 1.00000000e+00 4.08122442e-11]\n", + " [ 3.50017326e-31 3.81668757e-19 1.00000000e+00]\n", + " [ 1.42219468e-22 6.83344681e-19 1.00000000e+00]\n", + " [ 9.99995753e-01 3.04811377e-20 4.24748096e-06]\n", + " [ 2.79451178e-01 7.20548821e-01 1.13159945e-09]\n", + " [ 5.67756081e-11 9.99999996e-01 3.89757045e-09]\n", + " [ 2.41389951e-17 9.99932283e-01 6.77171026e-05]\n", + " [ 2.28608435e-03 9.97713901e-01 1.48456046e-08]\n", + " [ 2.93122189e-12 9.99999997e-01 3.22321411e-09]\n", + " [ 9.99998259e-01 3.53259025e-08 1.70528897e-06]\n", + " [ 9.99999019e-01 1.03943395e-11 9.80508127e-07]\n", + " [ 3.41369632e-09 9.99903436e-01 9.65610816e-05]\n", + " [ 2.55526052e-20 7.80550210e-08 9.99999922e-01]\n", + " [ 9.99999985e-01 3.33192992e-10 1.45026119e-08]\n", + " [ 9.99999917e-01 3.19711403e-09 7.96376715e-08]]\n" + ] + } + ], "source": [ "# generate predicted probabilities\n", "y_pred_prob = logreg.predict_proba(X_test)\n", @@ -745,11 +1441,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.347281798715\n" + ] + } + ], "source": [ "# calculate log loss\n", "print metrics.log_loss(y_test, y_pred_prob)" @@ -768,7 +1472,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": { "collapsed": true }, @@ -784,11 +1488,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 0.21044571 0. 0. 0. 0. 0.\n", + " 0.48723077 0. 0. 0. 0. 0.15360853\n", + " 1.47743702]\n", + " [-0.65689319 -0.05651272 -0.11386446 0. 0. 0. 0.\n", + " 0. 0. -0.73862636 0.24344904 0. -0.63405624]\n", + " [ 0. 0. 0. 0. 0. 0.\n", + " -0.84238099 0. 0. 0.61559726 -0.49014626 -0.30427496\n", + " 0. ]]\n" + ] + } + ], "source": [ "# try C=0.1 with L1 penalty\n", "logreg = LogisticRegression(C=0.1, penalty='l1')\n", @@ -798,11 +1517,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.362248219747\n" + ] + } + ], "source": [ "# generate predicted probabilities and calculate log loss\n", "y_pred_prob = logreg.predict_proba(X_test_scaled)\n", @@ -811,11 +1538,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 0.59163934 0.06886667 0.33592964 -0.49616684 0.111539 0.21570086\n", + " 0.40524509 -0.15526139 -0.02534651 0.05399014 0.14877346 0.42327938\n", + " 0.89815007]\n", + " [-0.73545676 -0.32942948 -0.47995296 0.294866 -0.1500246 0.04264373\n", + " 0.14500586 0.07250763 0.17409795 -0.70726652 0.4128986 0.09997212\n", + " -0.81284365]\n", + " [ 0.20136567 0.30989025 0.15977925 0.18867218 0.04204443 -0.27108109\n", + " -0.55886639 0.07486943 -0.17471153 0.68266464 -0.52385748 -0.49566967\n", + " -0.02565631]]\n" + ] + } + ], "source": [ "# try C=0.1 with L2 penalty\n", "logreg = LogisticRegression(C=0.1, penalty='l2')\n", @@ -825,11 +1568,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.244588324539\n" + ] + } + ], "source": [ "# generate predicted probabilities and calculate log loss\n", "y_pred_prob = logreg.predict_proba(X_test_scaled)\n", @@ -846,7 +1597,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": { "collapsed": true }, @@ -859,11 +1610,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "GridSearchCV(cv=10, error_score='raise',\n", + " estimator=Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, max_iter=100, multi_class='ovr',\n", + " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", + " verbose=0))]),\n", + " fit_params={}, iid=True, loss_func=None, n_jobs=1,\n", + " param_grid={'logisticregression__penalty': ['l1', 'l2'], 'logisticregression__C': array([ 1.00000e-02, 1.00000e-01, 1.00000e+00, 1.00000e+01,\n", + " 1.00000e+02])},\n", + " pre_dispatch='2*n_jobs', refit=True, score_func=None,\n", + " scoring='log_loss', verbose=0)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# grid search for best combination of C and penalty\n", "from sklearn.grid_search import GridSearchCV\n", @@ -876,11 +1647,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[mean: -1.09861, std: 0.00000, params: {'logisticregression__penalty': 'l1', 'logisticregression__C': 0.01},\n", + " mean: -0.62547, std: 0.03037, params: {'logisticregression__penalty': 'l2', 'logisticregression__C': 0.01},\n", + " mean: -0.35491, std: 0.06891, params: {'logisticregression__penalty': 'l1', 'logisticregression__C': 0.10000000000000001},\n", + " mean: -0.26801, std: 0.04840, params: {'logisticregression__penalty': 'l2', 'logisticregression__C': 0.10000000000000001},\n", + " mean: -0.09436, std: 0.06114, params: {'logisticregression__penalty': 'l1', 'logisticregression__C': 1.0},\n", + " mean: -0.10371, std: 0.04894, params: {'logisticregression__penalty': 'l2', 'logisticregression__C': 1.0},\n", + " mean: -0.05837, std: 0.06413, params: {'logisticregression__penalty': 'l1', 'logisticregression__C': 10.0},\n", + " mean: -0.06174, std: 0.05651, params: {'logisticregression__penalty': 'l2', 'logisticregression__C': 10.0},\n", + " mean: -0.07142, std: 0.09266, params: {'logisticregression__penalty': 'l1', 'logisticregression__C': 100.0},\n", + " mean: -0.06443, std: 0.08409, params: {'logisticregression__penalty': 'l2', 'logisticregression__C': 100.0}]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# print all log loss scores\n", "grid.grid_scores_" @@ -888,11 +1679,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-0.0583689728556\n", + "{'logisticregression__penalty': 'l1', 'logisticregression__C': 10.0}\n" + ] + } + ], "source": [ "# examine the best model\n", "print grid.best_score_\n",