From 7e011f0dc93351300776216ce3135b538b9156ca Mon Sep 17 00:00:00 2001 From: Affiong Akpanisong Date: Tue, 1 Apr 2025 16:47:38 +0100 Subject: [PATCH] Labs on Logistic regression --- lab-logistic-regression-with-python.ipynb | 1002 ++++++++++++++++++++- 1 file changed, 963 insertions(+), 39 deletions(-) diff --git a/lab-logistic-regression-with-python.ipynb b/lab-logistic-regression-with-python.ipynb index 05ead5e..c5abfe4 100644 --- a/lab-logistic-regression-with-python.ipynb +++ b/lab-logistic-regression-with-python.ipynb @@ -150,7 +150,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "button": false, "new_sheet": false, @@ -226,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "button": false, "new_sheet": false, @@ -273,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "button": false, "new_sheet": false, @@ -281,7 +281,207 @@ "read_only": false } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tenureageaddressincomeedemployequipcallcardwirelesslongmon...pagerinternetcallwaitconferebillloglonglogtolllninccustcatchurn
011.033.07.0136.05.05.00.01.01.04.40...1.00.01.01.00.01.4823.0334.9134.01.0
133.033.012.033.02.00.00.00.00.09.45...0.00.00.00.00.02.2463.2403.4971.01.0
223.030.09.030.01.02.00.00.00.06.30...0.00.00.01.00.01.8413.2403.4013.00.0
338.035.05.076.02.010.01.01.01.06.05...1.01.01.01.01.01.8003.8074.3314.00.0
47.035.014.080.02.015.00.01.00.07.10...0.00.01.01.00.01.9603.0914.3823.00.0
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " tenure age address income ed employ equip callcard wireless \\\n", + "0 11.0 33.0 7.0 136.0 5.0 5.0 0.0 1.0 1.0 \n", + "1 33.0 33.0 12.0 33.0 2.0 0.0 0.0 0.0 0.0 \n", + "2 23.0 30.0 9.0 30.0 1.0 2.0 0.0 0.0 0.0 \n", + "3 38.0 35.0 5.0 76.0 2.0 10.0 1.0 1.0 1.0 \n", + "4 7.0 35.0 14.0 80.0 2.0 15.0 0.0 1.0 0.0 \n", + "\n", + " longmon ... pager internet callwait confer ebill loglong logtoll \\\n", + "0 4.40 ... 1.0 0.0 1.0 1.0 0.0 1.482 3.033 \n", + "1 9.45 ... 0.0 0.0 0.0 0.0 0.0 2.246 3.240 \n", + "2 6.30 ... 0.0 0.0 0.0 1.0 0.0 1.841 3.240 \n", + "3 6.05 ... 1.0 1.0 1.0 1.0 1.0 1.800 3.807 \n", + "4 7.10 ... 0.0 0.0 1.0 1.0 0.0 1.960 3.091 \n", + "\n", + " lninc custcat churn \n", + "0 4.913 4.0 1.0 \n", + "1 3.497 1.0 1.0 \n", + "2 3.401 3.0 0.0 \n", + "3 4.331 4.0 0.0 \n", + "4 4.382 3.0 0.0 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "churn_df = pd.read_csv(path)\n", "churn_df.head()" @@ -303,9 +503,133 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tenureageaddressincomeedemployequipcallcardwirelesschurn
011.033.07.0136.05.05.00.01.01.01
133.033.012.033.02.00.00.00.00.01
223.030.09.030.01.02.00.00.00.00
338.035.05.076.02.010.01.01.01.00
47.035.014.080.02.015.00.01.00.00
\n", + "
" + ], + "text/plain": [ + " tenure age address income ed employ equip callcard wireless \\\n", + "0 11.0 33.0 7.0 136.0 5.0 5.0 0.0 1.0 1.0 \n", + "1 33.0 33.0 12.0 33.0 2.0 0.0 0.0 0.0 0.0 \n", + "2 23.0 30.0 9.0 30.0 1.0 2.0 0.0 0.0 0.0 \n", + "3 38.0 35.0 5.0 76.0 2.0 10.0 1.0 1.0 1.0 \n", + "4 7.0 35.0 14.0 80.0 2.0 15.0 0.0 1.0 0.0 \n", + "\n", + " churn \n", + "0 1 \n", + "1 1 \n", + "2 0 \n", + "3 0 \n", + "4 0 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "churn_df = churn_df[['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip', 'callcard', 'wireless','churn']]\n", "churn_df['churn'] = churn_df['churn'].astype('int')\n", @@ -329,7 +653,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "button": false, "new_sheet": false, @@ -337,9 +661,20 @@ "read_only": false } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(200, 10)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# write your code here\n" + "churn_df.shape\n" ] }, { @@ -365,9 +700,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 11., 33., 7., 136., 5., 5., 0.],\n", + " [ 33., 33., 12., 33., 2., 0., 0.],\n", + " [ 23., 30., 9., 30., 1., 2., 0.],\n", + " [ 38., 35., 5., 76., 2., 10., 1.],\n", + " [ 7., 35., 14., 80., 2., 15., 0.]])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "X = np.asarray(churn_df[['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip']])\n", "X[0:5]" @@ -375,9 +725,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 1, 0, 0, 0])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "y = np.asarray(churn_df['churn'])\n", "y [0:5]" @@ -392,9 +753,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-1.13518441, -0.62595491, -0.4588971 , 0.4751423 , 1.6961288 ,\n", + " -0.58477841, -0.85972695],\n", + " [-0.11604313, -0.62595491, 0.03454064, -0.32886061, -0.6433592 ,\n", + " -1.14437497, -0.85972695],\n", + " [-0.57928917, -0.85594447, -0.261522 , -0.35227817, -1.42318853,\n", + " -0.92053635, -0.85972695],\n", + " [ 0.11557989, -0.47262854, -0.65627219, 0.00679109, -0.6433592 ,\n", + " -0.02518185, 1.16316 ],\n", + " [-1.32048283, -0.47262854, 0.23191574, 0.03801451, -0.6433592 ,\n", + " 0.53441472, -0.85972695]])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn import preprocessing\n", "X = preprocessing.StandardScaler().fit(X).transform(X)\n", @@ -417,9 +798,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train set: (160, 7) (160,)\n", + "Test set: (40, 7) (40,)\n" + ] + } + ], "source": [ "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)\n", @@ -447,9 +837,427 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
LogisticRegression(C=0.01, solver='liblinear')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "LogisticRegression(C=0.01, solver='liblinear')" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import confusion_matrix\n", @@ -466,9 +1274,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "yhat = LR.predict(X_test)\n", "yhat" @@ -483,9 +1303,59 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.54132919, 0.45867081],\n", + " [0.60593357, 0.39406643],\n", + " [0.56277713, 0.43722287],\n", + " [0.63432489, 0.36567511],\n", + " [0.56431839, 0.43568161],\n", + " [0.55386646, 0.44613354],\n", + " [0.52237207, 0.47762793],\n", + " [0.60514349, 0.39485651],\n", + " [0.41069572, 0.58930428],\n", + " [0.6333873 , 0.3666127 ],\n", + " [0.58068791, 0.41931209],\n", + " [0.62768628, 0.37231372],\n", + " [0.47559883, 0.52440117],\n", + " [0.4267593 , 0.5732407 ],\n", + " [0.66172417, 0.33827583],\n", + " [0.55092315, 0.44907685],\n", + " [0.51749946, 0.48250054],\n", + " [0.485743 , 0.514257 ],\n", + " [0.49011451, 0.50988549],\n", + " [0.52423349, 0.47576651],\n", + " [0.61619519, 0.38380481],\n", + " [0.52696302, 0.47303698],\n", + " [0.63957168, 0.36042832],\n", + " [0.52205164, 0.47794836],\n", + " [0.50572852, 0.49427148],\n", + " [0.70706202, 0.29293798],\n", + " [0.55266286, 0.44733714],\n", + " [0.52271594, 0.47728406],\n", + " [0.51638863, 0.48361137],\n", + " [0.71331391, 0.28668609],\n", + " [0.67862111, 0.32137889],\n", + " [0.50896403, 0.49103597],\n", + " [0.42348082, 0.57651918],\n", + " [0.71495838, 0.28504162],\n", + " [0.59711064, 0.40288936],\n", + " [0.63808839, 0.36191161],\n", + " [0.39957895, 0.60042105],\n", + " [0.52127638, 0.47872362],\n", + " [0.65975464, 0.34024536],\n", + " [0.5114172 , 0.4885828 ]])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "yhat_prob = LR.predict_proba(X_test)\n", "yhat_prob" @@ -509,9 +1379,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.7058823529411765" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.metrics import jaccard_score\n", "jaccard_score(y_test, yhat,pos_label=0)" @@ -528,9 +1409,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 6 9]\n", + " [ 1 24]]\n" + ] + } + ], "source": [ "from sklearn.metrics import classification_report, confusion_matrix\n", "import itertools\n", @@ -572,9 +1462,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Confusion matrix, without normalization\n", + "[[ 6 9]\n", + " [ 1 24]]\n" + ] + } + ], "source": [ "# Compute confusion matrix\n", "cnf_matrix = confusion_matrix(y_test, yhat, labels=[1,0])\n", @@ -604,9 +1504,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.73 0.96 0.83 25\n", + " 1 0.86 0.40 0.55 15\n", + "\n", + " accuracy 0.75 40\n", + " macro avg 0.79 0.68 0.69 40\n", + "weighted avg 0.78 0.75 0.72 40\n", + "\n" + ] + } + ], "source": [ "print (classification_report(y_test, yhat))\n" ] @@ -643,9 +1559,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.6017092478101185" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.metrics import log_loss\n", "log_loss(y_test, yhat_prob)" @@ -664,10 +1591,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# write your code here\n", - "\n" - ] + "source": [] }, { "cell_type": "markdown", @@ -695,7 +1619,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -709,7 +1633,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.8" + "version": "3.12.2" }, "widgets": { "state": {},