From fc8842566976bea6479a18419b5350287c812768 Mon Sep 17 00:00:00 2001 From: Michael Libio Date: Wed, 14 May 2025 16:38:51 +0200 Subject: [PATCH] Completed extra lab work --- .gitignore | 1 + lab-logistic-regression-with-python.ipynb | 1071 +++++++++++++++++++-- 2 files changed, 1018 insertions(+), 54 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ec7fd08 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +ChurnData.csv diff --git a/lab-logistic-regression-with-python.ipynb b/lab-logistic-regression-with-python.ipynb index 05ead5e..72b53fe 100644 --- a/lab-logistic-regression-with-python.ipynb +++ b/lab-logistic-regression-with-python.ipynb @@ -123,16 +123,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "import piplite\n", - "await piplite.install(['pandas'])\n", - "await piplite.install(['matplotlib'])\n", - "await piplite.install(['numpy'])\n", - "await piplite.install(['scikit-learn'])\n", - "await piplite.install(['scipy'])\n" + "# import piplite\n", + "# await piplite.install(['pandas'])\n", + "# await piplite.install(['matplotlib'])\n", + "# await piplite.install(['numpy'])\n", + "# await piplite.install(['scikit-learn'])\n", + "# await piplite.install(['scipy'])\n" ] }, { @@ -150,7 +150,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "button": false, "new_sheet": false, @@ -171,17 +171,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "from pyodide.http import pyfetch\n", + "import urllib.request # Add this import\n", "\n", - "async def download(url, filename):\n", - " response = await pyfetch(url)\n", - " if response.status == 200:\n", - " with open(filename, \"wb\") as f:\n", - " f.write(await response.bytes())\n" + "def download(url, filename):\n", + " print(f\"Downloading {url} to {filename} using urllib.request...\")\n", + " try:\n", + " urllib.request.urlretrieve(url, filename)\n", + " print(f\"Successfully downloaded {filename}.\")\n", + " except Exception as e:\n", + " print(f\"Failed to download {filename}. Error: {e}\")\n", + " # Depending on your needs, you might want to re-raise the exception\n", + " # or handle it more gracefully.\n", + " # raise\n" ] }, { @@ -226,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "button": false, "new_sheet": false, @@ -262,9 +267,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%203/data/ChurnData.csv to ChurnData.csv using urllib.request...\n", + "Successfully downloaded ChurnData.csv.\n" + ] + }, + { + "ename": "TypeError", + "evalue": "object NoneType can't be used in 'await' expression", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[8], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m download(path, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mChurnData.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 2\u001b[0m path\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mChurnData.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "\u001b[0;31mTypeError\u001b[0m: object NoneType can't be used in 'await' expression" + ] + } + ], "source": [ "\n", "await download(path, \"ChurnData.csv\")\n", @@ -273,7 +298,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "button": false, "new_sheet": false, @@ -281,7 +306,207 @@ "read_only": false } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tenureageaddressincomeedemployequipcallcardwirelesslongmon...pagerinternetcallwaitconferebillloglonglogtolllninccustcatchurn
011.033.07.0136.05.05.00.01.01.04.40...1.00.01.01.00.01.4823.0334.9134.01.0
133.033.012.033.02.00.00.00.00.09.45...0.00.00.00.00.02.2463.2403.4971.01.0
223.030.09.030.01.02.00.00.00.06.30...0.00.00.01.00.01.8413.2403.4013.00.0
338.035.05.076.02.010.01.01.01.06.05...1.01.01.01.01.01.8003.8074.3314.00.0
47.035.014.080.02.015.00.01.00.07.10...0.00.01.01.00.01.9603.0914.3823.00.0
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " tenure age address income ed employ equip callcard wireless \\\n", + "0 11.0 33.0 7.0 136.0 5.0 5.0 0.0 1.0 1.0 \n", + "1 33.0 33.0 12.0 33.0 2.0 0.0 0.0 0.0 0.0 \n", + "2 23.0 30.0 9.0 30.0 1.0 2.0 0.0 0.0 0.0 \n", + "3 38.0 35.0 5.0 76.0 2.0 10.0 1.0 1.0 1.0 \n", + "4 7.0 35.0 14.0 80.0 2.0 15.0 0.0 1.0 0.0 \n", + "\n", + " longmon ... pager internet callwait confer ebill loglong logtoll \\\n", + "0 4.40 ... 1.0 0.0 1.0 1.0 0.0 1.482 3.033 \n", + "1 9.45 ... 0.0 0.0 0.0 0.0 0.0 2.246 3.240 \n", + "2 6.30 ... 0.0 0.0 0.0 1.0 0.0 1.841 3.240 \n", + "3 6.05 ... 1.0 1.0 1.0 1.0 1.0 1.800 3.807 \n", + "4 7.10 ... 0.0 0.0 1.0 1.0 0.0 1.960 3.091 \n", + "\n", + " lninc custcat churn \n", + "0 4.913 4.0 1.0 \n", + "1 3.497 1.0 1.0 \n", + "2 3.401 3.0 0.0 \n", + "3 4.331 4.0 0.0 \n", + "4 4.382 3.0 0.0 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "churn_df = pd.read_csv(path)\n", "churn_df.head()" @@ -303,9 +528,133 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tenureageaddressincomeedemployequipcallcardwirelesschurn
011.033.07.0136.05.05.00.01.01.01
133.033.012.033.02.00.00.00.00.01
223.030.09.030.01.02.00.00.00.00
338.035.05.076.02.010.01.01.01.00
47.035.014.080.02.015.00.01.00.00
\n", + "
" + ], + "text/plain": [ + " tenure age address income ed employ equip callcard wireless \\\n", + "0 11.0 33.0 7.0 136.0 5.0 5.0 0.0 1.0 1.0 \n", + "1 33.0 33.0 12.0 33.0 2.0 0.0 0.0 0.0 0.0 \n", + "2 23.0 30.0 9.0 30.0 1.0 2.0 0.0 0.0 0.0 \n", + "3 38.0 35.0 5.0 76.0 2.0 10.0 1.0 1.0 1.0 \n", + "4 7.0 35.0 14.0 80.0 2.0 15.0 0.0 1.0 0.0 \n", + "\n", + " churn \n", + "0 1 \n", + "1 1 \n", + "2 0 \n", + "3 0 \n", + "4 0 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "churn_df = churn_df[['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip', 'callcard', 'wireless','churn']]\n", "churn_df['churn'] = churn_df['churn'].astype('int')\n", @@ -329,7 +678,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "button": false, "new_sheet": false, @@ -337,9 +686,21 @@ "read_only": false } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(200, 10)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# write your code here\n" + "# write your code here\n", + "churn_df.shape" ] }, { @@ -365,9 +726,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 11., 33., 7., 136., 5., 5., 0.],\n", + " [ 33., 33., 12., 33., 2., 0., 0.],\n", + " [ 23., 30., 9., 30., 1., 2., 0.],\n", + " [ 38., 35., 5., 76., 2., 10., 1.],\n", + " [ 7., 35., 14., 80., 2., 15., 0.]])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "X = np.asarray(churn_df[['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip']])\n", "X[0:5]" @@ -375,9 +751,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 1, 0, 0, 0])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "y = np.asarray(churn_df['churn'])\n", "y [0:5]" @@ -392,9 +779,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-1.13518441, -0.62595491, -0.4588971 , 0.4751423 , 1.6961288 ,\n", + " -0.58477841, -0.85972695],\n", + " [-0.11604313, -0.62595491, 0.03454064, -0.32886061, -0.6433592 ,\n", + " -1.14437497, -0.85972695],\n", + " [-0.57928917, -0.85594447, -0.261522 , -0.35227817, -1.42318853,\n", + " -0.92053635, -0.85972695],\n", + " [ 0.11557989, -0.47262854, -0.65627219, 0.00679109, -0.6433592 ,\n", + " -0.02518185, 1.16316 ],\n", + " [-1.32048283, -0.47262854, 0.23191574, 0.03801451, -0.6433592 ,\n", + " 0.53441472, -0.85972695]])" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn import preprocessing\n", "X = preprocessing.StandardScaler().fit(X).transform(X)\n", @@ -417,9 +824,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train set: (160, 7) (160,)\n", + "Test set: (40, 7) (40,)\n" + ] + } + ], "source": [ "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)\n", @@ -447,9 +863,427 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
LogisticRegression(C=0.01, solver='liblinear')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "LogisticRegression(C=0.01, solver='liblinear')" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import confusion_matrix\n", @@ -466,9 +1300,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "yhat = LR.predict(X_test)\n", "yhat" @@ -483,9 +1329,59 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.54132919, 0.45867081],\n", + " [0.60593357, 0.39406643],\n", + " [0.56277713, 0.43722287],\n", + " [0.63432489, 0.36567511],\n", + " [0.56431839, 0.43568161],\n", + " [0.55386646, 0.44613354],\n", + " [0.52237207, 0.47762793],\n", + " [0.60514349, 0.39485651],\n", + " [0.41069572, 0.58930428],\n", + " [0.6333873 , 0.3666127 ],\n", + " [0.58068791, 0.41931209],\n", + " [0.62768628, 0.37231372],\n", + " [0.47559883, 0.52440117],\n", + " [0.4267593 , 0.5732407 ],\n", + " [0.66172417, 0.33827583],\n", + " [0.55092315, 0.44907685],\n", + " [0.51749946, 0.48250054],\n", + " [0.485743 , 0.514257 ],\n", + " [0.49011451, 0.50988549],\n", + " [0.52423349, 0.47576651],\n", + " [0.61619519, 0.38380481],\n", + " [0.52696302, 0.47303698],\n", + " [0.63957168, 0.36042832],\n", + " [0.52205164, 0.47794836],\n", + " [0.50572852, 0.49427148],\n", + " [0.70706202, 0.29293798],\n", + " [0.55266286, 0.44733714],\n", + " [0.52271594, 0.47728406],\n", + " [0.51638863, 0.48361137],\n", + " [0.71331391, 0.28668609],\n", + " [0.67862111, 0.32137889],\n", + " [0.50896403, 0.49103597],\n", + " [0.42348082, 0.57651918],\n", + " [0.71495838, 0.28504162],\n", + " [0.59711064, 0.40288936],\n", + " [0.63808839, 0.36191161],\n", + " [0.39957895, 0.60042105],\n", + " [0.52127638, 0.47872362],\n", + " [0.65975464, 0.34024536],\n", + " [0.5114172 , 0.4885828 ]])" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "yhat_prob = LR.predict_proba(X_test)\n", "yhat_prob" @@ -509,9 +1405,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.7058823529411765" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.metrics import jaccard_score\n", "jaccard_score(y_test, yhat,pos_label=0)" @@ -528,9 +1435,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 6 9]\n", + " [ 1 24]]\n" + ] + } + ], "source": [ "from sklearn.metrics import classification_report, confusion_matrix\n", "import itertools\n", @@ -572,9 +1488,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Confusion matrix, without normalization\n", + "[[ 6 9]\n", + " [ 1 24]]\n" + ] + } + ], "source": [ "# Compute confusion matrix\n", "cnf_matrix = confusion_matrix(y_test, yhat, labels=[1,0])\n", @@ -604,9 +1530,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.73 0.96 0.83 25\n", + " 1 0.86 0.40 0.55 15\n", + "\n", + " accuracy 0.75 40\n", + " macro avg 0.79 0.68 0.69 40\n", + "weighted avg 0.78 0.75 0.72 40\n", + "\n" + ] + } + ], "source": [ "print (classification_report(y_test, yhat))\n" ] @@ -643,9 +1585,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.6017092478101185" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.metrics import log_loss\n", "log_loss(y_test, yhat_prob)" @@ -661,12 +1614,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LogLoss: : 0.61\n" + ] + } + ], "source": [ "# write your code here\n", - "\n" + "LR2 = LogisticRegression(C=0.01, solver='sag').fit(X_train,y_train)\n", + "yhat_prob2 = LR2.predict_proba(X_test)\n", + "print (\"LogLoss: : %.2f\" % log_loss(y_test, yhat_prob2))\n" ] }, { @@ -695,7 +1658,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -709,7 +1672,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.8" + "version": "3.12.7" }, "widgets": { "state": {},