Wow! It was fun doing this practical project on dataset principles!

kinsDev · Apr 30, 2024 · 05d623f · 05d623f
1 parent aae31ef
commit 05d623f
Show file tree

Hide file tree

Showing 4 changed files with 1,185 additions and 0 deletions.
diff --git a/Model Deployment Workflow/Dataset Principles.ipynb b/Model Deployment Workflow/Dataset Principles.ipynb
@@ -0,0 +1,182 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true,
+    "ExecuteTime": {
+     "end_time": "2024-04-30T07:38:56.606835Z",
+     "start_time": "2024-04-30T07:38:56.597195400Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.datasets import make_regression\n",
+    "from sklearn.model_selection import train_test_split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "outputs": [],
+   "source": [
+    "# Synthetic regression problem: 1000 rows, 5 feature columns (only 2 of them informative), 1 target column.\n",
+    "# random_state is fixed so every run generates the same data.\n",
+    "regression_dataset = make_regression(\n",
+    "    n_samples=1000, n_features=5,  # overall shape of the feature matrix\n",
+    "    n_informative=2,               # features that actually drive the target\n",
+    "    n_targets=1,\n",
+    "    random_state=0,\n",
+    ")"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-04-30T07:38:57.593961Z",
+     "start_time": "2024-04-30T07:38:57.585953700Z"
+    }
+   },
+   "id": "76c2f57ef7a4876",
+   "execution_count": 7
+  },
+  {
+   "cell_type": "code",
+   "outputs": [],
+   "source": [
+    "# regression_dataset is the (X, y) pair produced by make_regression:\n",
+    "# index 0 holds the feature matrix, index 1 holds the target vector.\n",
+    "features, target = regression_dataset\n",
+    "\n",
+    "# make_regression supplies no feature names, so generate one label per feature column.\n",
+    "# (Slicing the target vector for labels would turn target values into column headers.)\n",
+    "feature_names = [f\"feature_{i}\" for i in range(features.shape[1])]\n",
+    "\n",
+    "df = pd.DataFrame(features, columns=feature_names)\n",
+    "df['target'] = target  # append the target as the last column"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-04-30T07:46:00.103493100Z",
+     "start_time": "2024-04-30T07:46:00.090060600Z"
+    }
+   },
+   "id": "c0a46b7decc674e7",
+   "execution_count": 16
+  },
+  {
+   "cell_type": "code",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "   feature_0  feature_1  feature_2  feature_3  feature_4      target\n0   0.236225  -0.323289  -0.018429  -1.548471   1.311427   70.618083\n1  -0.801497   0.271170  -0.525641  -0.887780   0.936399   52.757870\n2   0.687881   0.417044  -1.203735   0.498727  -0.737932  -43.728456\n3  -0.679593  -1.063433  -1.797456   0.913202   2.211304  156.835125\n4   0.096479  -0.507060   0.522083   0.155794   1.520004  102.748706",
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>feature_0</th>\n      <th>feature_1</th>\n      <th>feature_2</th>\n      <th>feature_3</th>\n      <th>feature_4</th>\n      <th>target</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0.236225</td>\n      <td>-0.323289</td>\n      <td>-0.018429</td>\n      <td>-1.548471</td>\n      <td>1.311427</td>\n      <td>70.618083</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>-0.801497</td>\n      <td>0.271170</td>\n      <td>-0.525641</td>\n      <td>-0.887780</td>\n      <td>0.936399</td>\n      <td>52.757870</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>0.687881</td>\n      <td>0.417044</td>\n      <td>-1.203735</td>\n      <td>0.498727</td>\n      <td>-0.737932</td>\n      <td>-43.728456</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>-0.679593</td>\n      <td>-1.063433</td>\n      <td>-1.797456</td>\n      <td>0.913202</td>\n      <td>2.211304</td>\n      <td>156.835125</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0.096479</td>\n      <td>-0.507060</td>\n      <td>0.522083</td>\n      <td>0.155794</td>\n      <td>1.520004</td>\n      <td>102.748706</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-04-30T07:46:07.103675200Z",
+     "start_time": "2024-04-30T07:46:07.095122Z"
+    }
+   },
+   "id": "4577c2d80d39372a",
+   "execution_count": 18
+  },
+  {
+   "cell_type": "code",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Dataset (600, 6)\n",
+      "Validation Dataset (200, 6)\n",
+      "Testing Dataset (200, 6)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Hold out 20% of all rows as the test set (train+validation: 0.8 | test: 0.2)\n",
+    "df_train_val, df_test = train_test_split(\n",
+    "    df,\n",
+    "    test_size=0.2,\n",
+    "    random_state = 0\n",
+    ")\n",
+    "\n",
+    "# Split the remaining 80% into train and validation.\n",
+    "# test_size is relative to df_train_val, so 0.25 * 0.8 = 0.2 of the full dataset.\n",
+    "df_train, df_val = train_test_split(\n",
+    "    df_train_val,\n",
+    "    test_size=0.25,\n",
+    "    random_state = 0\n",
+    ")\n",
+    "# Final dataset sizes: train: 0.6, validation: 0.2, test: 0.2\n",
+    "\n",
+    "# Sanity check: the three splits together contain as many rows as df\n",
+    "assert len(df_train) + len(df_val) + len(df_test) == len(df)\n",
+    "\n",
+    "# Output each shape to confirm the size of train/validation/test\n",
+    "# (the shape is passed directly; wrapping it in braces would print a set literal)\n",
+    "print(\"Training Dataset\", df_train.shape)\n",
+    "print(\"Validation Dataset\", df_val.shape)\n",
+    "print(\"Testing Dataset\", df_test.shape)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-04-30T07:46:08.406249500Z",
+     "start_time": "2024-04-30T07:46:08.399261200Z"
+    }
+   },
+   "id": "1cc0ab92f1415d66",
+   "execution_count": 19
+  },
+  {
+   "cell_type": "code",
+   "outputs": [],
+   "source": [
+    "# Write each split to its own CSV file; index=False leaves the row index out of the file\n",
+    "for split_df, csv_name in (\n",
+    "    (df_train, 'Training.csv'),\n",
+    "    (df_val, 'Validation.csv'),\n",
+    "    (df_test, 'Test.csv'),\n",
+    "):\n",
+    "    split_df.to_csv(csv_name, index=False)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-04-30T07:47:24.995714600Z",
+     "start_time": "2024-04-30T07:47:24.848175800Z"
+    }
+   },
+   "id": "1ca804b42c807944",
+   "execution_count": 20
+  },
+  {
+   "cell_type": "code",
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "35815eb5e913dc8b"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}