Skip to content

Commit

Permalink
woow! It was fun doing this practical project, Dataset principles!!
Browse files Browse the repository at this point in the history
  • Loading branch information
kinsDev committed Apr 30, 2024
1 parent aae31ef commit 05d623f
Show file tree
Hide file tree
Showing 4 changed files with 1,185 additions and 0 deletions.
182 changes: 182 additions & 0 deletions Model Deployment Workflow/Dataset Principles.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-04-30T07:38:56.606835Z",
"start_time": "2024-04-30T07:38:56.597195400Z"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.datasets import make_regression\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"outputs": [],
"source": [
"# Generate a synthetic regression problem: 1000 samples, 5 feature columns\n",
"# (only 2 of them informative) and 1 target column, with a fixed random_state\n",
"regression_dataset = make_regression(\n",
"    n_samples=1000, n_features=5, n_informative=2, n_targets=1, random_state=0\n",
")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-04-30T07:38:57.593961Z",
"start_time": "2024-04-30T07:38:57.585953700Z"
}
},
"id": "76c2f57ef7a4876",
"execution_count": 7
},
{
"cell_type": "code",
"outputs": [],
"source": [
"# Build a dataframe from the generated data: make_regression returns a (features, target) tuple\n",
"features, target = regression_dataset\n",
"# The synthetic dataset carries no feature names, so generate feature_0, feature_1, ...;\n",
"# slicing the target values for use as column headers would mislabel every column\n",
"feature_names = ['feature_{}'.format(i) for i in range(features.shape[1])]\n",
"df = pd.DataFrame(features, columns=feature_names)\n",
"df['target'] = target  # append the target as the last column"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-04-30T07:46:00.103493100Z",
"start_time": "2024-04-30T07:46:00.090060600Z"
}
},
"id": "c0a46b7decc674e7",
"execution_count": 16
},
{
"cell_type": "code",
"outputs": [
{
"data": {
"text/plain": " 70.618082792805 52.75786995768049 -43.728455572844375 \\\n0 0.236225 -0.323289 -0.018429 \n1 -0.801497 0.271170 -0.525641 \n2 0.687881 0.417044 -1.203735 \n3 -0.679593 -1.063433 -1.797456 \n4 0.096479 -0.507060 0.522083 \n\n 156.83512459721914 102.74870615767526 target \n0 -1.548471 1.311427 70.618083 \n1 -0.887780 0.936399 52.757870 \n2 0.498727 -0.737932 -43.728456 \n3 0.913202 2.211304 156.835125 \n4 0.155794 1.520004 102.748706 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>70.618082792805</th>\n <th>52.75786995768049</th>\n <th>-43.728455572844375</th>\n <th>156.83512459721914</th>\n <th>102.74870615767526</th>\n <th>target</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0.236225</td>\n <td>-0.323289</td>\n <td>-0.018429</td>\n <td>-1.548471</td>\n <td>1.311427</td>\n <td>70.618083</td>\n </tr>\n <tr>\n <th>1</th>\n <td>-0.801497</td>\n <td>0.271170</td>\n <td>-0.525641</td>\n <td>-0.887780</td>\n <td>0.936399</td>\n <td>52.757870</td>\n </tr>\n <tr>\n <th>2</th>\n <td>0.687881</td>\n <td>0.417044</td>\n <td>-1.203735</td>\n <td>0.498727</td>\n <td>-0.737932</td>\n <td>-43.728456</td>\n </tr>\n <tr>\n <th>3</th>\n <td>-0.679593</td>\n <td>-1.063433</td>\n <td>-1.797456</td>\n <td>0.913202</td>\n <td>2.211304</td>\n <td>156.835125</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0.096479</td>\n <td>-0.507060</td>\n <td>0.522083</td>\n <td>0.155794</td>\n <td>1.520004</td>\n <td>102.748706</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of the dataframe\n",
"df.head(n=5)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-04-30T07:46:07.103675200Z",
"start_time": "2024-04-30T07:46:07.095122Z"
}
},
"id": "4577c2d80d39372a",
"execution_count": 18
},
{
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training Dataset {(640, 6)}\n",
"Validation Dataset {(160, 6)}\n",
"Testing Dataset {(200, 6)}\n"
]
}
],
"source": [
"# First split: hold out 20% of all rows as the test set (train+validation: 0.8 | test: 0.2)\n",
"df_train_val, df_test = train_test_split(\n",
"    df,\n",
"    test_size=0.2,\n",
"    random_state=0\n",
")\n",
"\n",
"# Second split: 25% of the remaining 80% equals 20% of the full dataset,\n",
"# so test_size must be 0.25 here (0.2 would yield a 0.64 / 0.16 / 0.2 split)\n",
"df_train, df_val = train_test_split(\n",
"    df_train_val,\n",
"    test_size=0.25,\n",
"    random_state=0\n",
")\n",
"# Final dataset sizes: train: 0.6, validation: 0.2, test: 0.2\n",
"\n",
"# Every row must land in exactly one of the three splits\n",
"assert len(df_train) + len(df_val) + len(df_test) == len(df)\n",
"\n",
"# Output each shape to confirm the size of train/validation/test\n",
"# (pass the shape directly; wrapping it in braces would print a one-element set)\n",
"print(\"Training Dataset\", df_train.shape)\n",
"print(\"Validation Dataset\", df_val.shape)\n",
"print(\"Testing Dataset\", df_test.shape)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-04-30T07:46:08.406249500Z",
"start_time": "2024-04-30T07:46:08.399261200Z"
}
},
"id": "1cc0ab92f1415d66",
"execution_count": 19
},
{
"cell_type": "code",
"outputs": [],
"source": [
"# Write each split to its own CSV file, leaving out the row index\n",
"for csv_name, split_df in (\n",
"    ('Training.csv', df_train),\n",
"    ('Validation.csv', df_val),\n",
"    ('Test.csv', df_test),\n",
"):\n",
"    split_df.to_csv(csv_name, index=False)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-04-30T07:47:24.995714600Z",
"start_time": "2024-04-30T07:47:24.848175800Z"
}
},
"id": "1ca804b42c807944",
"execution_count": 20
},
{
"cell_type": "code",
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
},
"id": "35815eb5e913dc8b"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit 05d623f

Please sign in to comment.