-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
woow! It was fun doing this practical project, Dataset principles!!
- Loading branch information
Showing
4 changed files
with
1,185 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"id": "initial_id", | ||
"metadata": { | ||
"collapsed": true, | ||
"ExecuteTime": { | ||
"end_time": "2024-04-30T07:38:56.606835Z", | ||
"start_time": "2024-04-30T07:38:56.597195400Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"from sklearn.datasets import make_regression\n", | ||
"from sklearn.model_selection import train_test_split" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"outputs": [], | ||
"source": [ | ||
"# Synthetic regression problem: 1000 samples, 5 feature columns (only 2 of them informative) and 1 target column.\n", | ||
"# random_state is pinned so the same data is generated on every run.\n", | ||
"regression_dataset = make_regression(\n", | ||
"    n_samples=1000, n_features=5, n_informative=2,\n", | ||
"    n_targets=1, random_state=0,\n", | ||
")" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"ExecuteTime": { | ||
"end_time": "2024-04-30T07:38:57.593961Z", | ||
"start_time": "2024-04-30T07:38:57.585953700Z" | ||
} | ||
}, | ||
"id": "76c2f57ef7a4876", | ||
"execution_count": 7 | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"outputs": [], | ||
"source": [ | ||
"# Build a dataframe from the generated features and target.\n", | ||
"# make_regression returns a (features, target) tuple and supplies no feature names,\n", | ||
"# so generate generic labels (feature_0, feature_1, ...) rather than reusing target values as column headers.\n", | ||
"features, target = regression_dataset\n", | ||
"feature_names = [f'feature_{i}' for i in range(features.shape[1])]\n", | ||
"df = pd.DataFrame(features, columns=feature_names)\n", | ||
"df['target'] = target  # append the target as the last column" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"ExecuteTime": { | ||
"end_time": "2024-04-30T07:46:00.103493100Z", | ||
"start_time": "2024-04-30T07:46:00.090060600Z" | ||
} | ||
}, | ||
"id": "c0a46b7decc674e7", | ||
"execution_count": 16 | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": " 70.618082792805 52.75786995768049 -43.728455572844375 \\\n0 0.236225 -0.323289 -0.018429 \n1 -0.801497 0.271170 -0.525641 \n2 0.687881 0.417044 -1.203735 \n3 -0.679593 -1.063433 -1.797456 \n4 0.096479 -0.507060 0.522083 \n\n 156.83512459721914 102.74870615767526 target \n0 -1.548471 1.311427 70.618083 \n1 -0.887780 0.936399 52.757870 \n2 0.498727 -0.737932 -43.728456 \n3 0.913202 2.211304 156.835125 \n4 0.155794 1.520004 102.748706 ", | ||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>70.618082792805</th>\n <th>52.75786995768049</th>\n <th>-43.728455572844375</th>\n <th>156.83512459721914</th>\n <th>102.74870615767526</th>\n <th>target</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0.236225</td>\n <td>-0.323289</td>\n <td>-0.018429</td>\n <td>-1.548471</td>\n <td>1.311427</td>\n <td>70.618083</td>\n </tr>\n <tr>\n <th>1</th>\n <td>-0.801497</td>\n <td>0.271170</td>\n <td>-0.525641</td>\n <td>-0.887780</td>\n <td>0.936399</td>\n <td>52.757870</td>\n </tr>\n <tr>\n <th>2</th>\n <td>0.687881</td>\n <td>0.417044</td>\n <td>-1.203735</td>\n <td>0.498727</td>\n <td>-0.737932</td>\n <td>-43.728456</td>\n </tr>\n <tr>\n <th>3</th>\n <td>-0.679593</td>\n <td>-1.063433</td>\n <td>-1.797456</td>\n <td>0.913202</td>\n <td>2.211304</td>\n <td>156.835125</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0.096479</td>\n <td>-0.507060</td>\n <td>0.522083</td>\n <td>0.155794</td>\n <td>1.520004</td>\n <td>102.748706</td>\n </tr>\n </tbody>\n</table>\n</div>" | ||
}, | ||
"execution_count": 18, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Preview the first rows to sanity-check the column headers and values\n", | ||
"df.head()" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"ExecuteTime": { | ||
"end_time": "2024-04-30T07:46:07.103675200Z", | ||
"start_time": "2024-04-30T07:46:07.095122Z" | ||
} | ||
}, | ||
"id": "4577c2d80d39372a", | ||
"execution_count": 18 | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Training Dataset (600, 6)\n", | ||
"Validation Dataset (200, 6)\n", | ||
"Testing Dataset (200, 6)\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Hold out 20% of all rows as the test set (train+validation: 0.8 | test: 0.2)\n", | ||
"df_train_val, df_test = train_test_split(\n", | ||
"    df,\n", | ||
"    test_size=0.2,\n", | ||
"    random_state=0\n", | ||
")\n", | ||
"\n", | ||
"# Split the remaining 80% into train and validation.\n", | ||
"# test_size is a fraction of the frame passed in, so 0.25 of the remaining 80% is 20% of the full dataset.\n", | ||
"df_train, df_val = train_test_split(\n", | ||
"    df_train_val,\n", | ||
"    test_size=0.25,\n", | ||
"    random_state=0\n", | ||
")\n", | ||
"# Final dataset sizes: train: 0.6, validation: 0.2, test: 0.2\n", | ||
"\n", | ||
"# The three splits together must account for every row of the original frame\n", | ||
"assert len(df_train) + len(df_val) + len(df_test) == len(df)\n", | ||
"\n", | ||
"# Output each shape to confirm the size of train/validation/test\n", | ||
"# (pass the shape tuple directly; wrapping it in braces would print a one-element set)\n", | ||
"print(\"Training Dataset\", df_train.shape)\n", | ||
"print(\"Validation Dataset\", df_val.shape)\n", | ||
"print(\"Testing Dataset\", df_test.shape)" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"ExecuteTime": { | ||
"end_time": "2024-04-30T07:46:08.406249500Z", | ||
"start_time": "2024-04-30T07:46:08.399261200Z" | ||
} | ||
}, | ||
"id": "1cc0ab92f1415d66", | ||
"execution_count": 19 | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"outputs": [], | ||
"source": [ | ||
"# Persist each split to its own CSV file; index=False leaves the row index out of the file\n", | ||
"for split_df, csv_name in (\n", | ||
"    (df_train, 'Training.csv'),\n", | ||
"    (df_val, 'Validation.csv'),\n", | ||
"    (df_test, 'Test.csv'),\n", | ||
"):\n", | ||
"    split_df.to_csv(csv_name, index=False)" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"ExecuteTime": { | ||
"end_time": "2024-04-30T07:47:24.995714600Z", | ||
"start_time": "2024-04-30T07:47:24.848175800Z" | ||
} | ||
}, | ||
"id": "1ca804b42c807944", | ||
"execution_count": 20 | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"outputs": [], | ||
"source": [], | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"id": "35815eb5e913dc8b" | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 2 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython2", | ||
"version": "2.7.6" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.