From 95a9fea91f96f9e82bf1cd6e7afe33b15c9034c1 Mon Sep 17 00:00:00 2001 From: MJossier Date: Sat, 13 Jan 2024 16:25:04 +0000 Subject: [PATCH] c --- Lab_Feature_Engeniring.ipynb | 2467 ++++++++++++++++++++++++++++++++++ 1 file changed, 2467 insertions(+) create mode 100644 Lab_Feature_Engeniring.ipynb diff --git a/Lab_Feature_Engeniring.ipynb b/Lab_Feature_Engeniring.ipynb new file mode 100644 index 0000000..d023f8e --- /dev/null +++ b/Lab_Feature_Engeniring.ipynb @@ -0,0 +1,2467 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b6e2c739", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "01fc8b1e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ODATEDWOSOURCETCODESTATEZIPMAILCODEPVASTATEDOBNOEXCHRECINHSE...TARGET_DHPHONE_DRFA_2RRFA_2FRFA_2AMDMAUD_RMDMAUD_FMDMAUD_ACLUSTER2GEOCODE2
08901GRI0IL6108137120...0.00L4EXXX39.0C
19401BOA1CA9132652020...0.00L2GXXX1.0A
29001AMH1NC2701700...0.01L4EXXX60.0C
38701BRY0CA9595328010...0.01L4EXXX41.0C
486010FL3317620010X...0.01L2FXXX26.0A
..................................................................
954079601ASE1AK9950400...0.00L1GXXX12.0C
954089601DCD1TX7737950010...0.01L1FXXX2.0A
954099501MBC1MI4891038010...0.01L3EXXX34.0B
954108601PRV0CA9132040050X...18.01L4FXXX11.0A
954118801MCC2NC2840918010X...0.01L1GC1C12.0C
\n", + "

95412 rows × 481 columns

\n", + "
" + ], + "text/plain": [ + " ODATEDW OSOURCE TCODE STATE ZIP MAILCODE PVASTATE DOB NOEXCH \\\n", + "0 8901 GRI 0 IL 61081 3712 0 \n", + "1 9401 BOA 1 CA 91326 5202 0 \n", + "2 9001 AMH 1 NC 27017 0 0 \n", + "3 8701 BRY 0 CA 95953 2801 0 \n", + "4 8601 0 FL 33176 2001 0 \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 9601 ASE 1 AK 99504 0 0 \n", + "95408 9601 DCD 1 TX 77379 5001 0 \n", + "95409 9501 MBC 1 MI 48910 3801 0 \n", + "95410 8601 PRV 0 CA 91320 4005 0 \n", + "95411 8801 MCC 2 NC 28409 1801 0 \n", + "\n", + " RECINHSE ... TARGET_D HPHONE_D RFA_2R RFA_2F RFA_2A MDMAUD_R MDMAUD_F \\\n", + "0 ... 0.0 0 L 4 E X X \n", + "1 ... 0.0 0 L 2 G X X \n", + "2 ... 0.0 1 L 4 E X X \n", + "3 ... 0.0 1 L 4 E X X \n", + "4 X ... 0.0 1 L 2 F X X \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 ... 0.0 0 L 1 G X X \n", + "95408 ... 0.0 1 L 1 F X X \n", + "95409 ... 0.0 1 L 3 E X X \n", + "95410 X ... 18.0 1 L 4 F X X \n", + "95411 X ... 0.0 1 L 1 G C 1 \n", + "\n", + " MDMAUD_A CLUSTER2 GEOCODE2 \n", + "0 X 39.0 C \n", + "1 X 1.0 A \n", + "2 X 60.0 C \n", + "3 X 41.0 C \n", + "4 X 26.0 A \n", + "... ... ... ... \n", + "95407 X 12.0 C \n", + "95408 X 2.0 A \n", + "95409 X 34.0 B \n", + "95410 X 11.0 A \n", + "95411 C 12.0 C \n", + "\n", + "[95412 rows x 481 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = pd.read_csv('learningSet.txt')\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "cd08f0a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
column_namenulls_percentage
414RDATE_50.999906
436RAMNT_50.999906
412RDATE_30.997464
434RAMNT_30.997464
413RDATE_40.997055
.........
168ETHC30.000000
167ETHC20.000000
166ETHC10.000000
165HHD120.000000
240TPE110.000000
\n", + "

481 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " column_name nulls_percentage\n", + "414 RDATE_5 0.999906\n", + "436 RAMNT_5 0.999906\n", + "412 RDATE_3 0.997464\n", + "434 RAMNT_3 0.997464\n", + "413 RDATE_4 0.997055\n", + ".. ... ...\n", + "168 ETHC3 0.000000\n", + "167 ETHC2 0.000000\n", + "166 ETHC1 0.000000\n", + "165 HHD12 0.000000\n", + "240 TPE11 0.000000\n", + "\n", + "[481 rows x 2 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Sparcity\n", + "# It would be nice to have a df where I have the column name and then the % missing rows. How can I do that?\n", + "\n", + "# First we check the #of missing rows\n", + "nulls_percent_df = data.isna().sum()\n", + "nulls_percent_df\n", + "\n", + "# Then we calculate the %\n", + "nulls_percent_df = data.isna().sum()/len(data)\n", + "nulls_percent_df\n", + "\n", + "# We put it into a dataframe\n", + "nulls_percent_df = pd.DataFrame(data.isna().sum()/len(data))\n", + "nulls_percent_df\n", + "\n", + "# We reset the index because I have OCD\n", + "nulls_percent_df = pd.DataFrame(data.isna().sum()/len(data)).reset_index()\n", + "nulls_percent_df\n", + "\n", + "# Lets be more concrete\n", + "nulls_percent_df.columns = ['column_name', 'nulls_percentage']\n", + "nulls_percent_df\n", + "\n", + "# Lets be picky\n", + "nulls_percent_df.sort_values(by=['nulls_percentage'], ascending = False)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9bdbfa38", + "metadata": {}, + "outputs": [], + "source": [ + "# Question to students: What can we do when we have missing values?\n", + "\n", + "# First we will start by removing some of the columns that have a high percentage of null values - setting a threshold\n", + "\n", + "threshold = 0.25\n", + "\n", + "condition = nulls_percent_df['nulls_percentage'] > threshold\n", + "columns_above_threshold = nulls_percent_df[condition]\n", + "columns_above_threshold\n", + "\n", + "# How many are they?\n", + 
"len(columns_above_threshold['column_name']) \n", + "\n", + "# Lets put this columns name into a list - Trust me, it's easier\n", + "drop_columns_list = list(columns_above_threshold['column_name'])\n", + "# adding the two columns to be deleted as per exercise\n", + "drop_columns_list.extend(['OSOURCE', 'ZIP'])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "387b6b23", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ODATEDWOSOURCETCODESTATEZIPMAILCODEPVASTATEDOBNOEXCHRECINHSE...TARGET_DHPHONE_DRFA_2RRFA_2FRFA_2AMDMAUD_RMDMAUD_FMDMAUD_ACLUSTER2GEOCODE2
08901GRI0IL6108137120...0.00L4EXXX39.0C
19401BOA1CA9132652020...0.00L2GXXX1.0A
29001AMH1NC2701700...0.01L4EXXX60.0C
38701BRY0CA9595328010...0.01L4EXXX41.0C
486010FL3317620010X...0.01L2FXXX26.0A
..................................................................
954079601ASE1AK9950400...0.00L1GXXX12.0C
954089601DCD1TX7737950010...0.01L1FXXX2.0A
954099501MBC1MI4891038010...0.01L3EXXX34.0B
954108601PRV0CA9132040050X...18.01L4FXXX11.0A
954118801MCC2NC2840918010X...0.01L1GC1C12.0C
\n", + "

95412 rows × 481 columns

\n", + "
" + ], + "text/plain": [ + " ODATEDW OSOURCE TCODE STATE ZIP MAILCODE PVASTATE DOB NOEXCH \\\n", + "0 8901 GRI 0 IL 61081 3712 0 \n", + "1 9401 BOA 1 CA 91326 5202 0 \n", + "2 9001 AMH 1 NC 27017 0 0 \n", + "3 8701 BRY 0 CA 95953 2801 0 \n", + "4 8601 0 FL 33176 2001 0 \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 9601 ASE 1 AK 99504 0 0 \n", + "95408 9601 DCD 1 TX 77379 5001 0 \n", + "95409 9501 MBC 1 MI 48910 3801 0 \n", + "95410 8601 PRV 0 CA 91320 4005 0 \n", + "95411 8801 MCC 2 NC 28409 1801 0 \n", + "\n", + " RECINHSE ... TARGET_D HPHONE_D RFA_2R RFA_2F RFA_2A MDMAUD_R MDMAUD_F \\\n", + "0 ... 0.0 0 L 4 E X X \n", + "1 ... 0.0 0 L 2 G X X \n", + "2 ... 0.0 1 L 4 E X X \n", + "3 ... 0.0 1 L 4 E X X \n", + "4 X ... 0.0 1 L 2 F X X \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 ... 0.0 0 L 1 G X X \n", + "95408 ... 0.0 1 L 1 F X X \n", + "95409 ... 0.0 1 L 3 E X X \n", + "95410 X ... 18.0 1 L 4 F X X \n", + "95411 X ... 0.0 1 L 1 G C 1 \n", + "\n", + " MDMAUD_A CLUSTER2 GEOCODE2 \n", + "0 X 39.0 C \n", + "1 X 1.0 A \n", + "2 X 60.0 C \n", + "3 X 41.0 C \n", + "4 X 26.0 A \n", + "... ... ... ... \n", + "95407 X 12.0 C \n", + "95408 X 2.0 A \n", + "95409 X 34.0 B \n", + "95410 X 11.0 A \n", + "95411 C 12.0 C \n", + "\n", + "[95412 rows x 481 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0adcd07d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ODATEDWTCODESTATEMAILCODEPVASTATEDOBNOEXCHRECINHSERECP3RECPGVG...TARGET_DHPHONE_DRFA_2RRFA_2FRFA_2AMDMAUD_RMDMAUD_FMDMAUD_ACLUSTER2GEOCODE2
089010IL37120...0.00L4EXXX39.0C
194011CA52020...0.00L2GXXX1.0A
290011NC00...0.01L4EXXX60.0C
387010CA28010...0.01L4EXXX41.0C
486010FL20010XX...0.01L2FXXX26.0A
..................................................................
9540796011AK00...0.00L1GXXX12.0C
9540896011TX50010...0.01L1FXXX2.0A
9540995011MI38010X...0.01L3EXXX34.0B
9541086010CA40050X...18.01L4FXXX11.0A
9541188012NC18010XX...0.01L1GC1C12.0C
\n", + "

95412 rows × 407 columns

\n", + "
" + ], + "text/plain": [ + " ODATEDW TCODE STATE MAILCODE PVASTATE DOB NOEXCH RECINHSE RECP3 \\\n", + "0 8901 0 IL 3712 0 \n", + "1 9401 1 CA 5202 0 \n", + "2 9001 1 NC 0 0 \n", + "3 8701 0 CA 2801 0 \n", + "4 8601 0 FL 2001 0 X X \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 9601 1 AK 0 0 \n", + "95408 9601 1 TX 5001 0 \n", + "95409 9501 1 MI 3801 0 X \n", + "95410 8601 0 CA 4005 0 X \n", + "95411 8801 2 NC 1801 0 X \n", + "\n", + " RECPGVG ... TARGET_D HPHONE_D RFA_2R RFA_2F RFA_2A MDMAUD_R MDMAUD_F \\\n", + "0 ... 0.0 0 L 4 E X X \n", + "1 ... 0.0 0 L 2 G X X \n", + "2 ... 0.0 1 L 4 E X X \n", + "3 ... 0.0 1 L 4 E X X \n", + "4 ... 0.0 1 L 2 F X X \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 ... 0.0 0 L 1 G X X \n", + "95408 ... 0.0 1 L 1 F X X \n", + "95409 ... 0.0 1 L 3 E X X \n", + "95410 ... 18.0 1 L 4 F X X \n", + "95411 X ... 0.0 1 L 1 G C 1 \n", + "\n", + " MDMAUD_A CLUSTER2 GEOCODE2 \n", + "0 X 39.0 C \n", + "1 X 1.0 A \n", + "2 X 60.0 C \n", + "3 X 41.0 C \n", + "4 X 26.0 A \n", + "... ... ... ... 
\n", + "95407 X 12.0 C \n", + "95408 X 2.0 A \n", + "95409 X 34.0 B \n", + "95410 X 11.0 A \n", + "95411 C 12.0 C \n", + "\n", + "[95412 rows x 407 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# to remove the selected columns with too much null value\n", + "data = data.drop(columns=drop_columns_list)\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e55b5b7e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GENDER\n", + "F 51277\n", + "M 39094\n", + " 2957\n", + "U 1715\n", + "J 365\n", + "C 2\n", + "A 2\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# Count the number of time a value appear in column Gender\n", + "\n", + "print(data['GENDER'].value_counts())\n", + "data['GENDER'] = data['GENDER'].fillna('F')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6bf35a0b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['F', 'M', ' ', 'C', 'U', 'J', 'A'], dtype=object)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# checking Null Value\n", + "\n", + "data['GENDER'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "cf4eeab3", + "metadata": {}, + "outputs": [], + "source": [ + "# replace value by other if it's not masculin\n", + "data['GENDER'].replace([' ', 'C', 'U', 'J', 'A'], 'Other', inplace=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "9f3334f4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['F', 'M', 'Other'], dtype=object)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# checking that there is \n", + "\n", + "data['GENDER'].unique()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "99d3c02c", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 F\n", + "1 M\n", + "2 M\n", + "3 F\n", + "4 F\n", + " ..\n", + "95407 M\n", + "95408 M\n", + "95409 M\n", + "95410 F\n", + "95411 F\n", + "Name: GENDER, Length: 95412, dtype: object" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['GENDER']" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "2677b4ba", + "metadata": {}, + "outputs": [], + "source": [ + "# Here we will work on cleaning some of the other columns in the dataset using the techniques that we used before in the lessons.\n", + "\n", + "# Check for null values in the numerical columns.\n", + "\n", + "# Use appropriate EDA technique where ever necessary.\n", + "\n", + "# separating Data frame to create nuumerical column\n", + "numerical_columns = data.select_dtypes(include=['int64', 'float64'])\n", + "numerical_columns = data.select_dtypes(include=['int64', 'float64'])\n", + "\n", + "# checking the Null Value \n", + "numerical_columns.isna().sum()\n", + "\n", + "column_na = []\n", + "\n", + "# creating a list to select all column with Null value \n", + "\n", + "for column in numerical_columns.columns :\n", + " if numerical_columns[column].isna().sum() > 0 :\n", + " column_na.append(column)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b8ce3719", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['AGE',\n", + " 'INCOME',\n", + " 'MSA',\n", + " 'ADI',\n", + " 'DMA',\n", + " 'ADATE_3',\n", + " 'ADATE_4',\n", + " 'ADATE_6',\n", + " 'ADATE_7',\n", + " 'ADATE_8',\n", + " 'ADATE_9',\n", + " 'ADATE_11',\n", + " 'ADATE_12',\n", + " 'ADATE_14',\n", + " 'ADATE_16',\n", + " 'ADATE_18',\n", + " 'NEXTDATE',\n", + " 'TIMELAG',\n", + " 'CLUSTER2']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "column_na" + ] + }, + { + "cell_type": "code", + 
"execution_count": 14, + "id": "4e4ee021", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Use appropriate methods to clean the columns GEOCODE2, WEALTH1, ADI, DMA,and MSA.\n", + "\n", + "# looking to the unique value \n", + "data['GEOCODE2'].unique()\n", + "# looking to the number of value \n", + "data['GEOCODE2'].value_counts()\n", + "# checking number of Null values\n", + "data['GEOCODE2'].isnull().sum()\n", + "\n", + "# inputing the mode for the Null value for Geocode \n", + "\n", + "mode_value = data['GEOCODE2'].mode()[0]\n", + "data['GEOCODE2'] = data['GEOCODE2'].fillna(mode_value)\n", + "\n", + "# replacing the space by the mode value \n", + "data['GEOCODE2'].replace(' ', data['GEOCODE2'].mode()[0], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "e158bb65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['C', 'A', 'D', 'B'], dtype=object)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "# chekling number of unique value , Nan and space value have been removed and replace by the mode of data geocode column\n", + "data['GEOCODE2'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "ad75e367", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ODATEDWTCODESTATEMAILCODEPVASTATEDOBNOEXCHRECINHSERECP3RECPGVG...TARGET_DHPHONE_DRFA_2RRFA_2FRFA_2AMDMAUD_RMDMAUD_FMDMAUD_ACLUSTER2GEOCODE2
089010IL37120...0.00L4EXXX39.0C
194011CA52020...0.00L2GXXX1.0A
290011NC00...0.01L4EXXX60.0C
387010CA28010...0.01L4EXXX41.0C
486010FL20010XX...0.01L2FXXX26.0A
..................................................................
9540796011AK00...0.00L1GXXX12.0C
9540896011TX50010...0.01L1FXXX2.0A
9540995011MI38010X...0.01L3EXXX34.0B
9541086010CA40050X...18.01L4FXXX11.0A
9541188012NC18010XX...0.01L1GC1C12.0C
\n", + "

95412 rows × 384 columns

\n", + "
" + ], + "text/plain": [ + " ODATEDW TCODE STATE MAILCODE PVASTATE DOB NOEXCH RECINHSE RECP3 \\\n", + "0 8901 0 IL 3712 0 \n", + "1 9401 1 CA 5202 0 \n", + "2 9001 1 NC 0 0 \n", + "3 8701 0 CA 2801 0 \n", + "4 8601 0 FL 2001 0 X X \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 9601 1 AK 0 0 \n", + "95408 9601 1 TX 5001 0 \n", + "95409 9501 1 MI 3801 0 X \n", + "95410 8601 0 CA 4005 0 X \n", + "95411 8801 2 NC 1801 0 X \n", + "\n", + " RECPGVG ... TARGET_D HPHONE_D RFA_2R RFA_2F RFA_2A MDMAUD_R MDMAUD_F \\\n", + "0 ... 0.0 0 L 4 E X X \n", + "1 ... 0.0 0 L 2 G X X \n", + "2 ... 0.0 1 L 4 E X X \n", + "3 ... 0.0 1 L 4 E X X \n", + "4 ... 0.0 1 L 2 F X X \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 ... 0.0 0 L 1 G X X \n", + "95408 ... 0.0 1 L 1 F X X \n", + "95409 ... 0.0 1 L 3 E X X \n", + "95410 ... 18.0 1 L 4 F X X \n", + "95411 X ... 0.0 1 L 1 G C 1 \n", + "\n", + " MDMAUD_A CLUSTER2 GEOCODE2 \n", + "0 X 39.0 C \n", + "1 X 1.0 A \n", + "2 X 60.0 C \n", + "3 X 41.0 C \n", + "4 X 26.0 A \n", + "... ... ... ... \n", + "95407 X 12.0 C \n", + "95408 X 2.0 A \n", + "95409 X 34.0 B \n", + "95410 X 11.0 A \n", + "95411 C 12.0 C \n", + "\n", + "[95412 rows x 384 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "# dropping not useful column for more clarity\n", + "# Understand the values of RFA_2\n", + "data['RFA_2'].value_counts()\n", + "\n", + "# We will keep the columns with RFA_2 information. 
We will delete the rest of the RFA columns\n", + "drop_list = []\n", + "\n", + "for col_name in data.columns:\n", + " if \"RFA\" in col_name:\n", + " drop_list.append(col_name)\n", + "drop_list\n", + "\n", + "# Keep RFA_2R, RFA_2A and RFA_2F by removing them from the drop list\n", + "drop_list.remove('RFA_2R')\n", + "drop_list.remove('RFA_2A')\n", + "drop_list.remove('RFA_2F')\n", + "drop_list\n", + "\n", + "data = data.drop(columns=drop_list)\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "cadfe109", + "metadata": {}, + "outputs": [], + "source": [ + "# filling NaN values with the mode of the column ADI\n", + "\n", + "data['ADI']\n", + "\n", + "mode_value = data['ADI'].mode()[0]\n", + "data['ADI'] = data['ADI'].fillna(mode_value)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "516f81d1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# checking the unique values\n", + "\n", + "data['ADI'].unique()\n", + "\n", + "# checking for NaN values\n", + "\n", + "data['ADI'].isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "ced4bf63", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "132\n", + "132\n" + ] + } + ], + "source": [ + "# next, clean the column DMA\n", + "\n", + "# checking for NaN values first\n", + "\n", + "print(data['DMA'].isna().sum())\n", + "\n", + "\n", + "# same for the MSA column\n", + "\n", + "print(data['MSA'].isna().sum())\n", + "\n", + "\n", + "# Only a really small number of null values, so we use the mode to fill the NaN values, as it won't change the feature much" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "ced1f08d", + "metadata": {}, + "outputs": [], + "source": [ + "# fill the NaN values in the column DMA and see the results\n", + "\n", + "mode_value = data['DMA'].mode()[0]\n", + "data['DMA'] = 
data['DMA'].fillna(mode_value)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "d555c7cc", + "metadata": {}, + "outputs": [], + "source": [ + "# same with the column MSA\n", + "\n", + "mode_value = data['MSA'].mode()[0]\n", + "data['MSA'] = data['MSA'].fillna(mode_value)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "01574b4e", + "metadata": {}, + "outputs": [], + "source": [ + "# The column WEALTH1 has already been dropped because the threshold allows at most 25% of a column's values to be null\n", + "\n", + "# and WEALTH1 was above this threshold, meaning more than 25% of its values were null, which is not suitable for building a good model" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "871eb0f7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ODATEDWTCODESTATEMAILCODEPVASTATEDOBNOEXCHRECINHSERECP3RECPGVG...TARGET_DHPHONE_DRFA_2RRFA_2FRFA_2AMDMAUD_RMDMAUD_FMDMAUD_ACLUSTER2GEOCODE2
089010IL37120...0.00L4EXXX39.0C
194011CA52020...0.00L2GXXX1.0A
290011NC00...0.01L4EXXX60.0C
387010CA28010...0.01L4EXXX41.0C
486010FL20010XX...0.01L2FXXX26.0A
..................................................................
9540796011AK00...0.00L1GXXX12.0C
9540896011TX50010...0.01L1FXXX2.0A
9540995011MI38010X...0.01L3EXXX34.0B
9541086010CA40050X...18.01L4FXXX11.0A
9541188012NC18010XX...0.01L1GC1C12.0C
\n", + "

95412 rows × 384 columns

\n", + "
" + ], + "text/plain": [ + " ODATEDW TCODE STATE MAILCODE PVASTATE DOB NOEXCH RECINHSE RECP3 \\\n", + "0 8901 0 IL 3712 0 \n", + "1 9401 1 CA 5202 0 \n", + "2 9001 1 NC 0 0 \n", + "3 8701 0 CA 2801 0 \n", + "4 8601 0 FL 2001 0 X X \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 9601 1 AK 0 0 \n", + "95408 9601 1 TX 5001 0 \n", + "95409 9501 1 MI 3801 0 X \n", + "95410 8601 0 CA 4005 0 X \n", + "95411 8801 2 NC 1801 0 X \n", + "\n", + " RECPGVG ... TARGET_D HPHONE_D RFA_2R RFA_2F RFA_2A MDMAUD_R MDMAUD_F \\\n", + "0 ... 0.0 0 L 4 E X X \n", + "1 ... 0.0 0 L 2 G X X \n", + "2 ... 0.0 1 L 4 E X X \n", + "3 ... 0.0 1 L 4 E X X \n", + "4 ... 0.0 1 L 2 F X X \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 ... 0.0 0 L 1 G X X \n", + "95408 ... 0.0 1 L 1 F X X \n", + "95409 ... 0.0 1 L 3 E X X \n", + "95410 ... 18.0 1 L 4 F X X \n", + "95411 X ... 0.0 1 L 1 G C 1 \n", + "\n", + " MDMAUD_A CLUSTER2 GEOCODE2 \n", + "0 X 39.0 C \n", + "1 X 1.0 A \n", + "2 X 60.0 C \n", + "3 X 41.0 C \n", + "4 X 26.0 A \n", + "... ... ... ... 
\n", + "95407 X 12.0 C \n", + "95408 X 2.0 A \n", + "95409 X 34.0 B \n", + "95410 X 11.0 A \n", + "95411 C 12.0 C \n", + "\n", + "[95412 rows x 384 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# checking data now \n", + "\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e6a96b5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1c617a4", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c52b1343", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}