diff --git a/lab-feature-engineering.ipynb b/lab-feature-engineering.ipynb
new file mode 100644
index 0000000..9b542e7
--- /dev/null
+++ b/lab-feature-engineering.ipynb
@@ -0,0 +1,3104 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 244,
+ "id": "e1aeb6d7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "import warnings\n",
+ "warnings.filterwarnings ('ignore')\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "%matplotlib inline\n",
+ "\n",
+ "import seaborn as sns\n",
+ "\n",
+ "import statsmodels.api as sm\n",
+ "from statsmodels.formula.api import ols"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 245,
+ "id": "150bf899",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ODATEDW | \n",
+ " OSOURCE | \n",
+ " TCODE | \n",
+ " STATE | \n",
+ " ZIP | \n",
+ " MAILCODE | \n",
+ " PVASTATE | \n",
+ " DOB | \n",
+ " NOEXCH | \n",
+ " RECINHSE | \n",
+ " ... | \n",
+ " TARGET_D | \n",
+ " HPHONE_D | \n",
+ " RFA_2R | \n",
+ " RFA_2F | \n",
+ " RFA_2A | \n",
+ " MDMAUD_R | \n",
+ " MDMAUD_F | \n",
+ " MDMAUD_A | \n",
+ " CLUSTER2 | \n",
+ " GEOCODE2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 8901 | \n",
+ " GRI | \n",
+ " 0 | \n",
+ " IL | \n",
+ " 61081 | \n",
+ " | \n",
+ " | \n",
+ " 3712 | \n",
+ " 0 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " L | \n",
+ " 4 | \n",
+ " E | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 39.0 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 9401 | \n",
+ " BOA | \n",
+ " 1 | \n",
+ " CA | \n",
+ " 91326 | \n",
+ " | \n",
+ " | \n",
+ " 5202 | \n",
+ " 0 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " L | \n",
+ " 2 | \n",
+ " G | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 1.0 | \n",
+ " A | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 9001 | \n",
+ " AMH | \n",
+ " 1 | \n",
+ " NC | \n",
+ " 27017 | \n",
+ " | \n",
+ " | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ " L | \n",
+ " 4 | \n",
+ " E | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 60.0 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 8701 | \n",
+ " BRY | \n",
+ " 0 | \n",
+ " CA | \n",
+ " 95953 | \n",
+ " | \n",
+ " | \n",
+ " 2801 | \n",
+ " 0 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ " L | \n",
+ " 4 | \n",
+ " E | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 41.0 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 8601 | \n",
+ " | \n",
+ " 0 | \n",
+ " FL | \n",
+ " 33176 | \n",
+ " | \n",
+ " | \n",
+ " 2001 | \n",
+ " 0 | \n",
+ " X | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ " L | \n",
+ " 2 | \n",
+ " F | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 26.0 | \n",
+ " A | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 95407 | \n",
+ " 9601 | \n",
+ " ASE | \n",
+ " 1 | \n",
+ " AK | \n",
+ " 99504 | \n",
+ " | \n",
+ " | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " L | \n",
+ " 1 | \n",
+ " G | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 12.0 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 95408 | \n",
+ " 9601 | \n",
+ " DCD | \n",
+ " 1 | \n",
+ " TX | \n",
+ " 77379 | \n",
+ " | \n",
+ " | \n",
+ " 5001 | \n",
+ " 0 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ " L | \n",
+ " 1 | \n",
+ " F | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 2.0 | \n",
+ " A | \n",
+ "
\n",
+ " \n",
+ " | 95409 | \n",
+ " 9501 | \n",
+ " MBC | \n",
+ " 1 | \n",
+ " MI | \n",
+ " 48910 | \n",
+ " | \n",
+ " | \n",
+ " 3801 | \n",
+ " 0 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ " L | \n",
+ " 3 | \n",
+ " E | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 34.0 | \n",
+ " B | \n",
+ "
\n",
+ " \n",
+ " | 95410 | \n",
+ " 8601 | \n",
+ " PRV | \n",
+ " 0 | \n",
+ " CA | \n",
+ " 91320 | \n",
+ " | \n",
+ " | \n",
+ " 4005 | \n",
+ " 0 | \n",
+ " X | \n",
+ " ... | \n",
+ " 18.0 | \n",
+ " 1 | \n",
+ " L | \n",
+ " 4 | \n",
+ " F | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 11.0 | \n",
+ " A | \n",
+ "
\n",
+ " \n",
+ " | 95411 | \n",
+ " 8801 | \n",
+ " MCC | \n",
+ " 2 | \n",
+ " NC | \n",
+ " 28409 | \n",
+ " | \n",
+ " | \n",
+ " 1801 | \n",
+ " 0 | \n",
+ " X | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ " L | \n",
+ " 1 | \n",
+ " G | \n",
+ " C | \n",
+ " 1 | \n",
+ " C | \n",
+ " 12.0 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
95412 rows × 481 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ODATEDW OSOURCE TCODE STATE ZIP MAILCODE PVASTATE DOB NOEXCH \\\n",
+ "0 8901 GRI 0 IL 61081 3712 0 \n",
+ "1 9401 BOA 1 CA 91326 5202 0 \n",
+ "2 9001 AMH 1 NC 27017 0 0 \n",
+ "3 8701 BRY 0 CA 95953 2801 0 \n",
+ "4 8601 0 FL 33176 2001 0 \n",
+ "... ... ... ... ... ... ... ... ... ... \n",
+ "95407 9601 ASE 1 AK 99504 0 0 \n",
+ "95408 9601 DCD 1 TX 77379 5001 0 \n",
+ "95409 9501 MBC 1 MI 48910 3801 0 \n",
+ "95410 8601 PRV 0 CA 91320 4005 0 \n",
+ "95411 8801 MCC 2 NC 28409 1801 0 \n",
+ "\n",
+ " RECINHSE ... TARGET_D HPHONE_D RFA_2R RFA_2F RFA_2A MDMAUD_R MDMAUD_F \\\n",
+ "0 ... 0.0 0 L 4 E X X \n",
+ "1 ... 0.0 0 L 2 G X X \n",
+ "2 ... 0.0 1 L 4 E X X \n",
+ "3 ... 0.0 1 L 4 E X X \n",
+ "4 X ... 0.0 1 L 2 F X X \n",
+ "... ... ... ... ... ... ... ... ... ... \n",
+ "95407 ... 0.0 0 L 1 G X X \n",
+ "95408 ... 0.0 1 L 1 F X X \n",
+ "95409 ... 0.0 1 L 3 E X X \n",
+ "95410 X ... 18.0 1 L 4 F X X \n",
+ "95411 X ... 0.0 1 L 1 G C 1 \n",
+ "\n",
+ " MDMAUD_A CLUSTER2 GEOCODE2 \n",
+ "0 X 39.0 C \n",
+ "1 X 1.0 A \n",
+ "2 X 60.0 C \n",
+ "3 X 41.0 C \n",
+ "4 X 26.0 A \n",
+ "... ... ... ... \n",
+ "95407 X 12.0 C \n",
+ "95408 X 2.0 A \n",
+ "95409 X 34.0 B \n",
+ "95410 X 11.0 A \n",
+ "95411 C 12.0 C \n",
+ "\n",
+ "[95412 rows x 481 columns]"
+ ]
+ },
+ "execution_count": 245,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load the raw learning set; the bare name on the last line renders a preview of the frame.\n",
+ "data = pd.read_csv(filepath_or_buffer='learningSet.txt')\n",
+ "data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "498bfe98",
+ "metadata": {},
+ "source": [
+ "#### Check for null values in all the columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 246,
+ "id": "cbe29b65",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "ODATEDW 0\n",
+ "OSOURCE 0\n",
+ "TCODE 0\n",
+ "STATE 0\n",
+ "ZIP 0\n",
+ " ... \n",
+ "MDMAUD_R 0\n",
+ "MDMAUD_F 0\n",
+ "MDMAUD_A 0\n",
+ "CLUSTER2 132\n",
+ "GEOCODE2 132\n",
+ "Length: 481, dtype: int64"
+ ]
+ },
+ "execution_count": 246,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Number of missing entries per column.\n",
+ "# Many columns may contain NaNs, so dropping every incomplete row could remove\n",
+ "# basically all rows; an option is to *remove the columns* with the most missing values instead.\n",
+ "data.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "98b12828",
+ "metadata": {},
+ "source": [
+ "#### Identify columns that have over 85% missing values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 247,
+ "id": "2069937d",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "ODATEDW 0.000000\n",
+ "OSOURCE 0.000000\n",
+ "TCODE 0.000000\n",
+ "STATE 0.000000\n",
+ "ZIP 0.000000\n",
+ " ... \n",
+ "MDMAUD_R 0.000000\n",
+ "MDMAUD_F 0.000000\n",
+ "MDMAUD_A 0.000000\n",
+ "CLUSTER2 0.001383\n",
+ "GEOCODE2 0.001383\n",
+ "Length: 481, dtype: float64"
+ ]
+ },
+ "execution_count": 247,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Share of missing values per column: the mean of the boolean NaN mask equals count / rows.\n",
+ "# We need to establish a percentage of NAs above which a column is dropped.\n",
+ "null_df = data.isna().mean()\n",
+ "null_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 248,
+ "id": "5dd5e223",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " column_names | \n",
+ " percentage_of_nulls | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " ODATEDW | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " OSOURCE | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " TCODE | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " STATE | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " ZIP | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 476 | \n",
+ " MDMAUD_R | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 477 | \n",
+ " MDMAUD_F | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 478 | \n",
+ " MDMAUD_A | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 479 | \n",
+ " CLUSTER2 | \n",
+ " 0.001383 | \n",
+ "
\n",
+ " \n",
+ " | 480 | \n",
+ " GEOCODE2 | \n",
+ " 0.001383 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
481 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " column_names percentage_of_nulls\n",
+ "0 ODATEDW 0.000000\n",
+ "1 OSOURCE 0.000000\n",
+ "2 TCODE 0.000000\n",
+ "3 STATE 0.000000\n",
+ "4 ZIP 0.000000\n",
+ ".. ... ...\n",
+ "476 MDMAUD_R 0.000000\n",
+ "477 MDMAUD_F 0.000000\n",
+ "478 MDMAUD_A 0.000000\n",
+ "479 CLUSTER2 0.001383\n",
+ "480 GEOCODE2 0.001383\n",
+ "\n",
+ "[481 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 248,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Turn the Series into a two-column frame: the column label and its share of nulls.\n",
+ "null_df = pd.DataFrame({\n",
+ "    'column_names': null_df.index,\n",
+ "    'percentage_of_nulls': null_df.values,\n",
+ "})\n",
+ "null_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 249,
+ "id": "7e27ac51",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " column_names | \n",
+ " percentage_of_nulls | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 414 | \n",
+ " RDATE_5 | \n",
+ " 0.999906 | \n",
+ "
\n",
+ " \n",
+ " | 436 | \n",
+ " RAMNT_5 | \n",
+ " 0.999906 | \n",
+ "
\n",
+ " \n",
+ " | 412 | \n",
+ " RDATE_3 | \n",
+ " 0.997464 | \n",
+ "
\n",
+ " \n",
+ " | 434 | \n",
+ " RAMNT_3 | \n",
+ " 0.997464 | \n",
+ "
\n",
+ " \n",
+ " | 413 | \n",
+ " RDATE_4 | \n",
+ " 0.997055 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 168 | \n",
+ " ETHC3 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 167 | \n",
+ " ETHC2 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 166 | \n",
+ " ETHC1 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 165 | \n",
+ " HHD12 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 240 | \n",
+ " TPE11 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
481 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " column_names percentage_of_nulls\n",
+ "414 RDATE_5 0.999906\n",
+ "436 RAMNT_5 0.999906\n",
+ "412 RDATE_3 0.997464\n",
+ "434 RAMNT_3 0.997464\n",
+ "413 RDATE_4 0.997055\n",
+ ".. ... ...\n",
+ "168 ETHC3 0.000000\n",
+ "167 ETHC2 0.000000\n",
+ "166 ETHC1 0.000000\n",
+ "165 HHD12 0.000000\n",
+ "240 TPE11 0.000000\n",
+ "\n",
+ "[481 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 249,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Order the columns from the highest share of nulls to the lowest.\n",
+ "null_df = null_df.sort_values('percentage_of_nulls', ascending=False)\n",
+ "null_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 250,
+ "id": "2b8e862f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " column_names | \n",
+ " percentage_of_nulls | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 414 | \n",
+ " RDATE_5 | \n",
+ " 0.999906 | \n",
+ "
\n",
+ " \n",
+ " | 436 | \n",
+ " RAMNT_5 | \n",
+ " 0.999906 | \n",
+ "
\n",
+ " \n",
+ " | 412 | \n",
+ " RDATE_3 | \n",
+ " 0.997464 | \n",
+ "
\n",
+ " \n",
+ " | 434 | \n",
+ " RAMNT_3 | \n",
+ " 0.997464 | \n",
+ "
\n",
+ " \n",
+ " | 413 | \n",
+ " RDATE_4 | \n",
+ " 0.997055 | \n",
+ "
\n",
+ " \n",
+ " | 435 | \n",
+ " RAMNT_4 | \n",
+ " 0.997055 | \n",
+ "
\n",
+ " \n",
+ " | 437 | \n",
+ " RAMNT_6 | \n",
+ " 0.991867 | \n",
+ "
\n",
+ " \n",
+ " | 415 | \n",
+ " RDATE_6 | \n",
+ " 0.991867 | \n",
+ "
\n",
+ " \n",
+ " | 446 | \n",
+ " RAMNT_15 | \n",
+ " 0.923888 | \n",
+ "
\n",
+ " \n",
+ " | 424 | \n",
+ " RDATE_15 | \n",
+ " 0.923888 | \n",
+ "
\n",
+ " \n",
+ " | 432 | \n",
+ " RDATE_23 | \n",
+ " 0.917631 | \n",
+ "
\n",
+ " \n",
+ " | 454 | \n",
+ " RAMNT_23 | \n",
+ " 0.917631 | \n",
+ "
\n",
+ " \n",
+ " | 429 | \n",
+ " RDATE_20 | \n",
+ " 0.917327 | \n",
+ "
\n",
+ " \n",
+ " | 451 | \n",
+ " RAMNT_20 | \n",
+ " 0.917327 | \n",
+ "
\n",
+ " \n",
+ " | 438 | \n",
+ " RAMNT_7 | \n",
+ " 0.906773 | \n",
+ "
\n",
+ " \n",
+ " | 416 | \n",
+ " RDATE_7 | \n",
+ " 0.906773 | \n",
+ "
\n",
+ " \n",
+ " | 448 | \n",
+ " RAMNT_17 | \n",
+ " 0.901469 | \n",
+ "
\n",
+ " \n",
+ " | 426 | \n",
+ " RDATE_17 | \n",
+ " 0.901469 | \n",
+ "
\n",
+ " \n",
+ " | 430 | \n",
+ " RDATE_21 | \n",
+ " 0.900296 | \n",
+ "
\n",
+ " \n",
+ " | 452 | \n",
+ " RAMNT_21 | \n",
+ " 0.900296 | \n",
+ "
\n",
+ " \n",
+ " | 441 | \n",
+ " RAMNT_10 | \n",
+ " 0.890360 | \n",
+ "
\n",
+ " \n",
+ " | 419 | \n",
+ " RDATE_10 | \n",
+ " 0.890360 | \n",
+ "
\n",
+ " \n",
+ " | 422 | \n",
+ " RDATE_13 | \n",
+ " 0.871609 | \n",
+ "
\n",
+ " \n",
+ " | 444 | \n",
+ " RAMNT_13 | \n",
+ " 0.871609 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " NUMCHLD | \n",
+ " 0.870184 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " column_names percentage_of_nulls\n",
+ "414 RDATE_5 0.999906\n",
+ "436 RAMNT_5 0.999906\n",
+ "412 RDATE_3 0.997464\n",
+ "434 RAMNT_3 0.997464\n",
+ "413 RDATE_4 0.997055\n",
+ "435 RAMNT_4 0.997055\n",
+ "437 RAMNT_6 0.991867\n",
+ "415 RDATE_6 0.991867\n",
+ "446 RAMNT_15 0.923888\n",
+ "424 RDATE_15 0.923888\n",
+ "432 RDATE_23 0.917631\n",
+ "454 RAMNT_23 0.917631\n",
+ "429 RDATE_20 0.917327\n",
+ "451 RAMNT_20 0.917327\n",
+ "438 RAMNT_7 0.906773\n",
+ "416 RDATE_7 0.906773\n",
+ "448 RAMNT_17 0.901469\n",
+ "426 RDATE_17 0.901469\n",
+ "430 RDATE_21 0.900296\n",
+ "452 RAMNT_21 0.900296\n",
+ "441 RAMNT_10 0.890360\n",
+ "419 RDATE_10 0.890360\n",
+ "422 RDATE_13 0.871609\n",
+ "444 RAMNT_13 0.871609\n",
+ "23 NUMCHLD 0.870184"
+ ]
+ },
+ "execution_count": 250,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Keep only the columns whose share of nulls is strictly above the cut-off.\n",
+ "threshold = 0.85\n",
+ "condition = null_df['percentage_of_nulls'].gt(threshold)\n",
+ "columns_above_threshold = null_df.loc[condition]\n",
+ "columns_above_threshold"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8416a5d2",
+ "metadata": {},
+ "source": [
+ "Remove those columns from the dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 251,
+ "id": "25aa7d8e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Names of the columns flagged as mostly empty in the previous step.\n",
+ "drop_column_list = columns_above_threshold['column_names'].tolist()\n",
+ "# errors='ignore' keeps this cell re-runnable: on a second run the columns are\n",
+ "# already gone and a plain drop would raise KeyError.\n",
+ "data = data.drop(columns=drop_column_list, errors='ignore')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 252,
+ "id": "38744dde",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ODATEDW | \n",
+ " OSOURCE | \n",
+ " TCODE | \n",
+ " STATE | \n",
+ " ZIP | \n",
+ " MAILCODE | \n",
+ " PVASTATE | \n",
+ " DOB | \n",
+ " NOEXCH | \n",
+ " RECINHSE | \n",
+ " ... | \n",
+ " TARGET_D | \n",
+ " HPHONE_D | \n",
+ " RFA_2R | \n",
+ " RFA_2F | \n",
+ " RFA_2A | \n",
+ " MDMAUD_R | \n",
+ " MDMAUD_F | \n",
+ " MDMAUD_A | \n",
+ " CLUSTER2 | \n",
+ " GEOCODE2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 8901 | \n",
+ " GRI | \n",
+ " 0 | \n",
+ " IL | \n",
+ " 61081 | \n",
+ " | \n",
+ " | \n",
+ " 3712 | \n",
+ " 0 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " L | \n",
+ " 4 | \n",
+ " E | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 39.0 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 9401 | \n",
+ " BOA | \n",
+ " 1 | \n",
+ " CA | \n",
+ " 91326 | \n",
+ " | \n",
+ " | \n",
+ " 5202 | \n",
+ " 0 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " L | \n",
+ " 2 | \n",
+ " G | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 1.0 | \n",
+ " A | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 9001 | \n",
+ " AMH | \n",
+ " 1 | \n",
+ " NC | \n",
+ " 27017 | \n",
+ " | \n",
+ " | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ " L | \n",
+ " 4 | \n",
+ " E | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 60.0 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 8701 | \n",
+ " BRY | \n",
+ " 0 | \n",
+ " CA | \n",
+ " 95953 | \n",
+ " | \n",
+ " | \n",
+ " 2801 | \n",
+ " 0 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ " L | \n",
+ " 4 | \n",
+ " E | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 41.0 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 8601 | \n",
+ " | \n",
+ " 0 | \n",
+ " FL | \n",
+ " 33176 | \n",
+ " | \n",
+ " | \n",
+ " 2001 | \n",
+ " 0 | \n",
+ " X | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ " L | \n",
+ " 2 | \n",
+ " F | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 26.0 | \n",
+ " A | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 95407 | \n",
+ " 9601 | \n",
+ " ASE | \n",
+ " 1 | \n",
+ " AK | \n",
+ " 99504 | \n",
+ " | \n",
+ " | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " L | \n",
+ " 1 | \n",
+ " G | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 12.0 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 95408 | \n",
+ " 9601 | \n",
+ " DCD | \n",
+ " 1 | \n",
+ " TX | \n",
+ " 77379 | \n",
+ " | \n",
+ " | \n",
+ " 5001 | \n",
+ " 0 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ " L | \n",
+ " 1 | \n",
+ " F | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 2.0 | \n",
+ " A | \n",
+ "
\n",
+ " \n",
+ " | 95409 | \n",
+ " 9501 | \n",
+ " MBC | \n",
+ " 1 | \n",
+ " MI | \n",
+ " 48910 | \n",
+ " | \n",
+ " | \n",
+ " 3801 | \n",
+ " 0 | \n",
+ " | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ " L | \n",
+ " 3 | \n",
+ " E | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 34.0 | \n",
+ " B | \n",
+ "
\n",
+ " \n",
+ " | 95410 | \n",
+ " 8601 | \n",
+ " PRV | \n",
+ " 0 | \n",
+ " CA | \n",
+ " 91320 | \n",
+ " | \n",
+ " | \n",
+ " 4005 | \n",
+ " 0 | \n",
+ " X | \n",
+ " ... | \n",
+ " 18.0 | \n",
+ " 1 | \n",
+ " L | \n",
+ " 4 | \n",
+ " F | \n",
+ " X | \n",
+ " X | \n",
+ " X | \n",
+ " 11.0 | \n",
+ " A | \n",
+ "
\n",
+ " \n",
+ " | 95411 | \n",
+ " 8801 | \n",
+ " MCC | \n",
+ " 2 | \n",
+ " NC | \n",
+ " 28409 | \n",
+ " | \n",
+ " | \n",
+ " 1801 | \n",
+ " 0 | \n",
+ " X | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ " L | \n",
+ " 1 | \n",
+ " G | \n",
+ " C | \n",
+ " 1 | \n",
+ " C | \n",
+ " 12.0 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
95412 rows × 456 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ODATEDW OSOURCE TCODE STATE ZIP MAILCODE PVASTATE DOB NOEXCH \\\n",
+ "0 8901 GRI 0 IL 61081 3712 0 \n",
+ "1 9401 BOA 1 CA 91326 5202 0 \n",
+ "2 9001 AMH 1 NC 27017 0 0 \n",
+ "3 8701 BRY 0 CA 95953 2801 0 \n",
+ "4 8601 0 FL 33176 2001 0 \n",
+ "... ... ... ... ... ... ... ... ... ... \n",
+ "95407 9601 ASE 1 AK 99504 0 0 \n",
+ "95408 9601 DCD 1 TX 77379 5001 0 \n",
+ "95409 9501 MBC 1 MI 48910 3801 0 \n",
+ "95410 8601 PRV 0 CA 91320 4005 0 \n",
+ "95411 8801 MCC 2 NC 28409 1801 0 \n",
+ "\n",
+ " RECINHSE ... TARGET_D HPHONE_D RFA_2R RFA_2F RFA_2A MDMAUD_R MDMAUD_F \\\n",
+ "0 ... 0.0 0 L 4 E X X \n",
+ "1 ... 0.0 0 L 2 G X X \n",
+ "2 ... 0.0 1 L 4 E X X \n",
+ "3 ... 0.0 1 L 4 E X X \n",
+ "4 X ... 0.0 1 L 2 F X X \n",
+ "... ... ... ... ... ... ... ... ... ... \n",
+ "95407 ... 0.0 0 L 1 G X X \n",
+ "95408 ... 0.0 1 L 1 F X X \n",
+ "95409 ... 0.0 1 L 3 E X X \n",
+ "95410 X ... 18.0 1 L 4 F X X \n",
+ "95411 X ... 0.0 1 L 1 G C 1 \n",
+ "\n",
+ " MDMAUD_A CLUSTER2 GEOCODE2 \n",
+ "0 X 39.0 C \n",
+ "1 X 1.0 A \n",
+ "2 X 60.0 C \n",
+ "3 X 41.0 C \n",
+ "4 X 26.0 A \n",
+ "... ... ... ... \n",
+ "95407 X 12.0 C \n",
+ "95408 X 2.0 A \n",
+ "95409 X 34.0 B \n",
+ "95410 X 11.0 A \n",
+ "95411 C 12.0 C \n",
+ "\n",
+ "[95412 rows x 456 columns]"
+ ]
+ },
+ "execution_count": 252,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data\n",
+ "# dropped from 481 to 456 columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e9bbb0d7",
+ "metadata": {},
+ "source": [
+ "#### Categorical columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 253,
+ "id": "8e3f1b32",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['OSOURCE',\n",
+ " 'STATE',\n",
+ " 'ZIP',\n",
+ " 'MAILCODE',\n",
+ " 'PVASTATE',\n",
+ " 'NOEXCH',\n",
+ " 'RECINHSE',\n",
+ " 'RECP3',\n",
+ " 'RECPGVG',\n",
+ " 'RECSWEEP',\n",
+ " 'MDMAUD',\n",
+ " 'DOMAIN',\n",
+ " 'CLUSTER',\n",
+ " 'AGEFLAG',\n",
+ " 'HOMEOWNR',\n",
+ " 'CHILD03',\n",
+ " 'CHILD07',\n",
+ " 'CHILD12',\n",
+ " 'CHILD18',\n",
+ " 'GENDER',\n",
+ " 'DATASRCE',\n",
+ " 'SOLP3',\n",
+ " 'SOLIH',\n",
+ " 'MAJOR',\n",
+ " 'GEOCODE',\n",
+ " 'COLLECT1',\n",
+ " 'VETERANS',\n",
+ " 'BIBLE',\n",
+ " 'CATLG',\n",
+ " 'HOMEE',\n",
+ " 'PETS',\n",
+ " 'CDPLAY',\n",
+ " 'STEREO',\n",
+ " 'PCOWNERS',\n",
+ " 'PHOTO',\n",
+ " 'CRAFTS',\n",
+ " 'FISHER',\n",
+ " 'GARDENIN',\n",
+ " 'BOATS',\n",
+ " 'WALKER',\n",
+ " 'KIDSTUFF',\n",
+ " 'CARDS',\n",
+ " 'PLATES',\n",
+ " 'LIFESRC',\n",
+ " 'PEPSTRFL',\n",
+ " 'RFA_2',\n",
+ " 'RFA_3',\n",
+ " 'RFA_4',\n",
+ " 'RFA_5',\n",
+ " 'RFA_6',\n",
+ " 'RFA_7',\n",
+ " 'RFA_8',\n",
+ " 'RFA_9',\n",
+ " 'RFA_10',\n",
+ " 'RFA_11',\n",
+ " 'RFA_12',\n",
+ " 'RFA_13',\n",
+ " 'RFA_14',\n",
+ " 'RFA_15',\n",
+ " 'RFA_16',\n",
+ " 'RFA_17',\n",
+ " 'RFA_18',\n",
+ " 'RFA_19',\n",
+ " 'RFA_20',\n",
+ " 'RFA_21',\n",
+ " 'RFA_22',\n",
+ " 'RFA_23',\n",
+ " 'RFA_24',\n",
+ " 'RFA_2R',\n",
+ " 'RFA_2A',\n",
+ " 'MDMAUD_R',\n",
+ " 'MDMAUD_F',\n",
+ " 'MDMAUD_A',\n",
+ " 'GEOCODE2']"
+ ]
+ },
+ "execution_count": 253,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Every column whose dtype is not numeric is treated as categorical.\n",
+ "categorical_columns = list(data.select_dtypes(exclude='number').columns)\n",
+ "categorical_columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "66161dec",
+ "metadata": {},
+ "source": [
+ "Create a new empty list called drop_list. We will append column names to this list and then drop all the columns in it later:\n",
+ "\n",
+ "OSOURCE - symbol definitions not provided, too many categories\n",
+ "\n",
+ "ZIP CODE - we are including state already"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 254,
+ "id": "de53d72d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Columns excluded for the reasons given in the notes above this cell.\n",
+ "drop_list = ['OSOURCE', 'ZIP']\n",
+ "# Reassign instead of using inplace=True, and ignore labels that are already gone,\n",
+ "# so that re-running this cell does not raise KeyError.\n",
+ "data = data.drop(columns=drop_list, errors='ignore')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "00911ba5",
+ "metadata": {},
+ "source": [
+ "#### Reduce the number of categories in the column GENDER. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3bbd4f17",
+ "metadata": {},
+ "source": [
+ "The column should contain only \"M\" for males, \"F\" for females, and \"other\" for all the rest."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 255,
+ "id": "9e3a5c53",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "GENDER\n",
+ "F 51277\n",
+ "M 39094\n",
+ " 2957\n",
+ "U 1715\n",
+ "J 365\n",
+ "C 2\n",
+ "A 2\n",
+ "Name: count, dtype: int64\n",
+ "0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Category frequencies first, then the number of missing entries.\n",
+ "print(data['GENDER'].value_counts(), data['GENDER'].isna().sum(), sep='\\n')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 256,
+ "id": "c7fce732",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# Collapse GENDER to 'M', 'F' or 'other'. Anything that is not exactly 'M' or 'F'\n",
+ "# (blank, 'U', 'J', 'C', 'A', or a missing value) becomes 'other'. Missing values are\n",
+ "# deliberately not imputed as 'F' first: that would contradict the rule stated above\n",
+ "# and inflate the female count.\n",
+ "data['GENDER'] = data['GENDER'].apply(lambda code: code if code in ('M', 'F') else 'other')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 257,
+ "id": "2aea059b",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "GENDER\n",
+ "F 51277\n",
+ "M 39094\n",
+ "other 5041\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 257,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data['GENDER'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "84790566",
+ "metadata": {},
+ "source": [
+ "#### Numerical columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 258,
+ "id": "0656fae2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['ODATEDW',\n",
+ " 'TCODE',\n",
+ " 'DOB',\n",
+ " 'AGE',\n",
+ " 'INCOME',\n",
+ " 'WEALTH1',\n",
+ " 'HIT',\n",
+ " 'MBCRAFT',\n",
+ " 'MBGARDEN',\n",
+ " 'MBBOOKS',\n",
+ " 'MBCOLECT',\n",
+ " 'MAGFAML',\n",
+ " 'MAGFEM',\n",
+ " 'MAGMALE',\n",
+ " 'PUBGARDN',\n",
+ " 'PUBCULIN',\n",
+ " 'PUBHLTH',\n",
+ " 'PUBDOITY',\n",
+ " 'PUBNEWFN',\n",
+ " 'PUBPHOTO',\n",
+ " 'PUBOPP',\n",
+ " 'MALEMILI',\n",
+ " 'MALEVET',\n",
+ " 'VIETVETS',\n",
+ " 'WWIIVETS',\n",
+ " 'LOCALGOV',\n",
+ " 'STATEGOV',\n",
+ " 'FEDGOV',\n",
+ " 'WEALTH2',\n",
+ " 'POP901',\n",
+ " 'POP902',\n",
+ " 'POP903',\n",
+ " 'POP90C1',\n",
+ " 'POP90C2',\n",
+ " 'POP90C3',\n",
+ " 'POP90C4',\n",
+ " 'POP90C5',\n",
+ " 'ETH1',\n",
+ " 'ETH2',\n",
+ " 'ETH3',\n",
+ " 'ETH4',\n",
+ " 'ETH5',\n",
+ " 'ETH6',\n",
+ " 'ETH7',\n",
+ " 'ETH8',\n",
+ " 'ETH9',\n",
+ " 'ETH10',\n",
+ " 'ETH11',\n",
+ " 'ETH12',\n",
+ " 'ETH13',\n",
+ " 'ETH14',\n",
+ " 'ETH15',\n",
+ " 'ETH16',\n",
+ " 'AGE901',\n",
+ " 'AGE902',\n",
+ " 'AGE903',\n",
+ " 'AGE904',\n",
+ " 'AGE905',\n",
+ " 'AGE906',\n",
+ " 'AGE907',\n",
+ " 'CHIL1',\n",
+ " 'CHIL2',\n",
+ " 'CHIL3',\n",
+ " 'AGEC1',\n",
+ " 'AGEC2',\n",
+ " 'AGEC3',\n",
+ " 'AGEC4',\n",
+ " 'AGEC5',\n",
+ " 'AGEC6',\n",
+ " 'AGEC7',\n",
+ " 'CHILC1',\n",
+ " 'CHILC2',\n",
+ " 'CHILC3',\n",
+ " 'CHILC4',\n",
+ " 'CHILC5',\n",
+ " 'HHAGE1',\n",
+ " 'HHAGE2',\n",
+ " 'HHAGE3',\n",
+ " 'HHN1',\n",
+ " 'HHN2',\n",
+ " 'HHN3',\n",
+ " 'HHN4',\n",
+ " 'HHN5',\n",
+ " 'HHN6',\n",
+ " 'MARR1',\n",
+ " 'MARR2',\n",
+ " 'MARR3',\n",
+ " 'MARR4',\n",
+ " 'HHP1',\n",
+ " 'HHP2',\n",
+ " 'DW1',\n",
+ " 'DW2',\n",
+ " 'DW3',\n",
+ " 'DW4',\n",
+ " 'DW5',\n",
+ " 'DW6',\n",
+ " 'DW7',\n",
+ " 'DW8',\n",
+ " 'DW9',\n",
+ " 'HV1',\n",
+ " 'HV2',\n",
+ " 'HV3',\n",
+ " 'HV4',\n",
+ " 'HU1',\n",
+ " 'HU2',\n",
+ " 'HU3',\n",
+ " 'HU4',\n",
+ " 'HU5',\n",
+ " 'HHD1',\n",
+ " 'HHD2',\n",
+ " 'HHD3',\n",
+ " 'HHD4',\n",
+ " 'HHD5',\n",
+ " 'HHD6',\n",
+ " 'HHD7',\n",
+ " 'HHD8',\n",
+ " 'HHD9',\n",
+ " 'HHD10',\n",
+ " 'HHD11',\n",
+ " 'HHD12',\n",
+ " 'ETHC1',\n",
+ " 'ETHC2',\n",
+ " 'ETHC3',\n",
+ " 'ETHC4',\n",
+ " 'ETHC5',\n",
+ " 'ETHC6',\n",
+ " 'HVP1',\n",
+ " 'HVP2',\n",
+ " 'HVP3',\n",
+ " 'HVP4',\n",
+ " 'HVP5',\n",
+ " 'HVP6',\n",
+ " 'HUR1',\n",
+ " 'HUR2',\n",
+ " 'RHP1',\n",
+ " 'RHP2',\n",
+ " 'RHP3',\n",
+ " 'RHP4',\n",
+ " 'HUPA1',\n",
+ " 'HUPA2',\n",
+ " 'HUPA3',\n",
+ " 'HUPA4',\n",
+ " 'HUPA5',\n",
+ " 'HUPA6',\n",
+ " 'HUPA7',\n",
+ " 'RP1',\n",
+ " 'RP2',\n",
+ " 'RP3',\n",
+ " 'RP4',\n",
+ " 'MSA',\n",
+ " 'ADI',\n",
+ " 'DMA',\n",
+ " 'IC1',\n",
+ " 'IC2',\n",
+ " 'IC3',\n",
+ " 'IC4',\n",
+ " 'IC5',\n",
+ " 'IC6',\n",
+ " 'IC7',\n",
+ " 'IC8',\n",
+ " 'IC9',\n",
+ " 'IC10',\n",
+ " 'IC11',\n",
+ " 'IC12',\n",
+ " 'IC13',\n",
+ " 'IC14',\n",
+ " 'IC15',\n",
+ " 'IC16',\n",
+ " 'IC17',\n",
+ " 'IC18',\n",
+ " 'IC19',\n",
+ " 'IC20',\n",
+ " 'IC21',\n",
+ " 'IC22',\n",
+ " 'IC23',\n",
+ " 'HHAS1',\n",
+ " 'HHAS2',\n",
+ " 'HHAS3',\n",
+ " 'HHAS4',\n",
+ " 'MC1',\n",
+ " 'MC2',\n",
+ " 'MC3',\n",
+ " 'TPE1',\n",
+ " 'TPE2',\n",
+ " 'TPE3',\n",
+ " 'TPE4',\n",
+ " 'TPE5',\n",
+ " 'TPE6',\n",
+ " 'TPE7',\n",
+ " 'TPE8',\n",
+ " 'TPE9',\n",
+ " 'PEC1',\n",
+ " 'PEC2',\n",
+ " 'TPE10',\n",
+ " 'TPE11',\n",
+ " 'TPE12',\n",
+ " 'TPE13',\n",
+ " 'LFC1',\n",
+ " 'LFC2',\n",
+ " 'LFC3',\n",
+ " 'LFC4',\n",
+ " 'LFC5',\n",
+ " 'LFC6',\n",
+ " 'LFC7',\n",
+ " 'LFC8',\n",
+ " 'LFC9',\n",
+ " 'LFC10',\n",
+ " 'OCC1',\n",
+ " 'OCC2',\n",
+ " 'OCC3',\n",
+ " 'OCC4',\n",
+ " 'OCC5',\n",
+ " 'OCC6',\n",
+ " 'OCC7',\n",
+ " 'OCC8',\n",
+ " 'OCC9',\n",
+ " 'OCC10',\n",
+ " 'OCC11',\n",
+ " 'OCC12',\n",
+ " 'OCC13',\n",
+ " 'EIC1',\n",
+ " 'EIC2',\n",
+ " 'EIC3',\n",
+ " 'EIC4',\n",
+ " 'EIC5',\n",
+ " 'EIC6',\n",
+ " 'EIC7',\n",
+ " 'EIC8',\n",
+ " 'EIC9',\n",
+ " 'EIC10',\n",
+ " 'EIC11',\n",
+ " 'EIC12',\n",
+ " 'EIC13',\n",
+ " 'EIC14',\n",
+ " 'EIC15',\n",
+ " 'EIC16',\n",
+ " 'OEDC1',\n",
+ " 'OEDC2',\n",
+ " 'OEDC3',\n",
+ " 'OEDC4',\n",
+ " 'OEDC5',\n",
+ " 'OEDC6',\n",
+ " 'OEDC7',\n",
+ " 'EC1',\n",
+ " 'EC2',\n",
+ " 'EC3',\n",
+ " 'EC4',\n",
+ " 'EC5',\n",
+ " 'EC6',\n",
+ " 'EC7',\n",
+ " 'EC8',\n",
+ " 'SEC1',\n",
+ " 'SEC2',\n",
+ " 'SEC3',\n",
+ " 'SEC4',\n",
+ " 'SEC5',\n",
+ " 'AFC1',\n",
+ " 'AFC2',\n",
+ " 'AFC3',\n",
+ " 'AFC4',\n",
+ " 'AFC5',\n",
+ " 'AFC6',\n",
+ " 'VC1',\n",
+ " 'VC2',\n",
+ " 'VC3',\n",
+ " 'VC4',\n",
+ " 'ANC1',\n",
+ " 'ANC2',\n",
+ " 'ANC3',\n",
+ " 'ANC4',\n",
+ " 'ANC5',\n",
+ " 'ANC6',\n",
+ " 'ANC7',\n",
+ " 'ANC8',\n",
+ " 'ANC9',\n",
+ " 'ANC10',\n",
+ " 'ANC11',\n",
+ " 'ANC12',\n",
+ " 'ANC13',\n",
+ " 'ANC14',\n",
+ " 'ANC15',\n",
+ " 'POBC1',\n",
+ " 'POBC2',\n",
+ " 'LSC1',\n",
+ " 'LSC2',\n",
+ " 'LSC3',\n",
+ " 'LSC4',\n",
+ " 'VOC1',\n",
+ " 'VOC2',\n",
+ " 'VOC3',\n",
+ " 'HC1',\n",
+ " 'HC2',\n",
+ " 'HC3',\n",
+ " 'HC4',\n",
+ " 'HC5',\n",
+ " 'HC6',\n",
+ " 'HC7',\n",
+ " 'HC8',\n",
+ " 'HC9',\n",
+ " 'HC10',\n",
+ " 'HC11',\n",
+ " 'HC12',\n",
+ " 'HC13',\n",
+ " 'HC14',\n",
+ " 'HC15',\n",
+ " 'HC16',\n",
+ " 'HC17',\n",
+ " 'HC18',\n",
+ " 'HC19',\n",
+ " 'HC20',\n",
+ " 'HC21',\n",
+ " 'MHUC1',\n",
+ " 'MHUC2',\n",
+ " 'AC1',\n",
+ " 'AC2',\n",
+ " 'ADATE_2',\n",
+ " 'ADATE_3',\n",
+ " 'ADATE_4',\n",
+ " 'ADATE_5',\n",
+ " 'ADATE_6',\n",
+ " 'ADATE_7',\n",
+ " 'ADATE_8',\n",
+ " 'ADATE_9',\n",
+ " 'ADATE_10',\n",
+ " 'ADATE_11',\n",
+ " 'ADATE_12',\n",
+ " 'ADATE_13',\n",
+ " 'ADATE_14',\n",
+ " 'ADATE_15',\n",
+ " 'ADATE_16',\n",
+ " 'ADATE_17',\n",
+ " 'ADATE_18',\n",
+ " 'ADATE_19',\n",
+ " 'ADATE_20',\n",
+ " 'ADATE_21',\n",
+ " 'ADATE_22',\n",
+ " 'ADATE_23',\n",
+ " 'ADATE_24',\n",
+ " 'CARDPROM',\n",
+ " 'MAXADATE',\n",
+ " 'NUMPROM',\n",
+ " 'CARDPM12',\n",
+ " 'NUMPRM12',\n",
+ " 'RDATE_8',\n",
+ " 'RDATE_9',\n",
+ " 'RDATE_11',\n",
+ " 'RDATE_12',\n",
+ " 'RDATE_14',\n",
+ " 'RDATE_16',\n",
+ " 'RDATE_18',\n",
+ " 'RDATE_19',\n",
+ " 'RDATE_22',\n",
+ " 'RDATE_24',\n",
+ " 'RAMNT_8',\n",
+ " 'RAMNT_9',\n",
+ " 'RAMNT_11',\n",
+ " 'RAMNT_12',\n",
+ " 'RAMNT_14',\n",
+ " 'RAMNT_16',\n",
+ " 'RAMNT_18',\n",
+ " 'RAMNT_19',\n",
+ " 'RAMNT_22',\n",
+ " 'RAMNT_24',\n",
+ " 'RAMNTALL',\n",
+ " 'NGIFTALL',\n",
+ " 'CARDGIFT',\n",
+ " 'MINRAMNT',\n",
+ " 'MINRDATE',\n",
+ " 'MAXRAMNT',\n",
+ " 'MAXRDATE',\n",
+ " 'LASTGIFT',\n",
+ " 'LASTDATE',\n",
+ " 'FISTDATE',\n",
+ " 'NEXTDATE',\n",
+ " 'TIMELAG',\n",
+ " 'AVGGIFT',\n",
+ " 'CONTROLN',\n",
+ " 'TARGET_B',\n",
+ " 'TARGET_D',\n",
+ " 'HPHONE_D',\n",
+ " 'RFA_2F',\n",
+ " 'CLUSTER2']"
+ ]
+ },
+ "execution_count": 258,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Every column with a numeric dtype.\n",
+ "numerical_columns = list(data.select_dtypes(include='number').columns)\n",
+ "numerical_columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f1bf093f",
+ "metadata": {},
+ "source": [
+ "#### Clean the columns GEOCODE2, WEALTH1, ADI, DMA, and MSA."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3413f398",
+ "metadata": {},
+ "source": [
+ "#### GEOCODE2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 259,
+ "id": "a1b4afcd",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "GEOCODE2\n",
+ "A 34484\n",
+ "B 28505\n",
+ "D 16580\n",
+ "C 15524\n",
+ " 187\n",
+ "Name: count, dtype: int64\n",
+ "object\n",
+ "132\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Profile GEOCODE2: frequency of each code, column dtype, number of missing values\n",
+ "print(data['GEOCODE2'].value_counts(), data['GEOCODE2'].dtype, data['GEOCODE2'].isna().sum(), sep='\\n')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 260,
+ "id": "38024672",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop rows whose GEOCODE2 is missing; .copy() makes `data` an independent frame\n",
+ "# so the assignment below is not a write into a slice of the original.\n",
+ "data = data[data['GEOCODE2'].notna()].copy()\n",
+ "# Blank (' ') codes are imputed with 'A', the most frequent category.\n",
+ "# The result is assigned back: Series.replace(..., inplace=True) on a selected column\n",
+ "# is chained assignment and does not update `data` under pandas Copy-on-Write.\n",
+ "data['GEOCODE2'] = data['GEOCODE2'].replace(' ', 'A')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 261,
+ "id": "0f73334d",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "GEOCODE2\n",
+ "A 34671\n",
+ "B 28505\n",
+ "D 16580\n",
+ "C 15524\n",
+ "Name: count, dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Re-check the category counts after cleaning\n",
+ "print(data['GEOCODE2'].value_counts())\n",
+ "# MODE IMPUTATION\n",
+ "# rows with missing GEOCODE2 were dropped and blank (' ') codes were replaced by category A, which is the MODE VALUE"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "09fd8a18",
+ "metadata": {},
+ "source": [
+ "#### WEALTH1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 262,
+ "id": "96027cdc",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WEALTH1\n",
+ "9.0 7580\n",
+ "8.0 6785\n",
+ "7.0 6196\n",
+ "6.0 5823\n",
+ "5.0 5277\n",
+ "4.0 4808\n",
+ "3.0 4233\n",
+ "2.0 4083\n",
+ "1.0 3452\n",
+ "0.0 2411\n",
+ "Name: count, dtype: int64\n",
+ "float64\n",
+ "44632\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Profile WEALTH1: frequency of each rating, column dtype, number of missing values\n",
+ "print(data['WEALTH1'].value_counts(), data['WEALTH1'].dtype, data['WEALTH1'].isna().sum(), sep='\\n')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 263,
+ "id": "cf0eb193",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " column_names | \n",
+ " percentage_of_nulls | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 26 | \n",
+ " WEALTH1 | \n",
+ " 0.46883 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " column_names percentage_of_nulls\n",
+ "26 WEALTH1 0.46883"
+ ]
+ },
+ "execution_count": 263,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Look up the WEALTH1 row of the null-percentage summary table\n",
+ "null_df[null_df['column_names'].eq('WEALTH1')]\n",
+ "# 47% of missing values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 264,
+ "id": "f1ea590e",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Distribution of the observed WEALTH1 values: density histogram with a KDE overlay.\n",
+ "# sns.histplot replaces the deprecated sns.distplot.\n",
+ "sns.histplot(data['WEALTH1'], kde=True, stat='density')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 265,
+ "id": "6de4ca6f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Proportion of each unique value in WEALTH1:\n",
+ "WEALTH1\n",
+ "9.0 0.149660\n",
+ "8.0 0.133964\n",
+ "7.0 0.122335\n",
+ "6.0 0.114970\n",
+ "5.0 0.104190\n",
+ "4.0 0.094930\n",
+ "3.0 0.083577\n",
+ "2.0 0.080615\n",
+ "1.0 0.068157\n",
+ "0.0 0.047603\n",
+ "Name: proportion, dtype: float64\n",
+ "\n",
+ "Filled missing values in WEALTH1:\n",
+ "WEALTH1\n",
+ "9.0 0.148688\n",
+ "8.0 0.133249\n",
+ "7.0 0.121505\n",
+ "6.0 0.114788\n",
+ "5.0 0.104628\n",
+ "4.0 0.095476\n",
+ "3.0 0.084488\n",
+ "2.0 0.081035\n",
+ "1.0 0.068052\n",
+ "0.0 0.048090\n",
+ "Name: proportion, dtype: float64\n",
+ "Null values: 0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# PROPORTIONAL IMPUTATION TECHNIQUE\n",
+ "# Missing WEALTH1 entries are drawn at random from the observed values, weighted by\n",
+ "# their observed proportions, so the filled column keeps roughly the same distribution.\n",
+ "\n",
+ "# Seeded generator so that Restart & Run All reproduces the same imputed values\n",
+ "rng = np.random.default_rng(42)\n",
+ "\n",
+ "# Calculate the proportion of each unique value in the non-null data\n",
+ "value_counts = data['WEALTH1'].value_counts(normalize=True)\n",
+ "print(\"Proportion of each unique value in WEALTH1:\")\n",
+ "print(value_counts)\n",
+ "\n",
+ "# Draw one replacement per missing entry, based on these proportions\n",
+ "missing_wealth1 = data['WEALTH1'].isnull()\n",
+ "fill_values = rng.choice(value_counts.index.to_numpy(), size=missing_wealth1.sum(), p=value_counts.to_numpy())\n",
+ "\n",
+ "# Fill the missing values with these proportional values\n",
+ "data.loc[missing_wealth1, 'WEALTH1'] = fill_values\n",
+ "\n",
+ "# Check the result\n",
+ "print(\"\\nFilled missing values in WEALTH1:\")\n",
+ "print(data['WEALTH1'].value_counts(normalize=True))\n",
+ "print(\"Null values: \", (data['WEALTH1'].isnull().sum()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "64c14952",
+ "metadata": {},
+ "source": [
+ "#### ADI"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 266,
+ "id": "e8d8fbf3",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ADI\n",
+ "13.0 7296\n",
+ "51.0 4622\n",
+ "65.0 3765\n",
+ "57.0 2836\n",
+ "105.0 2617\n",
+ " ... \n",
+ "651.0 1\n",
+ "103.0 1\n",
+ "601.0 1\n",
+ "161.0 1\n",
+ "147.0 1\n",
+ "Name: count, Length: 204, dtype: int64\n",
+ "float64\n",
+ "0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Profile ADI: frequency of each code, column dtype, number of missing values\n",
+ "print(data['ADI'].value_counts(), data['ADI'].dtype, data['ADI'].isna().sum(), sep='\\n')\n",
+ "# some values have very low counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 267,
+ "id": "81aad058",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "sns.boxplot(data['ADI'])\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 268,
+ "id": "f0c45f13",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "count 95280.000000\n",
+ "mean 187.356402\n",
+ "std 137.019184\n",
+ "min 0.000000\n",
+ "25% 65.000000\n",
+ "50% 175.000000\n",
+ "75% 279.000000\n",
+ "max 651.000000\n",
+ "Name: ADI, dtype: float64"
+ ]
+ },
+ "execution_count": 268,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Summary statistics for ADI (count, mean, std, min, quartiles, max)\n",
+ "data['ADI'].describe(percentiles=[0.25, 0.5, 0.75])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 269,
+ "id": "3759fa9f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "min: 0.0, Q1: 65.0, Q2: 175.0, Q3: 279.0, Upper Bound: 600.0, max: 651.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# GROUP VALUES AND ASSIGN BINS FOR THE DISTRIBUTION\n",
+ "\n",
+ "# Range, quartiles and upper fence (Q3 + 1.5 * IQR) of ADI; the binning cell uses them as bin edges\n",
+ "min_val, max_val = data['ADI'].min(), data['ADI'].max()\n",
+ "Q1, Q3 = data['ADI'].quantile([0.25, 0.75])\n",
+ "Q2 = data['ADI'].median()\n",
+ "IQR = Q3 - Q1\n",
+ "upper_bound = Q3 + 1.5 * IQR\n",
+ "\n",
+ "print(f\"min: {min_val}, Q1: {Q1}, Q2: {Q2}, Q3: {Q3}, Upper Bound: {upper_bound}, max: {max_val}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 270,
+ "id": "b0fb25c5",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Counts in each group:\n",
+ "ADI_bin\n",
+ "Lower Bound 24122\n",
+ "Q1 23617\n",
+ "Q3 24093\n",
+ "Upper Bound 22840\n",
+ "Outliers 608\n",
+ "dtype: int64\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Bin edges run min -> Q1 -> Q2 -> Q3 -> upper fence -> max, with one label per interval\n",
+ "labels = ['Lower Bound', 'Q1', 'Q3', 'Upper Bound', 'Outliers']\n",
+ "bins = [min_val, Q1, Q2, Q3, upper_bound, max_val]\n",
+ "\n",
+ "# Assign every row to its interval; include_lowest keeps rows equal to the first edge in the first bin\n",
+ "data['ADI_bin'] = pd.cut(data['ADI'], bins=bins, labels=labels, include_lowest=True)\n",
+ "\n",
+ "# Number of rows falling in each bin\n",
+ "grouped = data.groupby('ADI_bin')\n",
+ "print(\"Counts in each group:\")\n",
+ "print(grouped.size())\n",
+ "print()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 271,
+ "id": "75dd33cf",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 3\n",
+ "1 0\n",
+ "2 4\n",
+ "3 1\n",
+ "4 1\n",
+ " ..\n",
+ "95407 0\n",
+ "95408 3\n",
+ "95409 0\n",
+ "95410 0\n",
+ "95411 4\n",
+ "Name: ADI, Length: 95280, dtype: category\n",
+ "Categories (5, int64): [0 < 1 < 3 < 4 < 5]"
+ ]
+ },
+ "execution_count": 271,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Ordinal codes for the ADI bins: consecutive integers from the lowest interval (0)\n",
+ "# to the outlier interval (4). The previous mapping skipped the value 2, which left a\n",
+ "# gap in the ordinal scale and was inconsistent with the DMA and MSA encodings.\n",
+ "adi_mapping = {'Lower Bound': 0, 'Q1': 1, 'Q3': 2, 'Upper Bound': 3, 'Outliers': 4}\n",
+ "\n",
+ "# Replace the raw ADI values with the ordinal code of their bin\n",
+ "data['ADI'] = data['ADI_bin'].map(adi_mapping)\n",
+ "\n",
+ "# The helper column is no longer needed once ADI has been encoded\n",
+ "data = data.drop(columns='ADI_bin')\n",
+ "data['ADI']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "81511999",
+ "metadata": {},
+ "source": [
+ "#### DMA"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 272,
+ "id": "6808e0d2",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "DMA\n",
+ "803.0 7296\n",
+ "602.0 4632\n",
+ "807.0 3765\n",
+ "505.0 2839\n",
+ "819.0 2588\n",
+ " ... \n",
+ "569.0 1\n",
+ "554.0 1\n",
+ "584.0 1\n",
+ "552.0 1\n",
+ "516.0 1\n",
+ "Name: count, Length: 206, dtype: int64\n",
+ "float64\n",
+ "0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Profile DMA: frequency of each code, column dtype, number of missing values\n",
+ "print(data['DMA'].value_counts(), data['DMA'].dtype, data['DMA'].isna().sum(), sep='\\n')\n",
+ "# some values have very low counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 273,
+ "id": "e827bba6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "sns.boxplot(data['DMA'])\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 274,
+ "id": "19e1abda",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "count 95280.000000\n",
+ "mean 664.004072\n",
+ "std 116.363600\n",
+ "min 0.000000\n",
+ "25% 561.000000\n",
+ "50% 635.000000\n",
+ "75% 801.000000\n",
+ "max 881.000000\n",
+ "Name: DMA, dtype: float64"
+ ]
+ },
+ "execution_count": 274,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Summary statistics for DMA (count, mean, std, min, quartiles, max)\n",
+ "data['DMA'].describe(percentiles=[0.25, 0.5, 0.75])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 275,
+ "id": "c3a37da5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "min: 0.0, Lower Bound: 201.0, Q1: 561.0, Q2: 635.0, Q3: 801.0, max: 881.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# GROUP VALUES AND ASSIGN BINS FOR THE DISTRIBUTION\n",
+ "\n",
+ "# Range, quartiles and lower fence (Q1 - 1.5 * IQR) of DMA; the binning cell uses them as bin edges\n",
+ "min_val_dma, max_val_dma = data['DMA'].min(), data['DMA'].max()\n",
+ "Q1_dma, Q3_dma = data['DMA'].quantile([0.25, 0.75])\n",
+ "Q2_dma = data['DMA'].median()\n",
+ "IQR_dma = Q3_dma - Q1_dma\n",
+ "lower_bound_dma = Q1_dma - 1.5 * IQR_dma\n",
+ "\n",
+ "print(f\"min: {min_val_dma}, Lower Bound: {lower_bound_dma}, Q1: {Q1_dma}, Q2: {Q2_dma}, Q3: {Q3_dma}, max: {max_val_dma}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 276,
+ "id": "93b2d43d",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Counts in each group:\n",
+ "DMA_bin\n",
+ "Outliers 187\n",
+ "Lower Bound 23635\n",
+ "Q1 24069\n",
+ "Q3 23907\n",
+ "Upper Bound 23482\n",
+ "dtype: int64\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Bin edges run min -> lower fence -> Q1 -> Q2 -> Q3 -> max, with one label per interval\n",
+ "labels_dma = ['Outliers','Lower Bound', 'Q1', 'Q3', 'Upper Bound']\n",
+ "bins_dma = [min_val_dma, lower_bound_dma, Q1_dma, Q2_dma, Q3_dma, max_val_dma]\n",
+ "\n",
+ "# Assign every row to its interval; include_lowest keeps rows equal to the first edge in the first bin\n",
+ "data['DMA_bin'] = pd.cut(data['DMA'], bins=bins_dma, labels=labels_dma, include_lowest=True)\n",
+ "\n",
+ "# Number of rows falling in each bin\n",
+ "grouped_dma = data.groupby('DMA_bin')\n",
+ "print(\"Counts in each group:\")\n",
+ "print(grouped_dma.size())\n",
+ "print()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 277,
+ "id": "a476fb5d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 3\n",
+ "1 4\n",
+ "2 1\n",
+ "3 4\n",
+ "4 1\n",
+ " ..\n",
+ "95407 3\n",
+ "95408 2\n",
+ "95409 1\n",
+ "95410 4\n",
+ "95411 1\n",
+ "Name: DMA, Length: 95280, dtype: category\n",
+ "Categories (5, int64): [0 < 1 < 2 < 3 < 4]"
+ ]
+ },
+ "execution_count": 277,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Ordinal codes for the DMA bins: 0 for the outlier interval up to 4 for the top interval\n",
+ "dma_mapping = dict(zip(['Outliers', 'Lower Bound', 'Q1', 'Q3', 'Upper Bound'], range(5)))\n",
+ "\n",
+ "# Replace the raw DMA values with the ordinal code of their bin\n",
+ "data['DMA'] = data['DMA_bin'].map(dma_mapping)\n",
+ "\n",
+ "# The helper column is no longer needed once DMA has been encoded\n",
+ "data.drop(columns='DMA_bin', inplace=True)\n",
+ "data['DMA']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ab6a8c40",
+ "metadata": {},
+ "source": [
+ "#### MSA"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 278,
+ "id": "30d4e872",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "MSA\n",
+ "0.0 21333\n",
+ "4480.0 4606\n",
+ "1600.0 4059\n",
+ "2160.0 2586\n",
+ "520.0 1685\n",
+ " ... \n",
+ "9140.0 1\n",
+ "3200.0 1\n",
+ "9280.0 1\n",
+ "743.0 1\n",
+ "8480.0 1\n",
+ "Name: count, Length: 298, dtype: int64\n",
+ "float64\n",
+ "0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Profile MSA: frequency of each code, column dtype, number of missing values\n",
+ "print(data['MSA'].value_counts(), data['MSA'].dtype, data['MSA'].isna().sum(), sep='\\n')\n",
+ "# some values have very low counts and 0 has a very high count"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 279,
+ "id": "62fbf77d",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Distribution of MSA: density histogram with a KDE overlay.\n",
+ "# sns.histplot replaces the deprecated sns.distplot.\n",
+ "sns.histplot(data['MSA'], kde=True, stat='density')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 280,
+ "id": "cf701682",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "sns.boxplot(data['MSA'])\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 281,
+ "id": "71386412",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "count 95280.000000\n",
+ "mean 3527.744102\n",
+ "std 2863.904737\n",
+ "min 0.000000\n",
+ "25% 520.000000\n",
+ "50% 3350.000000\n",
+ "75% 5960.000000\n",
+ "max 9360.000000\n",
+ "Name: MSA, dtype: float64"
+ ]
+ },
+ "execution_count": 281,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Summary statistics for MSA (count, mean, std, min, quartiles, max)\n",
+ "data['MSA'].describe(percentiles=[0.25, 0.5, 0.75])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 285,
+ "id": "3445cb8b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "min: 0.0, Q1: 520.0, Q2: 3350.0, Q3: 5960.0, max: 9360.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Range and quartiles of MSA; the binning cell uses them as bin edges.\n",
+ "# The IQR fences are computed as well, but the binning cell below does not use them.\n",
+ "min_val_msa, max_val_msa = data['MSA'].min(), data['MSA'].max()\n",
+ "Q1_msa, Q3_msa = data['MSA'].quantile([0.25, 0.75])\n",
+ "Q2_msa = data['MSA'].median()\n",
+ "IQR_msa = Q3_msa - Q1_msa\n",
+ "lower_bound_msa = Q1_msa - 1.5 * IQR_msa\n",
+ "upper_bound_msa = Q3_msa + 1.5 * IQR_msa\n",
+ "\n",
+ "print(f\"min: {min_val_msa}, Q1: {Q1_msa}, Q2: {Q2_msa}, Q3: {Q3_msa}, max: {max_val_msa}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 287,
+ "id": "1b69528e",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Counts in each group:\n",
+ "MSA_bin\n",
+ "Lower Bound 24487\n",
+ "Q1 23188\n",
+ "Q3 23959\n",
+ "Upper Bound 23646\n",
+ "dtype: int64\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Bin edges run min -> Q1 -> Q2 -> Q3 -> max, with one label per interval\n",
+ "labels_msa = ['Lower Bound', 'Q1', 'Q3', 'Upper Bound']\n",
+ "bins_msa = [min_val_msa, Q1_msa, Q2_msa, Q3_msa, max_val_msa]\n",
+ "\n",
+ "# Assign every row to its interval; include_lowest keeps rows equal to the first edge in the first bin\n",
+ "data['MSA_bin'] = pd.cut(data['MSA'], bins=bins_msa, labels=labels_msa, include_lowest=True)\n",
+ "\n",
+ "# Number of rows falling in each 'MSA_bin' group\n",
+ "grouped_msa = data.groupby('MSA_bin')\n",
+ "print(\"Counts in each group:\")\n",
+ "print(grouped_msa.size())\n",
+ "print()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 288,
+ "id": "67721709",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 0\n",
+ "1 2\n",
+ "2 0\n",
+ "3 3\n",
+ "4 2\n",
+ " ..\n",
+ "95407 0\n",
+ "95408 2\n",
+ "95409 2\n",
+ "95410 3\n",
+ "95411 3\n",
+ "Name: MSA, Length: 95280, dtype: category\n",
+ "Categories (4, int64): [0 < 1 < 2 < 3]"
+ ]
+ },
+ "execution_count": 288,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Ordinal codes for the MSA bins: 0 for the lowest interval up to 3 for the top interval\n",
+ "msa_mapping = dict(zip(['Lower Bound', 'Q1', 'Q3', 'Upper Bound'], range(4)))\n",
+ "\n",
+ "# Replace the raw MSA values with the ordinal code of their bin\n",
+ "data['MSA'] = data['MSA_bin'].map(msa_mapping)\n",
+ "\n",
+ "# The helper column is no longer needed once MSA has been encoded\n",
+ "data.drop(columns='MSA_bin', inplace=True)\n",
+ "data['MSA']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b91220a1",
+ "metadata": {},
+ "source": [
+ "#### Columns cleaned"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 289,
+ "id": "1264908b",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " GENDER | \n",
+ " GEOCODE2 | \n",
+ " WEALTH1 | \n",
+ " ADI | \n",
+ " DMA | \n",
+ " MSA | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " F | \n",
+ " C | \n",
+ " 5.0 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " M | \n",
+ " A | \n",
+ " 9.0 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " M | \n",
+ " C | \n",
+ " 1.0 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " F | \n",
+ " C | \n",
+ " 4.0 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " F | \n",
+ " A | \n",
+ " 2.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 95407 | \n",
+ " M | \n",
+ " C | \n",
+ " 4.0 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 95408 | \n",
+ " M | \n",
+ " A | \n",
+ " 9.0 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 95409 | \n",
+ " M | \n",
+ " B | \n",
+ " 4.0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 95410 | \n",
+ " F | \n",
+ " A | \n",
+ " 8.0 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 95411 | \n",
+ " F | \n",
+ " C | \n",
+ " 8.0 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
95280 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " GENDER GEOCODE2 WEALTH1 ADI DMA MSA\n",
+ "0 F C 5.0 3 3 0\n",
+ "1 M A 9.0 0 4 2\n",
+ "2 M C 1.0 4 1 0\n",
+ "3 F C 4.0 1 4 3\n",
+ "4 F A 2.0 1 1 2\n",
+ "... ... ... ... .. .. ..\n",
+ "95407 M C 4.0 0 3 0\n",
+ "95408 M A 9.0 3 2 2\n",
+ "95409 M B 4.0 0 1 2\n",
+ "95410 F A 8.0 0 4 3\n",
+ "95411 F C 8.0 4 1 3\n",
+ "\n",
+ "[95280 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 289,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data[['GENDER','GEOCODE2','WEALTH1','ADI','DMA','MSA']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 290,
+ "id": "cb32e020",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(95280, 454)"
+ ]
+ },
+ "execution_count": 290,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 291,
+ "id": "bf0c7685",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#data.to_csv('learningSet.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bce6d1b8",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}