diff --git a/.ipynb_checkpoints/lab_feature_engeneering-checkpoint.ipynb b/.ipynb_checkpoints/lab_feature_engeneering-checkpoint.ipynb new file mode 100644 index 0000000..44a6d6c --- /dev/null +++ b/.ipynb_checkpoints/lab_feature_engeneering-checkpoint.ipynb @@ -0,0 +1,2706 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "60b86b8b", + "metadata": {}, + "source": [ + "\n", + "Here we will work on cleaning some of the other columns in the dataset using the techniques that we used before in the lessons.\n", + "\n", + "- Check for null values in the numerical columns.\n", + "- Use appropriate methods to clean the columns `GEOCODE2`, `WEALTH1`, `ADI`, `DMA`, and `MSA`.\n", + "- Use appropriate EDA techniques wherever necessary.\n", + " ```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1384030c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8b83680e", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ODATEDWOSOURCETCODESTATEZIPMAILCODEPVASTATEDOBNOEXCHRECINHSE...TARGET_DHPHONE_DRFA_2RRFA_2FRFA_2AMDMAUD_RMDMAUD_FMDMAUD_ACLUSTER2GEOCODE2
08901GRI0IL6108137120...0.00L4EXXX39.0C
19401BOA1CA9132652020...0.00L2GXXX1.0A
29001AMH1NC2701700...0.01L4EXXX60.0C
38701BRY0CA9595328010...0.01L4EXXX41.0C
486010FL3317620010X...0.01L2FXXX26.0A
..................................................................
954079601ASE1AK9950400...0.00L1GXXX12.0C
954089601DCD1TX7737950010...0.01L1FXXX2.0A
954099501MBC1MI4891038010...0.01L3EXXX34.0B
954108601PRV0CA9132040050X...18.01L4FXXX11.0A
954118801MCC2NC2840918010X...0.01L1GC1C12.0C
\n", + "

95412 rows × 481 columns

\n", + "
" + ], + "text/plain": [ + " ODATEDW OSOURCE TCODE STATE ZIP MAILCODE PVASTATE DOB NOEXCH \\\n", + "0 8901 GRI 0 IL 61081 3712 0 \n", + "1 9401 BOA 1 CA 91326 5202 0 \n", + "2 9001 AMH 1 NC 27017 0 0 \n", + "3 8701 BRY 0 CA 95953 2801 0 \n", + "4 8601 0 FL 33176 2001 0 \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 9601 ASE 1 AK 99504 0 0 \n", + "95408 9601 DCD 1 TX 77379 5001 0 \n", + "95409 9501 MBC 1 MI 48910 3801 0 \n", + "95410 8601 PRV 0 CA 91320 4005 0 \n", + "95411 8801 MCC 2 NC 28409 1801 0 \n", + "\n", + " RECINHSE ... TARGET_D HPHONE_D RFA_2R RFA_2F RFA_2A MDMAUD_R MDMAUD_F \\\n", + "0 ... 0.0 0 L 4 E X X \n", + "1 ... 0.0 0 L 2 G X X \n", + "2 ... 0.0 1 L 4 E X X \n", + "3 ... 0.0 1 L 4 E X X \n", + "4 X ... 0.0 1 L 2 F X X \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 ... 0.0 0 L 1 G X X \n", + "95408 ... 0.0 1 L 1 F X X \n", + "95409 ... 0.0 1 L 3 E X X \n", + "95410 X ... 18.0 1 L 4 F X X \n", + "95411 X ... 0.0 1 L 1 G C 1 \n", + "\n", + " MDMAUD_A CLUSTER2 GEOCODE2 \n", + "0 X 39.0 C \n", + "1 X 1.0 A \n", + "2 X 60.0 C \n", + "3 X 41.0 C \n", + "4 X 26.0 A \n", + "... ... ... ... 
\n", + "95407 X 12.0 C \n", + "95408 X 2.0 A \n", + "95409 X 34.0 B \n", + "95410 X 11.0 A \n", + "95411 C 12.0 C \n", + "\n", + "[95412 rows x 481 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = pd.read_csv(\"learningSet.txt\")\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3fcd94ee", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['OSOURCE', 'ZIP']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Lets create a drop_list we will use later as well\n", + "drop_list = list(data[['OSOURCE', 'ZIP']])\n", + "drop_list" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "47815d17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
column_namenulls_percentage
414RDATE_50.999906
436RAMNT_50.999906
412RDATE_30.997464
434RAMNT_30.997464
413RDATE_40.997055
.........
168ETHC30.000000
167ETHC20.000000
166ETHC10.000000
165HHD120.000000
240TPE110.000000
\n", + "

481 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " column_name nulls_percentage\n", + "414 RDATE_5 0.999906\n", + "436 RAMNT_5 0.999906\n", + "412 RDATE_3 0.997464\n", + "434 RAMNT_3 0.997464\n", + "413 RDATE_4 0.997055\n", + ".. ... ...\n", + "168 ETHC3 0.000000\n", + "167 ETHC2 0.000000\n", + "166 ETHC1 0.000000\n", + "165 HHD12 0.000000\n", + "240 TPE11 0.000000\n", + "\n", + "[481 rows x 2 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Lets deal with sparcity part \n", + "# Lets check null values in percentage \n", + "\n", + "nulls_percent_df= data.isna().sum()/len(data)\n", + "nulls_percent_df\n", + "\n", + "# put it in a dataframe \n", + "nulls_percent_df= pd.DataFrame(data.isna().sum()/len(data))\n", + "nulls_percent_df\n", + "\n", + "# Take out of the index \n", + "nulls_percent_df= pd.DataFrame(data.isna().sum()/len(data)).reset_index()\n", + "nulls_percent_df\n", + "\n", + "# Lets change columns name\n", + "nulls_percent_df.columns = ['column_name', 'nulls_percentage']\n", + "nulls_percent_df\n", + "\n", + "# Lets sort \n", + "nulls_percent_df.sort_values(by = ['nulls_percentage'], ascending = False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4040a63b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['NUMCHLD',\n", + " 'WEALTH1',\n", + " 'MBCRAFT',\n", + " 'MBGARDEN',\n", + " 'MBBOOKS',\n", + " 'MBCOLECT',\n", + " 'MAGFAML',\n", + " 'MAGFEM',\n", + " 'MAGMALE',\n", + " 'PUBGARDN',\n", + " 'PUBCULIN',\n", + " 'PUBHLTH',\n", + " 'PUBDOITY',\n", + " 'PUBNEWFN',\n", + " 'PUBPHOTO',\n", + " 'PUBOPP',\n", + " 'WEALTH2',\n", + " 'ADATE_5',\n", + " 'ADATE_10',\n", + " 'ADATE_13',\n", + " 'ADATE_15',\n", + " 'ADATE_17',\n", + " 'ADATE_19',\n", + " 'ADATE_20',\n", + " 'ADATE_21',\n", + " 'ADATE_22',\n", + " 'ADATE_23',\n", + " 'ADATE_24',\n", + " 'RDATE_3',\n", + " 'RDATE_4',\n", + " 'RDATE_5',\n", + " 'RDATE_6',\n", + " 'RDATE_7',\n", + " 'RDATE_8',\n", + " 
'RDATE_9',\n", + " 'RDATE_10',\n", + " 'RDATE_11',\n", + " 'RDATE_12',\n", + " 'RDATE_13',\n", + " 'RDATE_14',\n", + " 'RDATE_15',\n", + " 'RDATE_16',\n", + " 'RDATE_17',\n", + " 'RDATE_18',\n", + " 'RDATE_19',\n", + " 'RDATE_20',\n", + " 'RDATE_21',\n", + " 'RDATE_22',\n", + " 'RDATE_23',\n", + " 'RDATE_24',\n", + " 'RAMNT_3',\n", + " 'RAMNT_4',\n", + " 'RAMNT_5',\n", + " 'RAMNT_6',\n", + " 'RAMNT_7',\n", + " 'RAMNT_8',\n", + " 'RAMNT_9',\n", + " 'RAMNT_10',\n", + " 'RAMNT_11',\n", + " 'RAMNT_12',\n", + " 'RAMNT_13',\n", + " 'RAMNT_14',\n", + " 'RAMNT_15',\n", + " 'RAMNT_16',\n", + " 'RAMNT_17',\n", + " 'RAMNT_18',\n", + " 'RAMNT_19',\n", + " 'RAMNT_20',\n", + " 'RAMNT_21',\n", + " 'RAMNT_22',\n", + " 'RAMNT_23',\n", + " 'RAMNT_24']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# First create the variable with the threshold \n", + "threshold =0.25 \n", + "\n", + "# define a condition \n", + "condition = nulls_percent_df['nulls_percentage']>threshold\n", + "columns_above_threshold = nulls_percent_df[condition]\n", + "columns_above_threshold\n", + "\n", + "# Create a list with column names\n", + "drop_columns_list = list(columns_above_threshold['column_name'])\n", + "drop_columns_list" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bca2c434", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ODATEDWOSOURCETCODESTATEZIPMAILCODEPVASTATEDOBNOEXCHRECINHSE...TARGET_DHPHONE_DRFA_2RRFA_2FRFA_2AMDMAUD_RMDMAUD_FMDMAUD_ACLUSTER2GEOCODE2
08901GRI0IL6108137120...0.00L4EXXX39.0C
19401BOA1CA9132652020...0.00L2GXXX1.0A
29001AMH1NC2701700...0.01L4EXXX60.0C
38701BRY0CA9595328010...0.01L4EXXX41.0C
486010FL3317620010X...0.01L2FXXX26.0A
..................................................................
954079601ASE1AK9950400...0.00L1GXXX12.0C
954089601DCD1TX7737950010...0.01L1FXXX2.0A
954099501MBC1MI4891038010...0.01L3EXXX34.0B
954108601PRV0CA9132040050X...18.01L4FXXX11.0A
954118801MCC2NC2840918010X...0.01L1GC1C12.0C
\n", + "

95412 rows × 409 columns

\n", + "
" + ], + "text/plain": [ + " ODATEDW OSOURCE TCODE STATE ZIP MAILCODE PVASTATE DOB NOEXCH \\\n", + "0 8901 GRI 0 IL 61081 3712 0 \n", + "1 9401 BOA 1 CA 91326 5202 0 \n", + "2 9001 AMH 1 NC 27017 0 0 \n", + "3 8701 BRY 0 CA 95953 2801 0 \n", + "4 8601 0 FL 33176 2001 0 \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 9601 ASE 1 AK 99504 0 0 \n", + "95408 9601 DCD 1 TX 77379 5001 0 \n", + "95409 9501 MBC 1 MI 48910 3801 0 \n", + "95410 8601 PRV 0 CA 91320 4005 0 \n", + "95411 8801 MCC 2 NC 28409 1801 0 \n", + "\n", + " RECINHSE ... TARGET_D HPHONE_D RFA_2R RFA_2F RFA_2A MDMAUD_R MDMAUD_F \\\n", + "0 ... 0.0 0 L 4 E X X \n", + "1 ... 0.0 0 L 2 G X X \n", + "2 ... 0.0 1 L 4 E X X \n", + "3 ... 0.0 1 L 4 E X X \n", + "4 X ... 0.0 1 L 2 F X X \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 ... 0.0 0 L 1 G X X \n", + "95408 ... 0.0 1 L 1 F X X \n", + "95409 ... 0.0 1 L 3 E X X \n", + "95410 X ... 18.0 1 L 4 F X X \n", + "95411 X ... 0.0 1 L 1 G C 1 \n", + "\n", + " MDMAUD_A CLUSTER2 GEOCODE2 \n", + "0 X 39.0 C \n", + "1 X 1.0 A \n", + "2 X 60.0 C \n", + "3 X 41.0 C \n", + "4 X 26.0 A \n", + "... ... ... ... 
\n", + "95407 X 12.0 C \n", + "95408 X 2.0 A \n", + "95409 X 34.0 B \n", + "95410 X 11.0 A \n", + "95411 C 12.0 C \n", + "\n", + "[95412 rows x 409 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Removing the columns whose share of null values is above the 25% threshold, using the list of such columns created above\n", + "data_drop1 = data.drop(columns=drop_columns_list)\n", + "data_drop1" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "afc4d49c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "F 51277\n", + "M 39094\n", + " 2957\n", + "U 1715\n", + "J 365\n", + "C 2\n", + "A 2\n", + "Name: GENDER, dtype: int64\n" + ] + } + ], + "source": [ + "# Check and fill the null values with F in the GENDER column\n", + "print(data['GENDER'].value_counts())\n", + "data['GENDER'] = data['GENDER'].fillna('F')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "432c1919", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's reduce the values of GENDER to only M, F or other \n", + "def frequent_values(df, column, n=2, replace_value='other'):\n", + " value_counts = df[column].value_counts()\n", + " top_n_values = value_counts.index[:n]\n", + " df[column] = df[column].apply(lambda x: x if x in top_n_values else replace_value)\n", + "\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c3336b39", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ODATEDWOSOURCETCODESTATEZIPMAILCODEPVASTATEDOBNOEXCHRECINHSE...TARGET_DHPHONE_DRFA_2RRFA_2FRFA_2AMDMAUD_RMDMAUD_FMDMAUD_ACLUSTER2GEOCODE2
08901GRI0IL6108137120...0.00L4EXXX39.0C
19401BOA1CA9132652020...0.00L2GXXX1.0A
29001AMH1NC2701700...0.01L4EXXX60.0C
38701BRY0CA9595328010...0.01L4EXXX41.0C
486010FL3317620010X...0.01L2FXXX26.0A
..................................................................
954079601ASE1AK9950400...0.00L1GXXX12.0C
954089601DCD1TX7737950010...0.01L1FXXX2.0A
954099501MBC1MI4891038010...0.01L3EXXX34.0B
954108601PRV0CA9132040050X...18.01L4FXXX11.0A
954118801MCC2NC2840918010X...0.01L1GC1C12.0C
\n", + "

95412 rows × 481 columns

\n", + "
" + ], + "text/plain": [ + " ODATEDW OSOURCE TCODE STATE ZIP MAILCODE PVASTATE DOB NOEXCH \\\n", + "0 8901 GRI 0 IL 61081 3712 0 \n", + "1 9401 BOA 1 CA 91326 5202 0 \n", + "2 9001 AMH 1 NC 27017 0 0 \n", + "3 8701 BRY 0 CA 95953 2801 0 \n", + "4 8601 0 FL 33176 2001 0 \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 9601 ASE 1 AK 99504 0 0 \n", + "95408 9601 DCD 1 TX 77379 5001 0 \n", + "95409 9501 MBC 1 MI 48910 3801 0 \n", + "95410 8601 PRV 0 CA 91320 4005 0 \n", + "95411 8801 MCC 2 NC 28409 1801 0 \n", + "\n", + " RECINHSE ... TARGET_D HPHONE_D RFA_2R RFA_2F RFA_2A MDMAUD_R MDMAUD_F \\\n", + "0 ... 0.0 0 L 4 E X X \n", + "1 ... 0.0 0 L 2 G X X \n", + "2 ... 0.0 1 L 4 E X X \n", + "3 ... 0.0 1 L 4 E X X \n", + "4 X ... 0.0 1 L 2 F X X \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 ... 0.0 0 L 1 G X X \n", + "95408 ... 0.0 1 L 1 F X X \n", + "95409 ... 0.0 1 L 3 E X X \n", + "95410 X ... 18.0 1 L 4 F X X \n", + "95411 X ... 0.0 1 L 1 G C 1 \n", + "\n", + " MDMAUD_A CLUSTER2 GEOCODE2 \n", + "0 X 39.0 C \n", + "1 X 1.0 A \n", + "2 X 60.0 C \n", + "3 X 41.0 C \n", + "4 X 26.0 A \n", + "... ... ... ... \n", + "95407 X 12.0 C \n", + "95408 X 2.0 A \n", + "95409 X 34.0 B \n", + "95410 X 11.0 A \n", + "95411 C 12.0 C \n", + "\n", + "[95412 rows x 481 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "frequent_values(data, 'GENDER', n=2, replace_value='other')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c4b726ad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['F', 'M', 'other'], dtype=object)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['GENDER'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d3cf8e4d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ODATEDWTCODEDOBAGENUMCHLDINCOMEWEALTH1HITMBCRAFTMBGARDEN...FISTDATENEXTDATETIMELAGAVGGIFTCONTROLNTARGET_BTARGET_DHPHONE_DRFA_2FCLUSTER2
089010371260.0NaNNaNNaN0NaNNaN...89119003.04.07.7419359551500.00439.0
194011520246.01.06.09.0160.00.0...93109504.018.015.66666714853500.0021.0
2900110NaNNaN3.01.020.00.0...90019101.012.07.4814811507800.01460.0
387010280170.0NaN1.04.020.00.0...87028711.09.06.81250017255600.01441.0
486010200178.01.03.02.0601.00.0...79038005.014.06.864865711200.01226.0
..................................................................
95407960110NaNNaNNaNNaN0NaNNaN...9602NaNNaN25.00000018456800.00112.0
9540896011500148.01.07.09.010.00.0...9603NaNNaN20.00000012270600.0112.0
9540995011380160.0NaNNaNNaN0NaNNaN...94109501.03.08.28571418964100.01334.0
9541086010400558.0NaN7.0NaN0NaNNaN...86128704.04.012.1463414693118.01411.0
9541188012180180.0NaN5.08.030.00.0...88038809.06.096.79487218511400.01112.0
\n", + "

95412 rows × 407 columns

\n", + "
" + ], + "text/plain": [ + " ODATEDW TCODE DOB AGE NUMCHLD INCOME WEALTH1 HIT MBCRAFT \\\n", + "0 8901 0 3712 60.0 NaN NaN NaN 0 NaN \n", + "1 9401 1 5202 46.0 1.0 6.0 9.0 16 0.0 \n", + "2 9001 1 0 NaN NaN 3.0 1.0 2 0.0 \n", + "3 8701 0 2801 70.0 NaN 1.0 4.0 2 0.0 \n", + "4 8601 0 2001 78.0 1.0 3.0 2.0 60 1.0 \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 9601 1 0 NaN NaN NaN NaN 0 NaN \n", + "95408 9601 1 5001 48.0 1.0 7.0 9.0 1 0.0 \n", + "95409 9501 1 3801 60.0 NaN NaN NaN 0 NaN \n", + "95410 8601 0 4005 58.0 NaN 7.0 NaN 0 NaN \n", + "95411 8801 2 1801 80.0 NaN 5.0 8.0 3 0.0 \n", + "\n", + " MBGARDEN ... FISTDATE NEXTDATE TIMELAG AVGGIFT CONTROLN \\\n", + "0 NaN ... 8911 9003.0 4.0 7.741935 95515 \n", + "1 0.0 ... 9310 9504.0 18.0 15.666667 148535 \n", + "2 0.0 ... 9001 9101.0 12.0 7.481481 15078 \n", + "3 0.0 ... 8702 8711.0 9.0 6.812500 172556 \n", + "4 0.0 ... 7903 8005.0 14.0 6.864865 7112 \n", + "... ... ... ... ... ... ... ... \n", + "95407 NaN ... 9602 NaN NaN 25.000000 184568 \n", + "95408 0.0 ... 9603 NaN NaN 20.000000 122706 \n", + "95409 NaN ... 9410 9501.0 3.0 8.285714 189641 \n", + "95410 NaN ... 8612 8704.0 4.0 12.146341 4693 \n", + "95411 0.0 ... 8803 8809.0 6.0 96.794872 185114 \n", + "\n", + " TARGET_B TARGET_D HPHONE_D RFA_2F CLUSTER2 \n", + "0 0 0.0 0 4 39.0 \n", + "1 0 0.0 0 2 1.0 \n", + "2 0 0.0 1 4 60.0 \n", + "3 0 0.0 1 4 41.0 \n", + "4 0 0.0 1 2 26.0 \n", + "... ... ... ... ... ... \n", + "95407 0 0.0 0 1 12.0 \n", + "95408 0 0.0 1 1 2.0 \n", + "95409 0 0.0 1 3 34.0 \n", + "95410 1 18.0 1 4 11.0 \n", + "95411 0 0.0 1 1 12.0 \n", + "\n", + "[95412 rows x 407 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numerical_data = data.select_dtypes(include = np.number)\n", + "numerical_data" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0fbb2c34", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
column_namenulls_percentage
346RDATE_50.999906
368RAMNT_50.999906
344RDATE_30.997464
366RAMNT_30.997464
345RDATE_40.997055
.........
145HUPA70.000000
144HUPA60.000000
143HUPA50.000000
142HUPA40.000000
203LFC60.000000
\n", + "

407 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " column_name nulls_percentage\n", + "346 RDATE_5 0.999906\n", + "368 RAMNT_5 0.999906\n", + "344 RDATE_3 0.997464\n", + "366 RAMNT_3 0.997464\n", + "345 RDATE_4 0.997055\n", + ".. ... ...\n", + "145 HUPA7 0.000000\n", + "144 HUPA6 0.000000\n", + "143 HUPA5 0.000000\n", + "142 HUPA4 0.000000\n", + "203 LFC6 0.000000\n", + "\n", + "[407 rows x 2 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Checking for null values in numerical_data DF\n", + "nulls_percent_numerical= pd.DataFrame(numerical_data.isna().sum()/len(numerical_data)).reset_index()\n", + "nulls_percent_numerical.columns = ['column_name', 'nulls_percentage']\n", + "\n", + "# Sorting the values to see the highest first\n", + "nulls_percent_numerical.sort_values(by = ['nulls_percentage'], ascending = False)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e9e8845a", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Variables to treat GEOCODE2, WEALTH1, ADI, DMA,and MSA.\n", + "data['GEOCODE2'].value_counts()\n", + "# In here we can see that there is a a blanc variables, but is not a NAN otherwise it ould appear NAN, so I will just turn the blanc value into the most common value A\n", + "data['GEOCODE2'].replace(' ', 'A', inplace = True)\n", + "data['GEOCODE2'].value_counts()\n", + "\n", + "#lets plot this variable first and check the null values \n", + "data['GEOCODE2'].isna().sum()\n", + "\n", + "sns.histplot(data['GEOCODE2'])\n", + "plt.show()\n", + "\n", + "# Since there is only 132, since this is a discrete varibale I will just fill it with the mode\n", + "data['GEOCODE2']= data['GEOCODE2'].fillna('A')\n", + "data['GEOCODE2'].isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5ab2104f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9.0 7585\n", + "8.0 6793\n", + "7.0 6198\n", + "6.0 5825\n", + "5.0 5280\n", + "4.0 4810\n", + "3.0 4237\n", + "2.0 4085\n", + "1.0 3454\n", + "0.0 2413\n", + "Name: WEALTH1, dtype: int64\n", + "float64\n", + "44732\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ODATEDWOSOURCETCODESTATEZIPMAILCODEPVASTATEDOBNOEXCHRECINHSE...TARGET_DHPHONE_DRFA_2RRFA_2FRFA_2AMDMAUD_RMDMAUD_FMDMAUD_ACLUSTER2GEOCODE2
08901GRI0IL6108137120...0.00L4EXXX39.0C
19401BOA1CA9132652020...0.00L2GXXX1.0A
29001AMH1NC2701700...0.01L4EXXX60.0C
38701BRY0CA9595328010...0.01L4EXXX41.0C
486010FL3317620010X...0.01L2FXXX26.0A
..................................................................
954079601ASE1AK9950400...0.00L1GXXX12.0C
954089601DCD1TX7737950010...0.01L1FXXX2.0A
954099501MBC1MI4891038010...0.01L3EXXX34.0B
954108601PRV0CA9132040050X...18.01L4FXXX11.0A
954118801MCC2NC2840918010X...0.01L1GC1C12.0C
\n", + "

95412 rows × 480 columns

\n", + "
" + ], + "text/plain": [ + " ODATEDW OSOURCE TCODE STATE ZIP MAILCODE PVASTATE DOB NOEXCH \\\n", + "0 8901 GRI 0 IL 61081 3712 0 \n", + "1 9401 BOA 1 CA 91326 5202 0 \n", + "2 9001 AMH 1 NC 27017 0 0 \n", + "3 8701 BRY 0 CA 95953 2801 0 \n", + "4 8601 0 FL 33176 2001 0 \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 9601 ASE 1 AK 99504 0 0 \n", + "95408 9601 DCD 1 TX 77379 5001 0 \n", + "95409 9501 MBC 1 MI 48910 3801 0 \n", + "95410 8601 PRV 0 CA 91320 4005 0 \n", + "95411 8801 MCC 2 NC 28409 1801 0 \n", + "\n", + " RECINHSE ... TARGET_D HPHONE_D RFA_2R RFA_2F RFA_2A MDMAUD_R MDMAUD_F \\\n", + "0 ... 0.0 0 L 4 E X X \n", + "1 ... 0.0 0 L 2 G X X \n", + "2 ... 0.0 1 L 4 E X X \n", + "3 ... 0.0 1 L 4 E X X \n", + "4 X ... 0.0 1 L 2 F X X \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 ... 0.0 0 L 1 G X X \n", + "95408 ... 0.0 1 L 1 F X X \n", + "95409 ... 0.0 1 L 3 E X X \n", + "95410 X ... 18.0 1 L 4 F X X \n", + "95411 X ... 0.0 1 L 1 G C 1 \n", + "\n", + " MDMAUD_A CLUSTER2 GEOCODE2 \n", + "0 X 39.0 C \n", + "1 X 1.0 A \n", + "2 X 60.0 C \n", + "3 X 41.0 C \n", + "4 X 26.0 A \n", + "... ... ... ... 
\n", + "95407 X 12.0 C \n", + "95408 X 2.0 A \n", + "95409 X 34.0 B \n", + "95410 X 11.0 A \n", + "95411 C 12.0 C \n", + "\n", + "[95412 rows x 480 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# WEALTH1 variable\n", + "# First check the value frequencies and whether the dtype is right (it is float)\n", + "print(data['WEALTH1'].value_counts())\n", + "print(data['WEALTH1'].dtypes)\n", + "\n", + "# Check null values \n", + "print(data['WEALTH1'].isna().sum())\n", + "\n", + "# Almost 50% of the values in this column are null, so I will just drop it\n", + "\n", + "data = data.drop(columns = 'WEALTH1', axis=1)\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "f4a07c91", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "13.0 7296\n", + "51.0 4622\n", + "65.0 3765\n", + "57.0 2836\n", + "105.0 2617\n", + " ... \n", + "651.0 1\n", + "103.0 1\n", + "601.0 1\n", + "161.0 1\n", + "147.0 1\n", + "Name: ADI, Length: 204, dtype: int64\n", + "float64\n", + "132\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# ADI variable\n", + "#First check frequency and if it is th write type and it is on float\n", + "print(data['ADI'].value_counts())\n", + "print(data['ADI'].dtypes)\n", + "\n", + "# Check null values \n", + "print(data['ADI'].isna().sum())\n", + "\n", + "sns.distplot(data['ADI'])\n", + "plt.show()\n", + "\n", + "# Since this is a continuous variable and we only have 132 null values I will fill them with the mean\n", + "data['ADI'] = data['ADI'].fillna(np.mean(data['ADI']))\n", + "\n", + "# Check if the distribution changed\n", + "sns.distplot(data['ADI'])\n", + "plt.show()\n", + "\n", + "# Check if all the null values were actually filled\n", + "print(data['ADI'].isna().sum())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "44164ac6", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "803.0 7296\n", + "602.0 4632\n", + "807.0 3765\n", + "505.0 2839\n", + "819.0 2588\n", + " ... \n", + "502.0 2\n", + "569.0 1\n", + "554.0 1\n", + "552.0 1\n", + "516.0 1\n", + "Name: DMA, Length: 201, dtype: int64\n", + "float64\n", + "132\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# DMA variable\n", + "#First check frequency and if it is th write type and it is on float\n", + "print(data['DMA'].value_counts())\n", + "print(data['DMA'].dtypes)\n", + "\n", + "# Check null values \n", + "print(data['DMA'].isna().sum())\n", + "\n", + "sns.distplot(data['DMA'])\n", + "plt.show()\n", + "\n", + "# Since this is a continuous variable and we only have 132 null values I will fill them with the mean\n", + "data['DMA'] = data['DMA'].fillna(np.mean(data['DMA']))\n", + "\n", + "# Check if the distribution changed\n", + "sns.distplot(data['DMA'])\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "3119646d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.0 21187\n", + "4480.0 4606\n", + "1600.0 4059\n", + "2160.0 2586\n", + "520.0 1685\n", + " ... \n", + "9140.0 1\n", + "3200.0 1\n", + "9280.0 1\n", + "743.0 1\n", + "8480.0 1\n", + "Name: MSA, Length: 294, dtype: int64\n", + "float64\n", + "132\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# MSA variable\n", + "#First check frequency and if it is th write type and it is on float\n", + "print(data['MSA'].value_counts())\n", + "print(data['MSA'].dtypes)\n", + "\n", + "# Check null values \n", + "print(data['MSA'].isna().sum())\n", + "\n", + "sns.distplot(data['MSA'])\n", + "plt.show()\n", + "\n", + "# Since this is a continuous variable and we only have 132 null values I will fill them with the mean\n", + "data['MSA'] = data['MSA'].fillna(np.mean(data['MSA']))\n", + "\n", + "# Check if the distribution changed\n", + "sns.distplot(data['MSA'])\n", + "plt.show()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lab_feature_engeneering.ipynb b/lab_feature_engeneering.ipynb new file mode 100644 index 0000000..44a6d6c --- /dev/null +++ b/lab_feature_engeneering.ipynb @@ -0,0 +1,2706 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "60b86b8b", + "metadata": {}, + "source": [ + "\n", + "Here we will work on cleaning some of the other columns in the dataset using the techniques that we used before in the lessons.\n", + "\n", + "- Check for null values in the numerical columns.\n", + "- Use appropriate methods to clean the columns `GEOCODE2`, `WEALTH1`, `ADI`, `DMA`,and `MSA`.\n", + "- Use appropriate EDA technique where ever necessary.\n", + " ```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1384030c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + 
"\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8b83680e", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ODATEDWOSOURCETCODESTATEZIPMAILCODEPVASTATEDOBNOEXCHRECINHSE...TARGET_DHPHONE_DRFA_2RRFA_2FRFA_2AMDMAUD_RMDMAUD_FMDMAUD_ACLUSTER2GEOCODE2
08901GRI0IL6108137120...0.00L4EXXX39.0C
19401BOA1CA9132652020...0.00L2GXXX1.0A
29001AMH1NC2701700...0.01L4EXXX60.0C
38701BRY0CA9595328010...0.01L4EXXX41.0C
486010FL3317620010X...0.01L2FXXX26.0A
..................................................................
954079601ASE1AK9950400...0.00L1GXXX12.0C
954089601DCD1TX7737950010...0.01L1FXXX2.0A
954099501MBC1MI4891038010...0.01L3EXXX34.0B
954108601PRV0CA9132040050X...18.01L4FXXX11.0A
954118801MCC2NC2840918010X...0.01L1GC1C12.0C
\n", + "

95412 rows × 481 columns

\n", + "
" + ], + "text/plain": [ + " ODATEDW OSOURCE TCODE STATE ZIP MAILCODE PVASTATE DOB NOEXCH \\\n", + "0 8901 GRI 0 IL 61081 3712 0 \n", + "1 9401 BOA 1 CA 91326 5202 0 \n", + "2 9001 AMH 1 NC 27017 0 0 \n", + "3 8701 BRY 0 CA 95953 2801 0 \n", + "4 8601 0 FL 33176 2001 0 \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 9601 ASE 1 AK 99504 0 0 \n", + "95408 9601 DCD 1 TX 77379 5001 0 \n", + "95409 9501 MBC 1 MI 48910 3801 0 \n", + "95410 8601 PRV 0 CA 91320 4005 0 \n", + "95411 8801 MCC 2 NC 28409 1801 0 \n", + "\n", + " RECINHSE ... TARGET_D HPHONE_D RFA_2R RFA_2F RFA_2A MDMAUD_R MDMAUD_F \\\n", + "0 ... 0.0 0 L 4 E X X \n", + "1 ... 0.0 0 L 2 G X X \n", + "2 ... 0.0 1 L 4 E X X \n", + "3 ... 0.0 1 L 4 E X X \n", + "4 X ... 0.0 1 L 2 F X X \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 ... 0.0 0 L 1 G X X \n", + "95408 ... 0.0 1 L 1 F X X \n", + "95409 ... 0.0 1 L 3 E X X \n", + "95410 X ... 18.0 1 L 4 F X X \n", + "95411 X ... 0.0 1 L 1 G C 1 \n", + "\n", + " MDMAUD_A CLUSTER2 GEOCODE2 \n", + "0 X 39.0 C \n", + "1 X 1.0 A \n", + "2 X 60.0 C \n", + "3 X 41.0 C \n", + "4 X 26.0 A \n", + "... ... ... ... 
\n", + "95407 X 12.0 C \n", + "95408 X 2.0 A \n", + "95409 X 34.0 B \n", + "95410 X 11.0 A \n", + "95411 C 12.0 C \n", + "\n", + "[95412 rows x 481 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = pd.read_csv(\"learningSet.txt\")\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3fcd94ee", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['OSOURCE', 'ZIP']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Lets create a drop_list we will use later as well\n", + "drop_list = list(data[['OSOURCE', 'ZIP']])\n", + "drop_list" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "47815d17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
column_namenulls_percentage
414RDATE_50.999906
436RAMNT_50.999906
412RDATE_30.997464
434RAMNT_30.997464
413RDATE_40.997055
.........
168ETHC30.000000
167ETHC20.000000
166ETHC10.000000
165HHD120.000000
240TPE110.000000
\n", + "

481 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " column_name nulls_percentage\n", + "414 RDATE_5 0.999906\n", + "436 RAMNT_5 0.999906\n", + "412 RDATE_3 0.997464\n", + "434 RAMNT_3 0.997464\n", + "413 RDATE_4 0.997055\n", + ".. ... ...\n", + "168 ETHC3 0.000000\n", + "167 ETHC2 0.000000\n", + "166 ETHC1 0.000000\n", + "165 HHD12 0.000000\n", + "240 TPE11 0.000000\n", + "\n", + "[481 rows x 2 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Lets deal with sparcity part \n", + "# Lets check null values in percentage \n", + "\n", + "nulls_percent_df= data.isna().sum()/len(data)\n", + "nulls_percent_df\n", + "\n", + "# put it in a dataframe \n", + "nulls_percent_df= pd.DataFrame(data.isna().sum()/len(data))\n", + "nulls_percent_df\n", + "\n", + "# Take out of the index \n", + "nulls_percent_df= pd.DataFrame(data.isna().sum()/len(data)).reset_index()\n", + "nulls_percent_df\n", + "\n", + "# Lets change columns name\n", + "nulls_percent_df.columns = ['column_name', 'nulls_percentage']\n", + "nulls_percent_df\n", + "\n", + "# Lets sort \n", + "nulls_percent_df.sort_values(by = ['nulls_percentage'], ascending = False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4040a63b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['NUMCHLD',\n", + " 'WEALTH1',\n", + " 'MBCRAFT',\n", + " 'MBGARDEN',\n", + " 'MBBOOKS',\n", + " 'MBCOLECT',\n", + " 'MAGFAML',\n", + " 'MAGFEM',\n", + " 'MAGMALE',\n", + " 'PUBGARDN',\n", + " 'PUBCULIN',\n", + " 'PUBHLTH',\n", + " 'PUBDOITY',\n", + " 'PUBNEWFN',\n", + " 'PUBPHOTO',\n", + " 'PUBOPP',\n", + " 'WEALTH2',\n", + " 'ADATE_5',\n", + " 'ADATE_10',\n", + " 'ADATE_13',\n", + " 'ADATE_15',\n", + " 'ADATE_17',\n", + " 'ADATE_19',\n", + " 'ADATE_20',\n", + " 'ADATE_21',\n", + " 'ADATE_22',\n", + " 'ADATE_23',\n", + " 'ADATE_24',\n", + " 'RDATE_3',\n", + " 'RDATE_4',\n", + " 'RDATE_5',\n", + " 'RDATE_6',\n", + " 'RDATE_7',\n", + " 'RDATE_8',\n", + " 
'RDATE_9',\n", + " 'RDATE_10',\n", + " 'RDATE_11',\n", + " 'RDATE_12',\n", + " 'RDATE_13',\n", + " 'RDATE_14',\n", + " 'RDATE_15',\n", + " 'RDATE_16',\n", + " 'RDATE_17',\n", + " 'RDATE_18',\n", + " 'RDATE_19',\n", + " 'RDATE_20',\n", + " 'RDATE_21',\n", + " 'RDATE_22',\n", + " 'RDATE_23',\n", + " 'RDATE_24',\n", + " 'RAMNT_3',\n", + " 'RAMNT_4',\n", + " 'RAMNT_5',\n", + " 'RAMNT_6',\n", + " 'RAMNT_7',\n", + " 'RAMNT_8',\n", + " 'RAMNT_9',\n", + " 'RAMNT_10',\n", + " 'RAMNT_11',\n", + " 'RAMNT_12',\n", + " 'RAMNT_13',\n", + " 'RAMNT_14',\n", + " 'RAMNT_15',\n", + " 'RAMNT_16',\n", + " 'RAMNT_17',\n", + " 'RAMNT_18',\n", + " 'RAMNT_19',\n", + " 'RAMNT_20',\n", + " 'RAMNT_21',\n", + " 'RAMNT_22',\n", + " 'RAMNT_23',\n", + " 'RAMNT_24']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# First create the variable with the threshold \n", + "threshold =0.25 \n", + "\n", + "# define a condition \n", + "condition = nulls_percent_df['nulls_percentage']>threshold\n", + "columns_above_threshold = nulls_percent_df[condition]\n", + "columns_above_threshold\n", + "\n", + "# Create a list with column names\n", + "drop_columns_list = list(columns_above_threshold['column_name'])\n", + "drop_columns_list" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bca2c434", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ODATEDWOSOURCETCODESTATEZIPMAILCODEPVASTATEDOBNOEXCHRECINHSE...TARGET_DHPHONE_DRFA_2RRFA_2FRFA_2AMDMAUD_RMDMAUD_FMDMAUD_ACLUSTER2GEOCODE2
08901GRI0IL6108137120...0.00L4EXXX39.0C
19401BOA1CA9132652020...0.00L2GXXX1.0A
29001AMH1NC2701700...0.01L4EXXX60.0C
38701BRY0CA9595328010...0.01L4EXXX41.0C
486010FL3317620010X...0.01L2FXXX26.0A
..................................................................
954079601ASE1AK9950400...0.00L1GXXX12.0C
954089601DCD1TX7737950010...0.01L1FXXX2.0A
954099501MBC1MI4891038010...0.01L3EXXX34.0B
954108601PRV0CA9132040050X...18.01L4FXXX11.0A
954118801MCC2NC2840918010X...0.01L1GC1C12.0C
\n", + "

95412 rows × 409 columns

\n", + "
" + ], + "text/plain": [ + " ODATEDW OSOURCE TCODE STATE ZIP MAILCODE PVASTATE DOB NOEXCH \\\n", + "0 8901 GRI 0 IL 61081 3712 0 \n", + "1 9401 BOA 1 CA 91326 5202 0 \n", + "2 9001 AMH 1 NC 27017 0 0 \n", + "3 8701 BRY 0 CA 95953 2801 0 \n", + "4 8601 0 FL 33176 2001 0 \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 9601 ASE 1 AK 99504 0 0 \n", + "95408 9601 DCD 1 TX 77379 5001 0 \n", + "95409 9501 MBC 1 MI 48910 3801 0 \n", + "95410 8601 PRV 0 CA 91320 4005 0 \n", + "95411 8801 MCC 2 NC 28409 1801 0 \n", + "\n", + " RECINHSE ... TARGET_D HPHONE_D RFA_2R RFA_2F RFA_2A MDMAUD_R MDMAUD_F \\\n", + "0 ... 0.0 0 L 4 E X X \n", + "1 ... 0.0 0 L 2 G X X \n", + "2 ... 0.0 1 L 4 E X X \n", + "3 ... 0.0 1 L 4 E X X \n", + "4 X ... 0.0 1 L 2 F X X \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 ... 0.0 0 L 1 G X X \n", + "95408 ... 0.0 1 L 1 F X X \n", + "95409 ... 0.0 1 L 3 E X X \n", + "95410 X ... 18.0 1 L 4 F X X \n", + "95411 X ... 0.0 1 L 1 G C 1 \n", + "\n", + " MDMAUD_A CLUSTER2 GEOCODE2 \n", + "0 X 39.0 C \n", + "1 X 1.0 A \n", + "2 X 60.0 C \n", + "3 X 41.0 C \n", + "4 X 26.0 A \n", + "... ... ... ... 
\n", + "95407 X 12.0 C \n", + "95408 X 2.0 A \n", + "95409 X 34.0 B \n", + "95410 X 11.0 A \n", + "95411 C 12.0 C \n", + "\n", + "[95412 rows x 409 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Drop the columns whose share of null values is above the 25% threshold, using the list of column names created above\n", + "data_drop1 = data.drop(columns=drop_columns_list)\n", + "data_drop1" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "afc4d49c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "F 51277\n", + "M 39094\n", + " 2957\n", + "U 1715\n", + "J 365\n", + "C 2\n", + "A 2\n", + "Name: GENDER, dtype: int64\n" + ] + } + ], + "source": [ + "# Check and fill the null values with F in the GENDER column\n", + "print(data['GENDER'].value_counts())\n", + "data['GENDER'] = data['GENDER'].fillna('F')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "432c1919", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's reduce the values of GENDER to only M, F or other\n", + "def frequent_values(df, column, n=2, replace_value='other'):\n", + " value_counts = df[column].value_counts()\n", + " top_n_values = value_counts.index[:n]\n", + " df[column] = df[column].apply(lambda x: x if x in top_n_values else replace_value)\n", + "\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c3336b39", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ODATEDWOSOURCETCODESTATEZIPMAILCODEPVASTATEDOBNOEXCHRECINHSE...TARGET_DHPHONE_DRFA_2RRFA_2FRFA_2AMDMAUD_RMDMAUD_FMDMAUD_ACLUSTER2GEOCODE2
08901GRI0IL6108137120...0.00L4EXXX39.0C
19401BOA1CA9132652020...0.00L2GXXX1.0A
29001AMH1NC2701700...0.01L4EXXX60.0C
38701BRY0CA9595328010...0.01L4EXXX41.0C
486010FL3317620010X...0.01L2FXXX26.0A
..................................................................
954079601ASE1AK9950400...0.00L1GXXX12.0C
954089601DCD1TX7737950010...0.01L1FXXX2.0A
954099501MBC1MI4891038010...0.01L3EXXX34.0B
954108601PRV0CA9132040050X...18.01L4FXXX11.0A
954118801MCC2NC2840918010X...0.01L1GC1C12.0C
\n", + "

95412 rows × 481 columns

\n", + "
" + ], + "text/plain": [ + " ODATEDW OSOURCE TCODE STATE ZIP MAILCODE PVASTATE DOB NOEXCH \\\n", + "0 8901 GRI 0 IL 61081 3712 0 \n", + "1 9401 BOA 1 CA 91326 5202 0 \n", + "2 9001 AMH 1 NC 27017 0 0 \n", + "3 8701 BRY 0 CA 95953 2801 0 \n", + "4 8601 0 FL 33176 2001 0 \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 9601 ASE 1 AK 99504 0 0 \n", + "95408 9601 DCD 1 TX 77379 5001 0 \n", + "95409 9501 MBC 1 MI 48910 3801 0 \n", + "95410 8601 PRV 0 CA 91320 4005 0 \n", + "95411 8801 MCC 2 NC 28409 1801 0 \n", + "\n", + " RECINHSE ... TARGET_D HPHONE_D RFA_2R RFA_2F RFA_2A MDMAUD_R MDMAUD_F \\\n", + "0 ... 0.0 0 L 4 E X X \n", + "1 ... 0.0 0 L 2 G X X \n", + "2 ... 0.0 1 L 4 E X X \n", + "3 ... 0.0 1 L 4 E X X \n", + "4 X ... 0.0 1 L 2 F X X \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 ... 0.0 0 L 1 G X X \n", + "95408 ... 0.0 1 L 1 F X X \n", + "95409 ... 0.0 1 L 3 E X X \n", + "95410 X ... 18.0 1 L 4 F X X \n", + "95411 X ... 0.0 1 L 1 G C 1 \n", + "\n", + " MDMAUD_A CLUSTER2 GEOCODE2 \n", + "0 X 39.0 C \n", + "1 X 1.0 A \n", + "2 X 60.0 C \n", + "3 X 41.0 C \n", + "4 X 26.0 A \n", + "... ... ... ... \n", + "95407 X 12.0 C \n", + "95408 X 2.0 A \n", + "95409 X 34.0 B \n", + "95410 X 11.0 A \n", + "95411 C 12.0 C \n", + "\n", + "[95412 rows x 481 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "frequent_values(data, 'GENDER', n=2, replace_value='other')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c4b726ad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['F', 'M', 'other'], dtype=object)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['GENDER'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d3cf8e4d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ODATEDWTCODEDOBAGENUMCHLDINCOMEWEALTH1HITMBCRAFTMBGARDEN...FISTDATENEXTDATETIMELAGAVGGIFTCONTROLNTARGET_BTARGET_DHPHONE_DRFA_2FCLUSTER2
089010371260.0NaNNaNNaN0NaNNaN...89119003.04.07.7419359551500.00439.0
194011520246.01.06.09.0160.00.0...93109504.018.015.66666714853500.0021.0
2900110NaNNaN3.01.020.00.0...90019101.012.07.4814811507800.01460.0
387010280170.0NaN1.04.020.00.0...87028711.09.06.81250017255600.01441.0
486010200178.01.03.02.0601.00.0...79038005.014.06.864865711200.01226.0
..................................................................
95407960110NaNNaNNaNNaN0NaNNaN...9602NaNNaN25.00000018456800.00112.0
9540896011500148.01.07.09.010.00.0...9603NaNNaN20.00000012270600.0112.0
9540995011380160.0NaNNaNNaN0NaNNaN...94109501.03.08.28571418964100.01334.0
9541086010400558.0NaN7.0NaN0NaNNaN...86128704.04.012.1463414693118.01411.0
9541188012180180.0NaN5.08.030.00.0...88038809.06.096.79487218511400.01112.0
\n", + "

95412 rows × 407 columns

\n", + "
" + ], + "text/plain": [ + " ODATEDW TCODE DOB AGE NUMCHLD INCOME WEALTH1 HIT MBCRAFT \\\n", + "0 8901 0 3712 60.0 NaN NaN NaN 0 NaN \n", + "1 9401 1 5202 46.0 1.0 6.0 9.0 16 0.0 \n", + "2 9001 1 0 NaN NaN 3.0 1.0 2 0.0 \n", + "3 8701 0 2801 70.0 NaN 1.0 4.0 2 0.0 \n", + "4 8601 0 2001 78.0 1.0 3.0 2.0 60 1.0 \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 9601 1 0 NaN NaN NaN NaN 0 NaN \n", + "95408 9601 1 5001 48.0 1.0 7.0 9.0 1 0.0 \n", + "95409 9501 1 3801 60.0 NaN NaN NaN 0 NaN \n", + "95410 8601 0 4005 58.0 NaN 7.0 NaN 0 NaN \n", + "95411 8801 2 1801 80.0 NaN 5.0 8.0 3 0.0 \n", + "\n", + " MBGARDEN ... FISTDATE NEXTDATE TIMELAG AVGGIFT CONTROLN \\\n", + "0 NaN ... 8911 9003.0 4.0 7.741935 95515 \n", + "1 0.0 ... 9310 9504.0 18.0 15.666667 148535 \n", + "2 0.0 ... 9001 9101.0 12.0 7.481481 15078 \n", + "3 0.0 ... 8702 8711.0 9.0 6.812500 172556 \n", + "4 0.0 ... 7903 8005.0 14.0 6.864865 7112 \n", + "... ... ... ... ... ... ... ... \n", + "95407 NaN ... 9602 NaN NaN 25.000000 184568 \n", + "95408 0.0 ... 9603 NaN NaN 20.000000 122706 \n", + "95409 NaN ... 9410 9501.0 3.0 8.285714 189641 \n", + "95410 NaN ... 8612 8704.0 4.0 12.146341 4693 \n", + "95411 0.0 ... 8803 8809.0 6.0 96.794872 185114 \n", + "\n", + " TARGET_B TARGET_D HPHONE_D RFA_2F CLUSTER2 \n", + "0 0 0.0 0 4 39.0 \n", + "1 0 0.0 0 2 1.0 \n", + "2 0 0.0 1 4 60.0 \n", + "3 0 0.0 1 4 41.0 \n", + "4 0 0.0 1 2 26.0 \n", + "... ... ... ... ... ... \n", + "95407 0 0.0 0 1 12.0 \n", + "95408 0 0.0 1 1 2.0 \n", + "95409 0 0.0 1 3 34.0 \n", + "95410 1 18.0 1 4 11.0 \n", + "95411 0 0.0 1 1 12.0 \n", + "\n", + "[95412 rows x 407 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numerical_data = data.select_dtypes(include = np.number)\n", + "numerical_data" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0fbb2c34", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
column_namenulls_percentage
346RDATE_50.999906
368RAMNT_50.999906
344RDATE_30.997464
366RAMNT_30.997464
345RDATE_40.997055
.........
145HUPA70.000000
144HUPA60.000000
143HUPA50.000000
142HUPA40.000000
203LFC60.000000
\n", + "

407 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " column_name nulls_percentage\n", + "346 RDATE_5 0.999906\n", + "368 RAMNT_5 0.999906\n", + "344 RDATE_3 0.997464\n", + "366 RAMNT_3 0.997464\n", + "345 RDATE_4 0.997055\n", + ".. ... ...\n", + "145 HUPA7 0.000000\n", + "144 HUPA6 0.000000\n", + "143 HUPA5 0.000000\n", + "142 HUPA4 0.000000\n", + "203 LFC6 0.000000\n", + "\n", + "[407 rows x 2 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Checking for null values in numerical_data DF\n", + "nulls_percent_numerical= pd.DataFrame(numerical_data.isna().sum()/len(numerical_data)).reset_index()\n", + "nulls_percent_numerical.columns = ['column_name', 'nulls_percentage']\n", + "\n", + "# Sorting the values to see the highest first\n", + "nulls_percent_numerical.sort_values(by = ['nulls_percentage'], ascending = False)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e9e8845a", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Variables to treat GEOCODE2, WEALTH1, ADI, DMA,and MSA.\n", + "data['GEOCODE2'].value_counts()\n", + "# In here we can see that there is a a blanc variables, but is not a NAN otherwise it ould appear NAN, so I will just turn the blanc value into the most common value A\n", + "data['GEOCODE2'].replace(' ', 'A', inplace = True)\n", + "data['GEOCODE2'].value_counts()\n", + "\n", + "#lets plot this variable first and check the null values \n", + "data['GEOCODE2'].isna().sum()\n", + "\n", + "sns.histplot(data['GEOCODE2'])\n", + "plt.show()\n", + "\n", + "# Since there is only 132, since this is a discrete varibale I will just fill it with the mode\n", + "data['GEOCODE2']= data['GEOCODE2'].fillna('A')\n", + "data['GEOCODE2'].isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5ab2104f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9.0 7585\n", + "8.0 6793\n", + "7.0 6198\n", + "6.0 5825\n", + "5.0 5280\n", + "4.0 4810\n", + "3.0 4237\n", + "2.0 4085\n", + "1.0 3454\n", + "0.0 2413\n", + "Name: WEALTH1, dtype: int64\n", + "float64\n", + "44732\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ODATEDWOSOURCETCODESTATEZIPMAILCODEPVASTATEDOBNOEXCHRECINHSE...TARGET_DHPHONE_DRFA_2RRFA_2FRFA_2AMDMAUD_RMDMAUD_FMDMAUD_ACLUSTER2GEOCODE2
08901GRI0IL6108137120...0.00L4EXXX39.0C
19401BOA1CA9132652020...0.00L2GXXX1.0A
29001AMH1NC2701700...0.01L4EXXX60.0C
38701BRY0CA9595328010...0.01L4EXXX41.0C
486010FL3317620010X...0.01L2FXXX26.0A
..................................................................
954079601ASE1AK9950400...0.00L1GXXX12.0C
954089601DCD1TX7737950010...0.01L1FXXX2.0A
954099501MBC1MI4891038010...0.01L3EXXX34.0B
954108601PRV0CA9132040050X...18.01L4FXXX11.0A
954118801MCC2NC2840918010X...0.01L1GC1C12.0C
\n", + "

95412 rows × 480 columns

\n", + "
" + ], + "text/plain": [ + " ODATEDW OSOURCE TCODE STATE ZIP MAILCODE PVASTATE DOB NOEXCH \\\n", + "0 8901 GRI 0 IL 61081 3712 0 \n", + "1 9401 BOA 1 CA 91326 5202 0 \n", + "2 9001 AMH 1 NC 27017 0 0 \n", + "3 8701 BRY 0 CA 95953 2801 0 \n", + "4 8601 0 FL 33176 2001 0 \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 9601 ASE 1 AK 99504 0 0 \n", + "95408 9601 DCD 1 TX 77379 5001 0 \n", + "95409 9501 MBC 1 MI 48910 3801 0 \n", + "95410 8601 PRV 0 CA 91320 4005 0 \n", + "95411 8801 MCC 2 NC 28409 1801 0 \n", + "\n", + " RECINHSE ... TARGET_D HPHONE_D RFA_2R RFA_2F RFA_2A MDMAUD_R MDMAUD_F \\\n", + "0 ... 0.0 0 L 4 E X X \n", + "1 ... 0.0 0 L 2 G X X \n", + "2 ... 0.0 1 L 4 E X X \n", + "3 ... 0.0 1 L 4 E X X \n", + "4 X ... 0.0 1 L 2 F X X \n", + "... ... ... ... ... ... ... ... ... ... \n", + "95407 ... 0.0 0 L 1 G X X \n", + "95408 ... 0.0 1 L 1 F X X \n", + "95409 ... 0.0 1 L 3 E X X \n", + "95410 X ... 18.0 1 L 4 F X X \n", + "95411 X ... 0.0 1 L 1 G C 1 \n", + "\n", + " MDMAUD_A CLUSTER2 GEOCODE2 \n", + "0 X 39.0 C \n", + "1 X 1.0 A \n", + "2 X 60.0 C \n", + "3 X 41.0 C \n", + "4 X 26.0 A \n", + "... ... ... ... 
\n", + "95407 X 12.0 C \n", + "95408 X 2.0 A \n", + "95409 X 34.0 B \n", + "95410 X 11.0 A \n", + "95411 C 12.0 C \n", + "\n", + "[95412 rows x 480 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# For the WEALTH1 variable\n", + "# First check the value frequencies and whether the column has the right type (it is float)\n", + "print(data['WEALTH1'].value_counts())\n", + "print(data['WEALTH1'].dtypes)\n", + "\n", + "# Check null values \n", + "print(data['WEALTH1'].isna().sum())\n", + "\n", + "# Almost 50% of the values in this column are null, so I will just drop it\n", + "\n", + "data = data.drop(columns = 'WEALTH1', axis=1)\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "f4a07c91", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "13.0 7296\n", + "51.0 4622\n", + "65.0 3765\n", + "57.0 2836\n", + "105.0 2617\n", + " ... \n", + "651.0 1\n", + "103.0 1\n", + "601.0 1\n", + "161.0 1\n", + "147.0 1\n", + "Name: ADI, Length: 204, dtype: int64\n", + "float64\n", + "132\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# ADI variable\n", + "#First check frequency and if it is th write type and it is on float\n", + "print(data['ADI'].value_counts())\n", + "print(data['ADI'].dtypes)\n", + "\n", + "# Check null values \n", + "print(data['ADI'].isna().sum())\n", + "\n", + "sns.distplot(data['ADI'])\n", + "plt.show()\n", + "\n", + "# Since this is a continuous variable and we only have 132 null values I will fill them with the mean\n", + "data['ADI'] = data['ADI'].fillna(np.mean(data['ADI']))\n", + "\n", + "# Check if the distribution changed\n", + "sns.distplot(data['ADI'])\n", + "plt.show()\n", + "\n", + "# Check if all the null values were actually filled\n", + "print(data['ADI'].isna().sum())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "44164ac6", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "803.0 7296\n", + "602.0 4632\n", + "807.0 3765\n", + "505.0 2839\n", + "819.0 2588\n", + " ... \n", + "502.0 2\n", + "569.0 1\n", + "554.0 1\n", + "552.0 1\n", + "516.0 1\n", + "Name: DMA, Length: 201, dtype: int64\n", + "float64\n", + "132\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# DMA variable\n", + "#First check frequency and if it is th write type and it is on float\n", + "print(data['DMA'].value_counts())\n", + "print(data['DMA'].dtypes)\n", + "\n", + "# Check null values \n", + "print(data['DMA'].isna().sum())\n", + "\n", + "sns.distplot(data['DMA'])\n", + "plt.show()\n", + "\n", + "# Since this is a continuous variable and we only have 132 null values I will fill them with the mean\n", + "data['DMA'] = data['DMA'].fillna(np.mean(data['DMA']))\n", + "\n", + "# Check if the distribution changed\n", + "sns.distplot(data['DMA'])\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "3119646d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.0 21187\n", + "4480.0 4606\n", + "1600.0 4059\n", + "2160.0 2586\n", + "520.0 1685\n", + " ... \n", + "9140.0 1\n", + "3200.0 1\n", + "9280.0 1\n", + "743.0 1\n", + "8480.0 1\n", + "Name: MSA, Length: 294, dtype: int64\n", + "float64\n", + "132\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# MSA variable\n", + "#First check frequency and if it is th write type and it is on float\n", + "print(data['MSA'].value_counts())\n", + "print(data['MSA'].dtypes)\n", + "\n", + "# Check null values \n", + "print(data['MSA'].isna().sum())\n", + "\n", + "sns.distplot(data['MSA'])\n", + "plt.show()\n", + "\n", + "# Since this is a continuous variable and we only have 132 null values I will fill them with the mean\n", + "data['MSA'] = data['MSA'].fillna(np.mean(data['MSA']))\n", + "\n", + "# Check if the distribution changed\n", + "sns.distplot(data['MSA'])\n", + "plt.show()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}