From 9e9263c38ca236661ea11e3e115932ac3d45c097 Mon Sep 17 00:00:00 2001 From: Miguel Saraiva Date: Mon, 29 May 2023 21:18:00 +0100 Subject: [PATCH] lab --- lab-feature-engineering.ipynb | 381 ++++++++++++++++++++++++++++++++++ 1 file changed, 381 insertions(+) create mode 100644 lab-feature-engineering.ipynb diff --git a/lab-feature-engineering.ipynb b/lab-feature-engineering.ipynb new file mode 100644 index 0000000..b50fc02 --- /dev/null +++ b/lab-feature-engineering.ipynb @@ -0,0 +1,381 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from scipy import stats as st" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\migue\\AppData\\Local\\Temp\\ipykernel_37988\\344141594.py:1: DtypeWarning: Columns (8) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " data = pd.read_csv('learningSet.csv')\n" + ] + } + ], + "source": [ + "data = pd.read_csv('learningSet.csv')\n", + "categorical = pd.read_csv('categorical.csv')\n", + "numerical = pd.read_csv('numerical.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "Y = data[['TARGET_B', 'TARGET_D']]\n", + "numerical = data.select_dtypes(np.number)\n", + "numerical = numerical.drop(columns=Y.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ODATEDW TCODE DOB AGE NUMCHLD INCOME WEALTH1 HIT MBCRAFT MBGARDEN MBBOOKS MBCOLECT MAGFAML MAGFEM MAGMALE PUBGARDN PUBCULIN PUBHLTH PUBDOITY PUBNEWFN PUBPHOTO PUBOPP MALEMILI MALEVET VIETVETS WWIIVETS LOCALGOV STATEGOV FEDGOV WEALTH2 POP901 POP902 POP903 POP90C1 POP90C2 POP90C3 POP90C4 POP90C5 ETH1 ETH2 ETH3 ETH4 ETH5 ETH6 ETH7 ETH8 ETH9 ETH10 ETH11 ETH12 ETH13 ETH14 ETH15 ETH16 AGE901 AGE902 AGE903 AGE904 AGE905 AGE906 AGE907 CHIL1 CHIL2 CHIL3 AGEC1 AGEC2 AGEC3 AGEC4 AGEC5 AGEC6 AGEC7 CHILC1 CHILC2 CHILC3 CHILC4 CHILC5 HHAGE1 HHAGE2 HHAGE3 HHN1 HHN2 HHN3 HHN4 HHN5 HHN6 MARR1 MARR2 MARR3 MARR4 HHP1 HHP2 DW1 DW2 DW3 DW4 DW5 DW6 DW7 DW8 DW9 HV1 HV2 HV3 HV4 HU1 HU2 HU3 HU4 HU5 HHD1 HHD2 HHD3 HHD4 HHD5 HHD6 HHD7 HHD8 HHD9 HHD10 HHD11 HHD12 ETHC1 ETHC2 ETHC3 ETHC4 ETHC5 ETHC6 HVP1 HVP2 HVP3 HVP4 HVP5 HVP6 HUR1 HUR2 RHP1 RHP2 RHP3 RHP4 HUPA1 HUPA2 HUPA3 HUPA4 HUPA5 HUPA6 HUPA7 RP1 RP2 RP3 RP4 MSA ADI DMA IC1 IC2 IC3 IC4 IC5 IC6 IC7 IC8 IC9 IC10 IC11 IC12 IC13 IC14 IC15 IC16 IC17 IC18 IC19 IC20 IC21 IC22 IC23 HHAS1 HHAS2 HHAS3 HHAS4 MC1 MC2 MC3 TPE1 TPE2 TPE3 TPE4 TPE5 TPE6 TPE7 TPE8 TPE9 PEC1 PEC2 TPE10 TPE11 TPE12 TPE13 LFC1 LFC2 LFC3 LFC4 LFC5 LFC6 LFC7 LFC8 LFC9 LFC10 OCC1 OCC2 OCC3 OCC4 OCC5 OCC6 OCC7 OCC8 OCC9 OCC10 OCC11 OCC12 OCC13 EIC1 EIC2 EIC3 EIC4 EIC5 EIC6 EIC7 EIC8 EIC9 EIC10 EIC11 EIC12 EIC13 EIC14 EIC15 EIC16 OEDC1 OEDC2 OEDC3 OEDC4 OEDC5 OEDC6 OEDC7 EC1 EC2 EC3 EC4 EC5 EC6 EC7 EC8 SEC1 SEC2 SEC3 SEC4 SEC5 AFC1 AFC2 AFC3 AFC4 AFC5 AFC6 VC1 VC2 VC3 VC4 ANC1 ANC2 ANC3 ANC4 ANC5 ANC6 ANC7 ANC8 ANC9 ANC10 ANC11 ANC12 ANC13 ANC14 ANC15 POBC1 POBC2 LSC1 LSC2 LSC3 LSC4 VOC1 VOC2 VOC3 HC1 HC2 HC3 HC4 HC5 HC6 HC7 HC8 HC9 HC10 HC11 HC12 HC13 HC14 HC15 HC16 HC17 HC18 HC19 HC20 HC21 MHUC1 MHUC2 AC1 AC2 ADATE_2 ADATE_3 ADATE_4 ADATE_5 ADATE_6 ADATE_7 ADATE_8 ADATE_9 ADATE_10 ADATE_11 ADATE_12 ADATE_13 ADATE_14 ADATE_15 ADATE_16 ADATE_17 ADATE_18 ADATE_19 ADATE_20 ADATE_21 ADATE_22 ADATE_23 ADATE_24 CARDPROM MAXADATE NUMPROM CARDPM12 NUMPRM12 RDATE_3 RDATE_4 RDATE_5 RDATE_6 RDATE_7 RDATE_8 RDATE_9 RDATE_10 RDATE_11 RDATE_12 RDATE_13 RDATE_14 RDATE_15 RDATE_16 RDATE_17 RDATE_18 RDATE_19 RDATE_20 RDATE_21 RDATE_22 RDATE_23 RDATE_24 RAMNT_3 RAMNT_4 RAMNT_5 RAMNT_6 RAMNT_7 RAMNT_8 RAMNT_9 RAMNT_10 RAMNT_11 RAMNT_12 RAMNT_13 RAMNT_14 RAMNT_15 RAMNT_16 RAMNT_17 RAMNT_18 RAMNT_19 RAMNT_20 RAMNT_21 RAMNT_22 RAMNT_23 RAMNT_24 RAMNTALL NGIFTALL CARDGIFT MINRAMNT MINRDATE MAXRAMNT MAXRDATE LASTGIFT LASTDATE FISTDATE NEXTDATE TIMELAG AVGGIFT CONTROLN HPHONE_D RFA_2F CLUSTER2\n", + "False False False False True False False False False False False False False False False False False False False False False False False False False False False False False True False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False True False False False True True True True True True True True True True True True True True True True False False False False False True True True True True False True True True True True True True True True True True True True True True True True True True True True False True True True True True True True True True True True True True True True True False False False False False False False False False False True True False False False False False 825\n", + " True True True True False True True True True True True True True True True True True True True False False False False False False False True False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False True False False False True True True True True True True True True True True True True True True True False False False False False True True True True True False True True True True True True True True True True True True True True True True True True True True True False True True True True True True True True True True True True True True True True False False False False False False False False False False True True False False False False False 554\n", + " False True False True False True True True True True True True True True True True True True True False False False False False False False True False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False True False False False True True True True True True True True True True True True True True True True False False False False False True True True True True False True True True True True True True True True True True True True True True True True True True True True False True True True True True True True True True True True True True True True True False False False False False False False False False False True True False False False False False 505\n", + " False False False False False False False False False False False False False False False False False False False False False False False True False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False True False False False True True False False True True True True True True True True True True True True False False False False False True True True True True True True True True False True True True True True True True True True True True True True True True True True True True True True False True True True True True True True True True True True True False False False False False False False False False False True True False False False False False 408\n", + " True True True True False True True True True True True True True True True True True True True False False False False False False False True False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False True False False False True True False False True True True True True True True True True True True True False False False False False True True True True True True True True True False True True True True True True True True True True True True True True True True True True True True True False True True True True True True True True True True True True False False False False False False False False False False True True False False False False False 335\n", + " ... \n", + " False True False False False False False False False False False False False False False False False False False False False False False False False False True False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False True False True True False True True True True True False False False False False True True True True True True True False False True True False True False True True False True True True True True True True True True True True True False False True True False True False True True False True True True True True False False False False False False False False False False False False False False False False False 1\n", + " True True True True True True True True True True True True True False False True True False True False True True True True True True True True False False False False False False False False False False False False False False False False False 1\n", + " True True True True True False True True True True True True True True True True True True False False True True True True True True True False True True True True True False False False False False False False False False False False False False False False False False 1\n", + " True False False True True False True True False True True True True True True True True True True True True False True False False True True False True True False True True True True True False False False False False False False False False False False False False False False False False 1\n", + " True True True True False True True True True True True True True True True True True True True False False False False False False False True False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False True True True False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False True True True False True False True True True False True True True True True True True True True True True True False False False False False True True True True True True True True True False True True True True True True True True True True True True True True True True True True True True True False True True True True True True True True True True True True False False False False False False False False False False True True False False False False True 1\n", + "Length: 65122, dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numerical.isna().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "A 34484\n", + "B 28505\n", + "D 16580\n", + "C 15524\n", + "Name: GEOCODE2, dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "categorical['GEOCODE2'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "categorical['GEOCODE2'] = categorical['GEOCODE2'].replace(' ', 'A')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9.0 7585\n", + "8.0 6793\n", + "7.0 6198\n", + "6.0 5825\n", + "5.0 5280\n", + "4.0 4810\n", + "3.0 4237\n", + "2.0 4085\n", + "1.0 3454\n", + "0.0 2413\n", + "Name: WEALTH1, dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['WEALTH1'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 50680\n", + "True 44732\n", + "Name: WEALTH1, dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['WEALTH1'].isna().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "data = data.drop('WEALTH1', axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 95280\n", + "True 132\n", + "Name: ADI, dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['ADI'].isna().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "data['ADI'] = data['ADI'].dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "13.0 7296\n", + "51.0 4622\n", + "65.0 3765\n", + "57.0 2836\n", + "105.0 2617\n", + " ... \n", + "651.0 1\n", + "103.0 1\n", + "601.0 1\n", + "161.0 1\n", + "147.0 1\n", + "Name: ADI, Length: 204, dtype: int64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['ADI'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 95412\n", + "Name: ADI, dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(data['ADI'] == ' ').value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 95280\n", + "True 132\n", + "Name: DMA, dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['DMA'].isna().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "data['DMA'] = data['DMA'].dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 95412\n", + "Name: DMA, dtype: int64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(data['DMA'] == ' ').value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 95280\n", + "True 132\n", + "Name: MSA, dtype: int64" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['MSA'].isna().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "data['MSA'] = data['MSA'].dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 95412\n", + "Name: MSA, dtype: int64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(data['MSA'] == ' ').value_counts()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}