diff --git a/[lab-feature-engineering] Sara.ipynb b/[lab-feature-engineering] Sara.ipynb new file mode 100644 index 0000000..431e9b4 --- /dev/null +++ b/[lab-feature-engineering] Sara.ipynb @@ -0,0 +1,468 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "223d1eaf", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b15ef0f9", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data = pd.read_csv('learningSet.txt')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "24a2bbdf", + "metadata": {}, + "outputs": [], + "source": [ + "# Check for null values in the numerical columns.\n", + "# Use appropriate methods to clean the columns GEOCODE2, WEALTH1, ADI, DMA,and MSA.\n", + "# Use appropriate EDA technique where ever necessary." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "19df8494", + "metadata": {}, + "outputs": [], + "source": [ + "categorical = data.select_dtypes(object)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c9432d74", + "metadata": {}, + "outputs": [], + "source": [ + "numerical = data.select_dtypes(include=np.number)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6159f49a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "95412 481\n" + ] + } + ], + "source": [ + "num_rows, num_columns = data.shape\n", + "print(num_rows, num_columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7138c17d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "95412 74\n" + ] + } + ], + "source": [ + "num_rows, num_columns = categorical.shape\n", + "print(num_rows, num_columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "68b33e4c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "95412 407\n" + ] + } + ], + "source": [ + "num_rows, num_columns = numerical.shape\n", + "print(num_rows, num_columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "816f5ee7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Null values by column:\n", + "ODATEDW 0\n", + "TCODE 0\n", + "DOB 0\n", + "AGE 23665\n", + "NUMCHLD 83026\n", + " ... \n", + "TARGET_B 0\n", + "TARGET_D 0\n", + "HPHONE_D 0\n", + "RFA_2F 0\n", + "CLUSTER2 132\n", + "Length: 407, dtype: int64\n" + ] + } + ], + "source": [ + "null_values_by_column = numerical.isnull().sum()\n", + "\n", + "print(\"Null values by column:\")\n", + "print(null_values_by_column)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b5d8313d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A 34484\n", + "B 28505\n", + "D 16580\n", + "C 15524\n", + " 187\n", + "Name: GEOCODE2, dtype: int64\n" + ] + }, + { + "data": { + "text/plain": [ + "132" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(categorical['GEOCODE2'].value_counts())\n", + "categorical['GEOCODE2'].isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "847f5f21", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9.0 7585\n", + "8.0 6793\n", + "7.0 6198\n", + "6.0 5825\n", + "5.0 5280\n", + "4.0 4810\n", + "3.0 4237\n", + "2.0 4085\n", + "1.0 3454\n", + "0.0 2413\n", + "Name: WEALTH1, dtype: int64\n" + ] + }, + { + "data": { + "text/plain": [ + "44732" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(data['WEALTH1'].value_counts())\n", + "data['WEALTH1'].isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "951ea68a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "13.0 7296\n", + "51.0 4622\n", + "65.0 3765\n", + "57.0 2836\n", + "105.0 2617\n", + " ... \n", + "651.0 1\n", + "103.0 1\n", + "601.0 1\n", + "161.0 1\n", + "147.0 1\n", + "Name: ADI, Length: 204, dtype: int64\n" + ] + }, + { + "data": { + "text/plain": [ + "132" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(data['ADI'].value_counts())\n", + "data['ADI'].isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "95a21202", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "803.0 7296\n", + "602.0 4632\n", + "807.0 3765\n", + "505.0 2839\n", + "819.0 2588\n", + " ... \n", + "569.0 1\n", + "554.0 1\n", + "584.0 1\n", + "552.0 1\n", + "516.0 1\n", + "Name: DMA, Length: 206, dtype: int64\n" + ] + }, + { + "data": { + "text/plain": [ + "132" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(data['DMA'].value_counts())\n", + "data['DMA'].isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "a7c3dc40", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.0 21333\n", + "4480.0 4606\n", + "1600.0 4059\n", + "2160.0 2586\n", + "520.0 1685\n", + " ... \n", + "9140.0 1\n", + "3200.0 1\n", + "9280.0 1\n", + "743.0 1\n", + "8480.0 1\n", + "Name: MSA, Length: 298, dtype: int64\n" + ] + }, + { + "data": { + "text/plain": [ + "132" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(data['MSA'].value_counts())\n", + "data['MSA'].isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "e683874f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "A 34803\n", + "B 28505\n", + "D 16580\n", + "C 15524\n", + "Name: GEOCODE2, dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "categorical['GEOCODE2'] = categorical['GEOCODE2'].fillna(data['GEOCODE2'].mode()[0])\n", + "categorical['GEOCODE2'].replace(\" \", categorical['GEOCODE2'].mode()[0], inplace=True)\n", + "categorical['GEOCODE2'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "c4d74296", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numerical['WEALTH1'] = numerical['WEALTH1'].fillna(numerical['WEALTH1'].mode()[0])\n", + "numerical['WEALTH1'].isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "64e3896c", + "metadata": {}, + "outputs": [], + "source": [ + "numerical['ADI'] = numerical['ADI'].fillna(round(numerical['ADI'].mean()))\n", + "numerical['DMA'] = numerical['DMA'].fillna(round(numerical['DMA'].mean()))\n", + "numerical['MSA'] = numerical['MSA'].fillna(round(numerical['MSA'].mean()))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b281a4b3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n", + "0\n", + "0\n" + ] + } + ], + "source": [ + "print(numerical['ADI'].isna().sum())\n", + "print(numerical['DMA'].isna().sum())\n", + "print(numerical['MSA'].isna().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c39c9b9", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2fb7c69", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}