In [1]:
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable


***EDA*** 

    # Handling Missing Values

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('data/RealEstateDataset.csv', sep=';')

In [None]:
df["Cre"]

Unnamed: 0,name_nsi,price,index,environment,quality_of_living,safety,transport,services,relax,condition,...,last_reconstruction,total_floors,floor,lift,balkonies,loggia,cellar,type,rooms,district
14833,Banská Bystrica,159000,,,,,,,,New building,...,,,,1,,,0,2-room apartment,2,Banská Bystrica
12118,Poprad,74990,,,,,,,,Partial reconstruction,...,,10.0,7.0,1,,,0,1-room apartment,1,Poprad
2748,Bratislava - mestská časť Podunajské Biskupice,168000,79.0,72.0,97.0,8.0,93.0,79.0,5.0,Complete reconstruction,...,2022.0,3.0,,0,,,0,3-room apartment,3,Bratislava II


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15403 entries, 0 to 15402
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   name_nsi             15403 non-null  object 
 1   price                15403 non-null  int64  
 2   index                5162 non-null   object 
 3   environment          4132 non-null   object 
 4   quality_of_living    4132 non-null   object 
 5   safety               4132 non-null   object 
 6   transport            4132 non-null   object 
 7   services             4132 non-null   object 
 8   relax                4132 non-null   object 
 9   condition            15076 non-null  object 
 10  area                 14687 non-null  object 
 11  energy_costs         980 non-null    float64
 12  provision            15403 non-null  int64  
 13  certificate          6445 non-null   object 
 14  construction_type    2367 non-null   object 
 15  orientation          1023 non-null  

#### The second way of finding  null values
    # by using the isnull() function.

In [41]:
print(df.isnull().sum())

name_nsi                   0
price                      0
index                  10241
environment            11271
quality_of_living      11271
safety                 11271
transport              11271
services               11271
relax                  11271
condition                327
area                     716
energy_costs           14423
provision                  0
certificate             8958
construction_type      13036
orientation            14380
year_built             11525
last_reconstruction    14721
total_floors            8117
floor                   7087
lift                       0
balkonies              13636
loggia                 13883
cellar                     0
type                       0
rooms                      0
district                   0
dtype: int64


In [8]:
9939+61

10000

## Handling Missing Values

***1. Deleting the columns with missing data***

In [42]:
updated_df = df.dropna(axis=1)

In [43]:
updated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15403 entries, 0 to 15402
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   name_nsi   15403 non-null  object
 1   price      15403 non-null  int64 
 2   provision  15403 non-null  int64 
 3   lift       15403 non-null  int64 
 4   cellar     15403 non-null  int64 
 5   type       15403 non-null  object
 6   rooms      15403 non-null  int64 
 7   district   15403 non-null  object
dtypes: int64(5), object(3)
memory usage: 962.8+ KB


## issues factor
    # may lose valuable information on that feature, because we have deleted it completely due to some null values.

    # Should only be used if there are too many null values.

***2. Deleting the rows with missing data***

In [44]:
updated_df = df.dropna(axis=0)

In [45]:
updated_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   name_nsi             0 non-null      object 
 1   price                0 non-null      int64  
 2   index                0 non-null      object 
 3   environment          0 non-null      object 
 4   quality_of_living    0 non-null      object 
 5   safety               0 non-null      object 
 6   transport            0 non-null      object 
 7   services             0 non-null      object 
 8   relax                0 non-null      object 
 9   condition            0 non-null      object 
 10  area                 0 non-null      object 
 11  energy_costs         0 non-null      float64
 12  provision            0 non-null      int64  
 13  certificate          0 non-null      object 
 14  construction_type    0 non-null      object 
 15  orientation          0 non-null      object 
 16  year_bu

## In this case, there are possibilities of getting better accuracy than before.
    # columns may contain more valuable information than we expected.

***3. Filling the Missing Values – Imputation***

In this case, we will be filling the missing values with a certain number.

The possible ways to do this are:

- Filling the missing data with the mean or median value if it’s a numerical variable.
- Filling the missing data with mode if it’s a categorical value.
- Filling the numerical value with 0 or -999, or some other number that will not occur in the data. This can be done so that the machine can recognize that the data is not real or is different.
- Filling the categorical value with a new type for the missing values.

In [51]:
df['energy_costs'].mean()

150.6204081632653

In [47]:
df['price'].median()

136900.0

In [48]:
df['relax'].mode()

0    7,1
Name: relax, dtype: object

In [53]:
#fillna: fills the null records
#dropna: drops the null records

updated_df = df
updated_df['index']=updated_df['index'].fillna(updated_df['index'].mode()[0])
updated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15403 entries, 0 to 15402
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   name_nsi             15403 non-null  object 
 1   price                15403 non-null  int64  
 2   index                15403 non-null  object 
 3   environment          4132 non-null   object 
 4   quality_of_living    4132 non-null   object 
 5   safety               4132 non-null   object 
 6   transport            4132 non-null   object 
 7   services             4132 non-null   object 
 8   relax                4132 non-null   object 
 9   condition            15076 non-null  object 
 10  area                 14687 non-null  object 
 11  energy_costs         15403 non-null  float64
 12  provision            15403 non-null  int64  
 13  certificate          6445 non-null   object 
 14  construction_type    2367 non-null   object 
 15  orientation          1023 non-null  

In [55]:
# use filter to include only numeric columns
numeric_columns = updated_df.select_dtypes(include=[np.number]).columns.tolist()
print("Numeric columns:", numeric_columns)

Numeric columns: ['price', 'energy_costs', 'provision', 'year_built', 'last_reconstruction', 'total_floors', 'floor', 'lift', 'balkonies', 'loggia', 'cellar', 'rooms']


In [10]:
# Display the data types of the first 5 rows' descriptive statistics
print(df.head().describe().dtypes)
# Display descriptive statistics for both numeric and object columns
print(df.head().describe(include='all').dtypes)

# Show count of missing values for each column
print("\nMissing values per column:\n", df.head().isnull().sum())

# Show the first 5 rows with missing values highlighted
df.head().style.highlight_null(props='background-color: red')


price                  float64
energy_costs           float64
provision              float64
year_built             float64
last_reconstruction    float64
total_floors           float64
floor                  float64
lift                   float64
balkonies              float64
loggia                 float64
cellar                 float64
rooms                  float64
dtype: object
name_nsi                object
price                  float64
index                   object
environment             object
quality_of_living       object
safety                  object
transport               object
services                object
relax                   object
condition               object
area                    object
energy_costs           float64
provision              float64
certificate             object
construction_type       object
orientation             object
year_built             float64
last_reconstruction    float64
total_floors           float64
floor                  fl

Unnamed: 0,name_nsi,price,index,environment,quality_of_living,safety,transport,services,relax,condition,area,energy_costs,provision,certificate,construction_type,orientation,year_built,last_reconstruction,total_floors,floor,lift,balkonies,loggia,cellar,type,rooms,district
0,Semerovo,42000,,,,,,,,Original condition,58,,0,,,,,,,,0,,,0,3-room apartment,3,Nové Zámky
1,Semerovo,42000,,,,,,,,Original condition,58,,0,none,Brick,,,,2.0,,0,,,0,3-room apartment,3,Nové Zámky
2,Štúrovo,107000,83.0,,,,,,,Partial reconstruction,40,,0,,,,,,5.0,3.0,0,,,0,1-room apartment,1,Nové Zámky
3,Štúrovo,105000,,,,,,,,Complete reconstruction,76,200.0,1,C,,,,,7.0,4.0,1,,,0,3-room apartment,3,Nové Zámky
4,Štúrovo,82000,,,,,,,,Partial reconstruction,63,,0,,,,,2018.0,,2.0,0,,,0,2-room apartment,2,Nové Zámky


In [12]:
# filling missing value to more than two columns


df_numeric_data = ['CreditScore', 'Age', 'Tenure', 'Balance', 
                   'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited']

updated_df = df.copy()

for col in df_numeric_data:
    if updated_df[col].isnull().sum() > 0:  # Check if column has missing values
        mean_value = updated_df[col].mean()
        updated_df[col] = updated_df[col].fillna(mean_value)

updated_df.info()


KeyError: 'CreditScore'

In [15]:
updated_df1 = df
updated_df1['Age']=updated_df1['Age'].fillna(updated_df1['Age'].median())
updated_df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  float64
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    9999 non-null   float64
 10  HasCrCard        9998 non-null   float64
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 1.1+ MB


In [16]:
updated_df = df
updated_df['Gender']=updated_df['Gender'].fillna(updated_df['Gender'].mode()[0])
# updated_df.info()
updated_df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  float64
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    9999 non-null   float64
 10  HasCrCard        9998 non-null   float64
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 1.1+ MB
