In [12]:
import ipaddress
import logging
from pathlib import Path

import numpy as np
import pandas as pd

In [4]:
# Load the IP-range-to-country lookup table from the local data folder.
ip_country_csv = './data/IpAddress_to_Country.csv'
df = pd.read_csv(ip_country_csv)

In [3]:
# Configure file-based logging for this cleaning run.
# The log directory is created first: logging.basicConfig opens the log file
# immediately and raises FileNotFoundError when the parent folder is missing.
log_path = Path('./LogMonitor/IpAddress_to_CountryCleaning.log')
log_path.parent.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    filename=str(log_path),
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [5]:
# Count the null entries in every column and write the per-column totals to the log.
null_mask = df.isna()
missing_values = null_mask.sum()
logging.info("Missing values:\n %s", missing_values)

In [6]:
# Discard rows that exactly repeat an earlier row (the first occurrence is
# kept); the frame is modified in place.
df.drop_duplicates(keep='first', inplace=True)
logging.info("Duplicate entries removed.")

In [16]:
# Trim leading/trailing whitespace from the IP-bound and country columns.
# Both bound columns are cast to str first: the `.str` accessor raises
# AttributeError on a column that was parsed as numeric, and previously only
# the upper bound was cast.
for bound_col in ('lower_bound_ip_address', 'upper_bound_ip_address'):
    df[bound_col] = df[bound_col].astype(str).str.strip()
# country is not cast, so missing values stay NaN rather than becoming the
# text 'nan' (assumes the column was read as text -- TODO confirm).
df['country'] = df['country'].str.strip()
logging.info("Leading and trailing whitespaces removed")


In [15]:
# Standardize both IP-bound columns to plain integer values.
def _ip_to_int(value):
    """Convert a single IP bound to its integer value.

    Accepts either a dotted-quad IPv4 string (e.g. '1.0.0.0') or a numeric
    value / numeric string (e.g. 16777216, '16777216.0').

    Raises:
        ValueError: if the value cannot be parsed in either form, e.g. a
            missing value.
    """
    text = str(value).strip()
    if text.count('.') == 3:
        # Dotted quad: use the real 32-bit value of the address.
        return int(ipaddress.IPv4Address(text))
    # Numeric text: go through float so a trailing '.0' is dropped correctly.
    # Simply deleting the '.' would turn '16777216.0' into '167772160'.
    return int(float(text))


for bound_col in ('lower_bound_ip_address', 'upper_bound_ip_address'):
    df[bound_col] = df[bound_col].apply(_ip_to_int)
logging.info("IP address format standardized")

In [17]:
# Drop rows whose IP bounds cannot describe a valid IPv4 range.
MAX_IPV4_INT = 4294967295  # 2**32 - 1, the largest 32-bit address value

# Cast through float so text such as '16777216.0' parses, then to an explicit
# int64: plain `int` maps to a 32-bit integer on some platforms and cannot
# hold addresses above 2**31 - 1.
df['lower_bound_ip_address'] = df['lower_bound_ip_address'].astype(float).astype('int64')
df['upper_bound_ip_address'] = df['upper_bound_ip_address'].astype(float).astype('int64')

rows_before = len(df)
lower_bound = df['lower_bound_ip_address']
upper_bound = df['upper_bound_ip_address']
# Both ends must lie inside [0, MAX_IPV4_INT] and the range must not be inverted.
bounds_in_range = lower_bound.between(0, MAX_IPV4_INT) & upper_bound.between(0, MAX_IPV4_INT)
# .copy() detaches the filtered frame so that later column assignments do not
# raise SettingWithCopyWarning.
df = df[bounds_in_range & (lower_bound <= upper_bound)].copy()
logging.info("Outliers and invalid IP address ranges handled")
logging.info("Rows dropped by IP range check: %d", rows_before - len(df))


In [18]:
# Keep only the rows whose country appears in the allow-list.
# TODO: valid_countries still holds placeholder names; until it is filled in
# with real country names this filter will discard every real row.
valid_countries = ['Country A', 'Country B', 'Country C']  # replace with actual valid country names
rows_before = len(df)
# .copy() detaches the filtered frame so that later column assignments do not
# raise SettingWithCopyWarning.
df = df[df['country'].isin(valid_countries)].copy()
if df.empty and rows_before:
    # Make a wiped-out dataset visible in the log instead of reporting success only.
    logging.warning("Country validation removed all %d rows; check valid_countries", rows_before)
logging.info("Country values validated")

In [19]:
# Give both IP-bound columns an explicit 64-bit integer dtype.
for bound_col in ('lower_bound_ip_address', 'upper_bound_ip_address'):
    df[bound_col] = df[bound_col].astype('int64')
logging.info("IP address columns converted to int64")

In [21]:
# Normalize the country column by upper-casing every value.
country_upper = df['country'].str.upper()
df['country'] = country_upper
logging.info("Country names normalized to uppercase")


In [22]:
# Substitute one text fragment in the country column with another.
# NOTE(review): both strings look like template placeholders rather than real
# characters -- confirm what should actually be replaced.
placeholder_pattern = 'special_char'
placeholder_replacement = 'replacement_char'
df['country'] = df['country'].str.replace(placeholder_pattern, placeholder_replacement)
logging.info("Special characters handled")


In [23]:
# Write the cleaned frame to CSV (row index omitted) and log completion.
cleaned_csv_path = 'cleaned_ip_mapping_dataset.csv'
df.to_csv(cleaned_csv_path, index=False)
logging.info("Cleaned dataset saved")