# Testing Spatial Interpolation on Faridkot District in Dataset
---

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## Data Loading and Cleaning
---

In [14]:
# Load the full dataset from the CSV one directory above the notebook,
# then preview its first rows.
raw_csv_path = '../uranium2.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Year,State,District,Block/Taluka,Location/Site/Village,Latitude,Longitude,pH,EC (µS/cm),CO3 (mg/L),...,Total Hardness,Ca (mg/L),Mg (mg/L),Na (mg/L),K (mg/L),SiO2,TDS,Fe (ppm),As (ppb),U (ppb)
0,2023,Punjab,Amritsar,,Jagdev Khurd (Dalla),31.907,74.7391,7.58,1233.0,0.0,...,380.0,96.0,34.0,65.0,14.0,,,1.11,35.9,0.57
1,2023,Punjab,Amritsar,,Laungo Mahal,31.9059,74.8189,7.96,683.0,0.0,...,200.0,40.0,24.0,87.0,6.0,,,0.04,54.06,3.77
2,2023,Punjab,Amritsar,,Makowal,31.9046,74.8904,8.2,412.0,0.0,...,180.0,32.0,24.0,21.0,7.0,,,0.05,44.44,3.04
3,2023,Punjab,Amritsar,,Malakpur,31.9534,74.8453,8.2,442.0,0.0,...,170.0,32.0,22.0,20.0,5.0,,,1.76,35.85,1.04
4,2023,Punjab,Amritsar,,Ramdas,31.9667,74.9111,8.1,373.0,0.0,...,110.0,24.0,12.0,47.0,4.0,,,0.05,36.31,0.11


In [15]:
# Keep only the rows whose District is 'Faridkot', most recent Year first.
# The index is reset *after* sorting (not before) so the result always carries
# a clean 0..n-1 index, even when the sort actually reorders rows.
df_farid = (
    df[df['District'] == 'Faridkot']
    .sort_values(by='Year', ascending=False)
    .reset_index(drop=True)
)
df_farid.head()

Unnamed: 0,Year,State,District,Block/Taluka,Location/Site/Village,Latitude,Longitude,pH,EC (µS/cm),CO3 (mg/L),...,Total Hardness,Ca (mg/L),Mg (mg/L),Na (mg/L),K (mg/L),SiO2,TDS,Fe (ppm),As (ppb),U (ppb)
0,2023,Punjab,Faridkot,,Bir Chahal,30.6557,74.8166,8.49,1943.0,48.0,...,380.0,24.0,78.0,297.0,11.0,,,0.02,1.9,156.7
1,2023,Punjab,Faridkot,,Chand Baja,30.739,74.862,7.91,2316.0,0.0,...,360.0,28.0,71.0,319.0,163.0,,,0.18,1.3,12.0
2,2023,Punjab,Faridkot,,Daggo Romana,30.6057,74.6965,7.94,159.0,0.0,...,70.0,8.0,12.0,9.0,3.0,,,0.22,3.3,1.45
3,2023,Punjab,Faridkot,,Dalsinghwala,30.2713,74.5536,8.27,1724.0,0.0,...,400.0,60.0,61.0,205.0,13.0,,,-0.02,0.8,28.4
4,2023,Punjab,Faridkot,,Deep Singh Wala,30.7389,74.4792,7.89,2378.0,0.0,...,280.0,28.0,51.0,471.0,13.0,,,0.04,1.72,21.47


### Data Cleaning

In [16]:
# Pull out every column pandas stored with the generic `object` dtype;
# these are the candidates that may need conversion to numbers.
obj_cols = df_farid.select_dtypes(include="object")
print(obj_cols.columns)

Index(['State', 'District', 'Block/Taluka', 'Location/Site/Village', 'pH',
       'F (mg/L)', 'SO4 (mg/L)', 'NO3 (mg/L)', 'PO4 (mg/L)', 'K (mg/L)',
       'SiO2'],
      dtype='object')


In [17]:
# Remove the location-specific text columns, leaving only the object-typed
# columns that should be converted to numbers.
# They are dropped by name rather than by position: a positional slice would
# discard the wrong columns if one of the location columns were not
# object-typed (e.g. entirely empty) and therefore absent from obj_cols.
location_cols = ['State', 'District', 'Block/Taluka', 'Location/Site/Village']
obj_to_num = obj_cols.drop(columns=location_cols, errors='ignore')
obj_to_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   pH          118 non-null    object
 1   F (mg/L)    118 non-null    object
 2   SO4 (mg/L)  118 non-null    object
 3   NO3 (mg/L)  118 non-null    object
 4   PO4 (mg/L)  118 non-null    object
 5   K (mg/L)    118 non-null    object
 6   SiO2        44 non-null     object
dtypes: object(7)
memory usage: 6.6+ KB


In [18]:
# Seeing why: list every distinct value that cannot be parsed as a number,
# per inspected column, together with how often it occurs.
def find_non_numeric_values(df, columns=None):
    """Tabulate the values in selected columns that cannot be parsed as numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    columns : iterable of column labels, optional
        Columns of ``df`` to check. When omitted, the notebook-level
        ``obj_to_num.columns`` is used (the original behaviour), so that
        frame must already be defined in the kernel.

    Returns
    -------
    pandas.DataFrame
        One row per distinct non-numeric value, with the columns ``'Col'``,
        ``'Non-numeric value'`` and ``'Count'``. Missing values are not
        reported because ``value_counts`` drops them by default. When every
        value parses, the result is empty but keeps the same three columns.
    """
    if columns is None:
        # Backward-compatible fallback to the notebook global.
        columns = obj_to_num.columns

    records = []
    for col in columns:
        # errors="coerce" turns anything unparseable into NaN, so the NaN mask
        # marks exactly the offending (or already-missing) entries.
        unparseable = pd.to_numeric(df[col], errors="coerce").isna()
        counts = df.loc[unparseable, col].value_counts()
        for value, count in counts.items():
            records.append({'Col': col, 'Non-numeric value': value, 'Count': count})

    return pd.DataFrame(records, columns=['Col', 'Non-numeric value', 'Count'])

In [19]:
# Run the non-numeric check on the Faridkot subset and display what it found.
_result_df_1 = find_non_numeric_values(df_farid)
_result_df_1

Unnamed: 0,Col,Non-numeric value,Count
0,PO4 (mg/L),<0.10,21


In [20]:
# Replace the one non-numeric entry found above, the string "<0.10", with the
# number 0.10 wherever it occurs in the frame, then re-run the check to
# confirm that no non-numeric values are left.
below_limit_values = {"<0.10": 0.10}
_df_clean1 = df_farid.replace(below_limit_values)
_result_df_2 = find_non_numeric_values(_df_clean1)
_result_df_2

In [21]:
# Cast every former object-typed measurement column to float in a single
# astype call; astype returns a new frame, so _df_clean1 is left untouched.
float_dtypes = dict.fromkeys(obj_to_num.columns, float)
_df_clean2 = _df_clean1.astype(float_dtypes)
_df_clean2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   118 non-null    int64  
 1   State                  118 non-null    object 
 2   District               118 non-null    object 
 3   Block/Taluka           83 non-null     object 
 4   Location/Site/Village  118 non-null    object 
 5   Latitude               118 non-null    float64
 6   Longitude              118 non-null    float64
 7   pH                     118 non-null    float64
 8   EC (µS/cm)             118 non-null    float64
 9   CO3 (mg/L)             118 non-null    float64
 10  HCO3 (mg/L)            118 non-null    float64
 11  Cl (mg/L)              118 non-null    float64
 12  F (mg/L)               118 non-null    float64
 13  SO4 (mg/L)             118 non-null    float64
 14  NO3 (mg/L)             118 non-null    float64
 15  PO4 (m

In [22]:
# Final cleaned dataset: an independent (deep) copy of the converted frame,
# so later work on df_clean cannot alter the intermediate results.
df_clean = _df_clean2.copy()
df_clean.head()

Unnamed: 0,Year,State,District,Block/Taluka,Location/Site/Village,Latitude,Longitude,pH,EC (µS/cm),CO3 (mg/L),...,Total Hardness,Ca (mg/L),Mg (mg/L),Na (mg/L),K (mg/L),SiO2,TDS,Fe (ppm),As (ppb),U (ppb)
0,2023,Punjab,Faridkot,,Bir Chahal,30.6557,74.8166,8.49,1943.0,48.0,...,380.0,24.0,78.0,297.0,11.0,,,0.02,1.9,156.7
1,2023,Punjab,Faridkot,,Chand Baja,30.739,74.862,7.91,2316.0,0.0,...,360.0,28.0,71.0,319.0,163.0,,,0.18,1.3,12.0
2,2023,Punjab,Faridkot,,Daggo Romana,30.6057,74.6965,7.94,159.0,0.0,...,70.0,8.0,12.0,9.0,3.0,,,0.22,3.3,1.45
3,2023,Punjab,Faridkot,,Dalsinghwala,30.2713,74.5536,8.27,1724.0,0.0,...,400.0,60.0,61.0,205.0,13.0,,,-0.02,0.8,28.4
4,2023,Punjab,Faridkot,,Deep Singh Wala,30.7389,74.4792,7.89,2378.0,0.0,...,280.0,28.0,51.0,471.0,13.0,,,0.04,1.72,21.47


## Analysing Spatial Variography
---

We analyse how uranium concentration changes with the distance between sample points in the Faridkot district of Punjab, to test whether spatial interpolation techniques can be applied to estimate uranium concentration at unsampled locations.