In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Configuration 
import configparser

# Create ConfigParser object
config = configparser.ConfigParser()
config.read("config.ini", encoding="utf-8")  # Ensure UTF-8 encoding

df_4_eda = config.get("4_EDA", "pkl_filename")
drop_columns = config.get("5_ENCODE", "drop_columns").replace("\n", "").split(", ")
utvecklingsstadium_min = config.getint("5_ENCODE", "utvecklingsstadium_min")
utvecklingsstadium_max = config.getint("5_ENCODE", "utvecklingsstadium_max")
ohe_variables = config.get("5_ENCODE", "ohe_variables").replace("\n", "").split(", ")
te_variables = config.get("5_ENCODE", "te_variables").replace("\n", "").split(", ")
target_variable = config.get("5_ENCODE", "target_variable")
num_variables = config.get("5_ENCODE", "num_variables").replace("\n", "").split(", ")

pkl_filename = config.get("5_ENCODE", "pkl_filename")
csv_filename = config.get("5_ENCODE", "csv_filename")
csv_filename_chk = config.get("5_ENCODE", "csv_filename_chk")

In [3]:
# unpickle - pandas dataframe
df = pd.read_pickle(df_4_eda)

print(df.shape)

(6165, 35)


In [4]:
df

Unnamed: 0,delomrade,lan,latitud,longitud,groda,sort,jordart,skordear,graderingsdatum,graderingstyp,...,BT_7d_sum,G0_7d_sum,G3_7d_sum,G5_7d_sum,G7_7d_sum,G8_7d_sum,G10_7d_sum,g_year,g_month,g_week
0,Blekinge,Blekinge län,6218000,479000,Höstvete,Mariboss,Lättlera (15-25 % ler),2016,2016-04-18,Veckovis,...,100.0,126.0,7.0,0.0,0.0,0.0,0.0,2016,4,16
1,Blekinge,Blekinge län,6218000,479000,Höstvete,Mariboss,Lättlera (15-25 % ler),2016,2016-04-25,Veckovis,...,0.0,126.0,7.0,0.0,0.0,0.0,0.0,2016,4,17
2,Blekinge,Blekinge län,6218000,479000,Höstvete,Mariboss,Lättlera (15-25 % ler),2016,2016-05-02,Veckovis,...,100.0,220.0,45.0,12.0,0.0,0.0,0.0,2016,5,18
3,Blekinge,Blekinge län,6218000,479000,Höstvete,Mariboss,Lättlera (15-25 % ler),2016,2016-05-09,Veckovis,...,0.0,418.0,128.0,41.0,0.0,0.0,0.0,2016,5,19
4,Blekinge,Blekinge län,6218000,479000,Höstvete,Mariboss,Lättlera (15-25 % ler),2016,2016-05-16,Veckovis,...,0.0,549.0,163.0,49.0,0.0,0.0,0.0,2016,5,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6233,SÖ Skåne,Skåne län,6156000,452000,Höstvete,Pondus,Unknown,2024,2024-05-20,Veckovis,...,300.0,466.0,148.0,56.0,18.0,7.0,0.0,2024,5,21
6234,Stockholm,Stockholms län,6593000,655000,Höstvete,Kask,Unknown,2024,2024-05-06,Veckovis,...,300.0,195.0,35.0,5.0,0.0,0.0,0.0,2024,5,19
6235,Stockholm,Stockholms län,6593000,655000,Höstvete,Kask,Unknown,2024,2024-05-13,Veckovis,...,300.0,252.0,48.0,8.0,0.0,0.0,0.0,2024,5,20
6236,Stockholm,Stockholms län,6593000,655000,Höstvete,Norin,Unknown,2024,2024-05-06,Veckovis,...,300.0,195.0,35.0,5.0,0.0,0.0,0.0,2024,5,19


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6165 entries, 0 to 6237
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   delomrade           6165 non-null   object        
 1   lan                 6165 non-null   object        
 2   latitud             6165 non-null   object        
 3   longitud            6165 non-null   object        
 4   groda               6165 non-null   object        
 5   sort                6131 non-null   object        
 6   jordart             6165 non-null   object        
 7   skordear            6165 non-null   int64         
 8   graderingsdatum     6165 non-null   datetime64[ns]
 9   graderingstyp       6165 non-null   object        
 10  utvecklingsstadium  6165 non-null   int64         
 11  Bladfläcksvampar    6165 non-null   float64       
 12  Sädesbladlus        5459 non-null   float64       
 13  WGS84N              6165 non-null   float64       
 1

In [6]:
# Drop unwanted columns:
df.drop(drop_columns, axis=1, inplace=True)
df.shape

(6165, 27)

In [7]:
# Drop unwanted rows:
df = df[(df['utvecklingsstadium'] > utvecklingsstadium_min) & (df['utvecklingsstadium'] < utvecklingsstadium_max)]
df.shape

(3642, 27)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3642 entries, 3 to 6237
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   groda               3642 non-null   object 
 1   sort                3619 non-null   object 
 2   jordart             3642 non-null   object 
 3   utvecklingsstadium  3642 non-null   int64  
 4   Bladfläcksvampar    3642 non-null   float64
 5   WGS84N              3642 non-null   float64
 6   WGS84E              3642 non-null   float64
 7   TM_7d_avg           3642 non-null   float64
 8   TX_7d_avg           3642 non-null   float64
 9   TN_7d_avg           3642 non-null   float64
 10  UM_7d_avg           3642 non-null   float64
 11  UX_7d_avg           3642 non-null   float64
 12  UN_7d_avg           3642 non-null   float64
 13  FM2_7d_avg          3642 non-null   float64
 14  CLDM_7d_avg         3642 non-null   float64
 15  Q0_7d_avg           3642 non-null   float64
 16  RR_7d_sum  

In [9]:
df = df.reset_index(drop=True)

# Categorical Variables:

In [10]:
print(df.select_dtypes(include=['object']).columns.tolist())

['groda', 'sort', 'jordart']


In [11]:
# Check the distribution of categorical variables:
print(df.select_dtypes(include=['object']).nunique())

groda       1
sort       30
jordart     7
dtype: int64


In [12]:
print(df['groda'].value_counts())

groda
Höstvete    3642
Name: count, dtype: int64


In [13]:
print(df['sort'].value_counts())

sort
Julius        551
Praktik       465
Mariboss      366
Brons         357
Torp          329
Linus         215
Hereford      207
Norin         150
Informer      129
RGT Reform    116
Ellvis        107
KWS Kerrin     92
Cubus          90
Hallfreda      74
Nordh          61
Memory         55
Kask           51
Etana          45
Pondus         40
Bright         23
Frontal        21
Saknas         20
Barranco        9
Olivin          8
Terence         8
Opus            7
Stava           7
Fenomen         6
Jonas           6
RGT Saki        4
Name: count, dtype: int64


In [14]:
print(df['jordart'].value_counts())

jordart
Unknown                       1874
Lättlera (15-25 % ler)         762
Sandjord (<5 % ler)            361
Leriga jordar (5-15 % ler)     298
Mellanlera (25-40 % ler)       263
Styv lera (>40 % ler)           77
Mulljord                         7
Name: count, dtype: int64


In [15]:
# One-Hot Encoding:

ohe_var = ohe_variables
#ohe = OneHotEncoder(categories = 'auto')   # drop='none', dtype=np.int32, handle_unknown='ignore', sparse_output=False
ohe = OneHotEncoder(categories = 'auto', drop=None, sparse_output=False)   # dtype=np.int32, handle_unknown='ignore'
ohe_data = ohe.fit_transform(df[ohe_var])
#ohe_df = pd.DataFrame(data = ohe_data.toarray(), columns = ohe.get_feature_names_out(ohe_var))
ohe_df = pd.DataFrame(data = ohe_data, columns = ohe.get_feature_names_out(ohe_var), index=df.index)

ohe_df

Unnamed: 0,groda_Höstvete,jordart_Leriga jordar (5-15 % ler),jordart_Lättlera (15-25 % ler),jordart_Mellanlera (25-40 % ler),jordart_Mulljord,jordart_Sandjord (<5 % ler),jordart_Styv lera (>40 % ler),jordart_Unknown
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
3637,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3638,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3639,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3640,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [16]:
#ohe_df.rename(columns={
#    'jordart_Lättlera (15-25 % ler)': 'jordart_Lättlera',
#    'jordart_Leriga jordar (5-15 % ler)': 'jordart_Leriga',
#    'jordart_Mellanlera (25-40 % ler)': 'jordart_Mellanlera',
#    'jordart_Mulljord': 'jordart_Mulljord',
#    'jordart_Sandjord (<5 % ler)': 'jordart_Sandjord',    
#    'jordart_Styv lera (>40 % ler)': 'jordart_Styv_lera',
#    'jordart_Unknown': 'jordart_Unknown',
#    'RGT Reform': 'RGT_Reform',
#    'RGT Koi': 'RGT_Koi'
#}, inplace=True)

#ohe_df

In [17]:
ohe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3642 entries, 0 to 3641
Data columns (total 8 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   groda_Höstvete                      3642 non-null   float64
 1   jordart_Leriga jordar (5-15 % ler)  3642 non-null   float64
 2   jordart_Lättlera (15-25 % ler)      3642 non-null   float64
 3   jordart_Mellanlera (25-40 % ler)    3642 non-null   float64
 4   jordart_Mulljord                    3642 non-null   float64
 5   jordart_Sandjord (<5 % ler)         3642 non-null   float64
 6   jordart_Styv lera (>40 % ler)       3642 non-null   float64
 7   jordart_Unknown                     3642 non-null   float64
dtypes: float64(8)
memory usage: 227.8 KB


In [18]:
from category_encoders import TargetEncoder

tae_var = te_variables
target = target_variable  # Target variable
tae = TargetEncoder(cols=tae_var)
tae_data = tae.fit_transform(df[tae_var], df[target])
tae_df = pd.DataFrame(data = tae_data, columns = tae_var, index=df.index)

tae_df

Unnamed: 0,sort
0,5.807459
1,5.807459
2,5.807459
3,5.807459
4,5.807459
...,...
3637,5.842724
3638,5.842724
3639,11.211486
3640,9.586729


# Numerical Variables:

In [19]:
print(df.select_dtypes(include=['number']).columns.tolist())

['utvecklingsstadium', 'Bladfläcksvampar', 'WGS84N', 'WGS84E', 'TM_7d_avg', 'TX_7d_avg', 'TN_7d_avg', 'UM_7d_avg', 'UX_7d_avg', 'UN_7d_avg', 'FM2_7d_avg', 'CLDM_7d_avg', 'Q0_7d_avg', 'RR_7d_sum', 'BT_7d_sum', 'G0_7d_sum', 'G3_7d_sum', 'G5_7d_sum', 'G7_7d_sum', 'G8_7d_sum', 'G10_7d_sum', 'g_year', 'g_month', 'g_week']


In [20]:
# Remove target variable from num_variables (if it exists)
if target_variable in num_variables:
    num_variables.remove(target_variable)

num_var=num_variables

In [21]:
# Standard Scaler:

standard_scaler = StandardScaler()
sts_data = standard_scaler.fit_transform(df[num_var])
sts_df = pd.DataFrame(data = sts_data, columns = num_var, index=df.index)

sts_df

Unnamed: 0,utvecklingsstadium,WGS84N,WGS84E,TM_7d_avg,TX_7d_avg,TN_7d_avg,UM_7d_avg,UX_7d_avg,UN_7d_avg,FM2_7d_avg,CLDM_7d_avg,Q0_7d_avg,RR_7d_sum,BT_7d_sum,G0_7d_sum,G3_7d_sum,G5_7d_sum,G7_7d_sum,G8_7d_sum,G10_7d_sum
0,-1.016192,-2.842171e-14,0.0,0.091776,0.221143,0.017808,-0.687976,-0.397392,-0.561158,-0.154727,0.502183,-1.304867,-0.233486,-0.802437,-0.690650,-0.653107,-0.700018,-0.837370,-0.788203,-0.662958
1,-0.939836,-2.842171e-14,0.0,-0.592501,-0.389441,-0.754170,-1.157496,-1.502025,-1.044478,-0.467076,0.399384,-0.600145,-0.629299,-0.802437,-0.423158,-0.509297,-0.641033,-0.837370,-0.788203,-0.662958
2,-0.558057,-2.842171e-14,0.0,0.079718,0.210240,0.139699,-1.633036,-0.968386,-1.961548,0.140271,0.261248,-0.038841,0.162328,-0.802437,-0.163833,-0.340835,-0.537810,-0.712224,-0.749246,-0.662958
3,-0.252633,-2.842171e-14,0.0,1.394011,1.695813,0.989811,-1.867797,-2.035665,-1.640461,-0.961630,-0.851073,1.056568,-0.130867,-0.802437,0.475290,0.374105,0.221615,0.052554,0.029886,0.000125
4,0.969062,-2.842171e-14,0.0,0.429392,0.719969,0.514748,-0.785793,-0.167927,-0.996034,0.712912,-0.117022,0.463119,-0.673279,-0.802437,1.283894,1.446514,1.578258,1.735067,1.880326,2.238032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3637,-0.939836,-2.842171e-14,0.0,-0.737194,-1.013654,-0.785424,0.716070,0.555154,0.822332,1.085997,-0.794051,-0.038841,0.279606,0.903288,-1.029610,-0.981815,-0.943329,-0.837370,-0.788203,-0.662958
3638,-0.558057,-2.842171e-14,0.0,0.182209,0.093030,0.277217,0.425630,0.288334,0.259022,0.539384,-0.587650,0.010613,-0.086888,0.903288,-0.592638,-0.570930,-0.589422,-0.587079,-0.651855,-0.662958
3639,-1.016192,-2.842171e-14,0.0,-0.737194,-1.013654,-0.785424,0.716070,0.555154,0.822332,1.085997,-0.794051,-0.038841,0.279606,0.903288,-1.029610,-0.981815,-0.943329,-0.837370,-0.788203,-0.662958
3640,-1.016192,-2.842171e-14,0.0,-0.571400,-0.817395,-0.391622,1.262339,1.355613,1.224536,-1.100453,-0.117022,-0.617454,0.030390,0.903288,-1.146000,-1.035230,-0.965448,-0.837370,-0.788203,-0.662958


In [22]:
sts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3642 entries, 0 to 3641
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   utvecklingsstadium  3642 non-null   float64
 1   WGS84N              3642 non-null   float64
 2   WGS84E              3642 non-null   float64
 3   TM_7d_avg           3642 non-null   float64
 4   TX_7d_avg           3642 non-null   float64
 5   TN_7d_avg           3642 non-null   float64
 6   UM_7d_avg           3642 non-null   float64
 7   UX_7d_avg           3642 non-null   float64
 8   UN_7d_avg           3642 non-null   float64
 9   FM2_7d_avg          3642 non-null   float64
 10  CLDM_7d_avg         3642 non-null   float64
 11  Q0_7d_avg           3642 non-null   float64
 12  RR_7d_sum           3642 non-null   float64
 13  BT_7d_sum           3642 non-null   float64
 14  G0_7d_sum           3642 non-null   float64
 15  G3_7d_sum           3642 non-null   float64
 16  G5_7d_

In [23]:
# MinMax Scaler:

minmax_scaler = MinMaxScaler()
mms_data = minmax_scaler.fit_transform(df[num_var])
mms_df = pd.DataFrame(data=mms_data, columns=num_var, index=df.index)

mms_df

Unnamed: 0,utvecklingsstadium,WGS84N,WGS84E,TM_7d_avg,TX_7d_avg,TN_7d_avg,UM_7d_avg,UX_7d_avg,UN_7d_avg,FM2_7d_avg,CLDM_7d_avg,Q0_7d_avg,RR_7d_sum,BT_7d_sum,G0_7d_sum,G3_7d_sum,G5_7d_sum,G7_7d_sum,G8_7d_sum,G10_7d_sum
0,0.000000,0.0,0.0,0.538164,0.550067,0.587234,0.338312,0.473587,0.334680,0.333333,0.792188,0.189586,0.099476,0.000000,0.149332,0.091822,0.046964,0.000000,0.000000,0.000000
1,0.026316,0.0,0.0,0.428502,0.450378,0.455851,0.229941,0.219287,0.219171,0.274220,0.770544,0.316422,0.028796,0.000000,0.202347,0.116930,0.056128,0.000000,0.000000,0.000000
2,0.157895,0.0,0.0,0.536232,0.548287,0.607979,0.120181,0.342138,0.000000,0.389163,0.741461,0.417445,0.170157,0.000000,0.253743,0.146341,0.072165,0.017308,0.005076,0.000000
3,0.263158,0.0,0.0,0.746860,0.790832,0.752660,0.065995,0.096437,0.076737,0.180624,0.507271,0.614597,0.117801,0.000000,0.380413,0.271162,0.190149,0.123077,0.106599,0.073733
4,0.684211,0.0,0.0,0.592271,0.631509,0.671809,0.315735,0.526413,0.230749,0.497537,0.661819,0.507788,0.020942,0.000000,0.540672,0.458393,0.400916,0.355769,0.347716,0.322581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3637,0.026316,0.0,0.0,0.405314,0.348465,0.450532,0.662383,0.692875,0.665320,0.568144,0.519276,0.417445,0.191099,0.428571,0.082153,0.034433,0.009164,0.000000,0.000000,0.000000
3638,0.157895,0.0,0.0,0.552657,0.529150,0.631383,0.595346,0.631450,0.530695,0.464696,0.562732,0.426346,0.125654,0.428571,0.168758,0.106169,0.064147,0.034615,0.017766,0.000000
3639,0.000000,0.0,0.0,0.405314,0.348465,0.450532,0.662383,0.692875,0.665320,0.568144,0.519276,0.417445,0.191099,0.428571,0.082153,0.034433,0.009164,0.000000,0.000000,0.000000
3640,0.000000,0.0,0.0,0.431884,0.380507,0.517553,0.788468,0.877150,0.761443,0.154351,0.661819,0.313307,0.146597,0.428571,0.059085,0.025108,0.005727,0.000000,0.000000,0.000000


In [24]:
mms_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3642 entries, 0 to 3641
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   utvecklingsstadium  3642 non-null   float64
 1   WGS84N              3642 non-null   float64
 2   WGS84E              3642 non-null   float64
 3   TM_7d_avg           3642 non-null   float64
 4   TX_7d_avg           3642 non-null   float64
 5   TN_7d_avg           3642 non-null   float64
 6   UM_7d_avg           3642 non-null   float64
 7   UX_7d_avg           3642 non-null   float64
 8   UN_7d_avg           3642 non-null   float64
 9   FM2_7d_avg          3642 non-null   float64
 10  CLDM_7d_avg         3642 non-null   float64
 11  Q0_7d_avg           3642 non-null   float64
 12  RR_7d_sum           3642 non-null   float64
 13  BT_7d_sum           3642 non-null   float64
 14  G0_7d_sum           3642 non-null   float64
 15  G3_7d_sum           3642 non-null   float64
 16  G5_7d_

In [25]:
# Cyclic Encoding:

# cye_df = pd.DataFrame()
# cye_df['year'] = df['year']
# cye_df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
# cye_df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
# cye_df['week_sin'] = np.sin(2 * np.pi * df['week'] / 52)
# cye_df['week_cos'] = np.cos(2 * np.pi * df['week'] / 52)

cye_df = pd.DataFrame({
    'g_year': df['g_year'],
    'g_month_sin': np.sin(2 * np.pi * df['g_month'] / 12),
    'g_month_cos': np.cos(2 * np.pi * df['g_month'] / 12),
    'g_week_sin': np.sin(2 * np.pi * df['g_week'] / 52),
    'g_week_cos': np.cos(2 * np.pi * df['g_week'] / 52)
},index=df.index)   

cye_df

Unnamed: 0,g_year,g_month_sin,g_month_cos,g_week_sin,g_week_cos
0,2016,5.000000e-01,-0.866025,0.748511,-0.663123
1,2016,5.000000e-01,-0.866025,0.663123,-0.748511
2,2016,5.000000e-01,-0.866025,0.568065,-0.822984
3,2016,5.000000e-01,-0.866025,0.464723,-0.885456
4,2016,1.224647e-16,-1.000000,0.354605,-0.935016
...,...,...,...,...,...
3637,2024,5.000000e-01,-0.866025,0.663123,-0.748511
3638,2024,5.000000e-01,-0.866025,0.568065,-0.822984
3639,2024,5.000000e-01,-0.866025,0.663123,-0.748511
3640,2024,5.000000e-01,-0.866025,0.748511,-0.663123


In [26]:
cye_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3642 entries, 0 to 3641
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   g_year       3642 non-null   int32  
 1   g_month_sin  3642 non-null   float64
 2   g_month_cos  3642 non-null   float64
 3   g_week_sin   3642 non-null   Float64
 4   g_week_cos   3642 non-null   Float64
dtypes: Float64(2), float64(2), int32(1)
memory usage: 135.3 KB


In [27]:
# Reset index for all DataFrames before concatenation - dont do:
#ohe_df.reset_index(drop=True, inplace=True)
#sts_df.reset_index(drop=True, inplace=True)
#cye_df.reset_index(drop=True, inplace=True)

# Combine all features
X = pd.concat([ohe_df, sts_df, cye_df], axis=1).reset_index(drop=True)  # Reset index for consistency
y = df[target_variable].reset_index(drop=True)  # Reset index for consistency

X.shape

(3642, 33)

In [28]:
# Final df for model:
df_final = pd.concat([ohe_df, tae_df, sts_df, cye_df, df[target_variable]], axis=1).reset_index(drop=True)  # Reset index for consistency

df_final.shape

(3642, 35)

In [29]:
# Final df for validation:
df_final_chk = pd.concat([df, ohe_df, tae_df, sts_df, cye_df], axis=1)

df_final_chk.shape

(3642, 61)

In [30]:
# Define all DataFrames to check
dfs_to_check = [ohe_df, tae_df, sts_df, cye_df, X, y, df_final, df_final_chk]

# Perform assertions in a loop
for i, df_check in enumerate(dfs_to_check, 1):
    assert df.shape[0] == df_check.shape[0], f"Row count mismatch in DataFrame {i} ({df_check})!"

print("✅ Row count matches for all DataFrames.")

✅ Row count matches for all DataFrames.


In [31]:
import pandas as pd

# Define all DataFrames to check
dfs_to_check = [ohe_df, tae_df, sts_df, cye_df, X, y, df_final, df_final_chk]

# Perform index equality check in a loop
for i, df_check in enumerate(dfs_to_check, 1):
    pd.testing.assert_index_equal(df.index, df_check.index, 
                                  obj=f"Index mismatch in DataFrame {i} ({df_check})!")

print("✅ Index matches for all DataFrames.")


✅ Index matches for all DataFrames.


In [32]:
df_final

Unnamed: 0,groda_Höstvete,jordart_Leriga jordar (5-15 % ler),jordart_Lättlera (15-25 % ler),jordart_Mellanlera (25-40 % ler),jordart_Mulljord,jordart_Sandjord (<5 % ler),jordart_Styv lera (>40 % ler),jordart_Unknown,sort,utvecklingsstadium,...,G5_7d_sum,G7_7d_sum,G8_7d_sum,G10_7d_sum,g_year,g_month_sin,g_month_cos,g_week_sin,g_week_cos,Bladfläcksvampar
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5.807459,-1.016192,...,-0.700018,-0.837370,-0.788203,-0.662958,2016,5.000000e-01,-0.866025,0.748511,-0.663123,3.00
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5.807459,-0.939836,...,-0.641033,-0.837370,-0.788203,-0.662958,2016,5.000000e-01,-0.866025,0.663123,-0.748511,3.00
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5.807459,-0.558057,...,-0.537810,-0.712224,-0.749246,-0.662958,2016,5.000000e-01,-0.866025,0.568065,-0.822984,1.00
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5.807459,-0.252633,...,0.221615,0.052554,0.029886,0.000125,2016,5.000000e-01,-0.866025,0.464723,-0.885456,0.00
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5.807459,0.969062,...,1.578258,1.735067,1.880326,2.238032,2016,1.224647e-16,-1.000000,0.354605,-0.935016,3.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3637,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.842724,-0.939836,...,-0.943329,-0.837370,-0.788203,-0.662958,2024,5.000000e-01,-0.866025,0.663123,-0.748511,6.00
3638,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.842724,-0.558057,...,-0.589422,-0.587079,-0.651855,-0.662958,2024,5.000000e-01,-0.866025,0.568065,-0.822984,2.00
3639,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,11.211486,-1.016192,...,-0.943329,-0.837370,-0.788203,-0.662958,2024,5.000000e-01,-0.866025,0.663123,-0.748511,0.00
3640,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,9.586729,-1.016192,...,-0.965448,-0.837370,-0.788203,-0.662958,2024,5.000000e-01,-0.866025,0.748511,-0.663123,6.00


In [33]:
df_final_chk

Unnamed: 0,groda,sort,jordart,utvecklingsstadium,Bladfläcksvampar,WGS84N,WGS84E,TM_7d_avg,TX_7d_avg,TN_7d_avg,...,G3_7d_sum,G5_7d_sum,G7_7d_sum,G8_7d_sum,G10_7d_sum,g_year,g_month_sin,g_month_cos,g_week_sin,g_week_cos
0,Höstvete,Mariboss,Lättlera (15-25 % ler),31,3.00,69.042,20.262,3.64,8.70,-1.00,...,-0.653107,-0.700018,-0.837370,-0.788203,-0.662958,2016,5.000000e-01,-0.866025,0.748511,-0.663123
1,Höstvete,Mariboss,Lättlera (15-25 % ler),32,3.00,69.042,20.262,1.37,6.46,-3.47,...,-0.509297,-0.641033,-0.837370,-0.788203,-0.662958,2016,5.000000e-01,-0.866025,0.663123,-0.748511
2,Höstvete,Mariboss,Lättlera (15-25 % ler),37,1.00,69.042,20.262,3.60,8.66,-0.61,...,-0.340835,-0.537810,-0.712224,-0.749246,-0.662958,2016,5.000000e-01,-0.866025,0.568065,-0.822984
3,Höstvete,Mariboss,Lättlera (15-25 % ler),41,0.00,69.042,20.262,7.96,14.11,2.11,...,0.374105,0.221615,0.052554,0.029886,0.000125,2016,5.000000e-01,-0.866025,0.464723,-0.885456
4,Höstvete,Mariboss,Lättlera (15-25 % ler),57,3.00,69.042,20.262,4.76,10.53,0.59,...,1.446514,1.578258,1.735067,1.880326,2.238032,2016,1.224647e-16,-1.000000,0.354605,-0.935016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3637,Höstvete,Pondus,Unknown,32,6.00,69.042,20.262,0.89,4.17,-3.57,...,-0.981815,-0.943329,-0.837370,-0.788203,-0.662958,2024,5.000000e-01,-0.866025,0.663123,-0.748511
3638,Höstvete,Pondus,Unknown,37,2.00,69.042,20.262,3.94,8.23,-0.17,...,-0.570930,-0.589422,-0.587079,-0.651855,-0.662958,2024,5.000000e-01,-0.866025,0.568065,-0.822984
3639,Höstvete,Kask,Unknown,31,0.00,69.042,20.262,0.89,4.17,-3.57,...,-0.981815,-0.943329,-0.837370,-0.788203,-0.662958,2024,5.000000e-01,-0.866025,0.663123,-0.748511
3640,Höstvete,Norin,Unknown,31,6.00,69.042,20.262,1.44,4.89,-2.31,...,-1.035230,-0.965448,-0.837370,-0.788203,-0.662958,2024,5.000000e-01,-0.866025,0.748511,-0.663123


In [34]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3642 entries, 0 to 3641
Data columns (total 35 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   groda_Höstvete                      3642 non-null   float64
 1   jordart_Leriga jordar (5-15 % ler)  3642 non-null   float64
 2   jordart_Lättlera (15-25 % ler)      3642 non-null   float64
 3   jordart_Mellanlera (25-40 % ler)    3642 non-null   float64
 4   jordart_Mulljord                    3642 non-null   float64
 5   jordart_Sandjord (<5 % ler)         3642 non-null   float64
 6   jordart_Styv lera (>40 % ler)       3642 non-null   float64
 7   jordart_Unknown                     3642 non-null   float64
 8   sort                                3642 non-null   float64
 9   utvecklingsstadium                  3642 non-null   float64
 10  WGS84N                              3642 non-null   float64
 11  WGS84E                              3642 no

In [35]:
df_final_chk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3642 entries, 0 to 3641
Data columns (total 61 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   groda                               3642 non-null   object 
 1   sort                                3619 non-null   object 
 2   jordart                             3642 non-null   object 
 3   utvecklingsstadium                  3642 non-null   int64  
 4   Bladfläcksvampar                    3642 non-null   float64
 5   WGS84N                              3642 non-null   float64
 6   WGS84E                              3642 non-null   float64
 7   TM_7d_avg                           3642 non-null   float64
 8   TX_7d_avg                           3642 non-null   float64
 9   TN_7d_avg                           3642 non-null   float64
 10  UM_7d_avg                           3642 non-null   float64
 11  UX_7d_avg                           3642 no

In [36]:
# Function to clean and rename columns:
def auto_rename_columns(columns):
    renamed_columns = {}
    for col in columns:
        new_col = re.sub(r'\s?\(.*\)', '', col)  # Remove any text in and with parentheses ()
        new_col = new_col.replace(' ', '_')  # Replace spaces with underscores
        renamed_columns[col] = new_col  # Store renamed column
    return renamed_columns
 
# Appy column renames to df
df_final.rename(columns=auto_rename_columns(ohe_df.columns), inplace=True)
df_final_chk.rename(columns=auto_rename_columns(ohe_df.columns), inplace=True)

#X.columns = X.columns.str.replace(' ', '_')

In [37]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3642 entries, 0 to 3641
Data columns (total 35 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   groda_Höstvete         3642 non-null   float64
 1   jordart_Leriga_jordar  3642 non-null   float64
 2   jordart_Lättlera       3642 non-null   float64
 3   jordart_Mellanlera     3642 non-null   float64
 4   jordart_Mulljord       3642 non-null   float64
 5   jordart_Sandjord       3642 non-null   float64
 6   jordart_Styv_lera      3642 non-null   float64
 7   jordart_Unknown        3642 non-null   float64
 8   sort                   3642 non-null   float64
 9   utvecklingsstadium     3642 non-null   float64
 10  WGS84N                 3642 non-null   float64
 11  WGS84E                 3642 non-null   float64
 12  TM_7d_avg              3642 non-null   float64
 13  TX_7d_avg              3642 non-null   float64
 14  TN_7d_avg              3642 non-null   float64
 15  UM_7

In [38]:
df_final_chk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3642 entries, 0 to 3641
Data columns (total 61 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   groda                  3642 non-null   object 
 1   sort                   3619 non-null   object 
 2   jordart                3642 non-null   object 
 3   utvecklingsstadium     3642 non-null   int64  
 4   Bladfläcksvampar       3642 non-null   float64
 5   WGS84N                 3642 non-null   float64
 6   WGS84E                 3642 non-null   float64
 7   TM_7d_avg              3642 non-null   float64
 8   TX_7d_avg              3642 non-null   float64
 9   TN_7d_avg              3642 non-null   float64
 10  UM_7d_avg              3642 non-null   float64
 11  UX_7d_avg              3642 non-null   float64
 12  UN_7d_avg              3642 non-null   float64
 13  FM2_7d_avg             3642 non-null   float64
 14  CLDM_7d_avg            3642 non-null   float64
 15  Q0_7

In [39]:
df_final.shape

(3642, 35)

In [40]:
df_final_chk.shape

(3642, 61)

In [41]:
# pickle - pandas dataframe
df_final.to_pickle(pkl_filename)
print(f"Data Pickled")

# Save the DataFrame to a CSV file with UTF-8 encoding
csv_file = csv_filename
df_final_chk.to_csv(csv_file, index=False, encoding='utf-8-sig')  # Ensure proper encoding for special characters
print(f"Data saved to {csv_file}")

# Save the DataFrame to a CSV file with UTF-8 encoding
csv_file = csv_filename_chk
df_final_chk.to_csv(csv_file, index=False, encoding='utf-8-sig')  # Ensure proper encoding for special characters
print(f"Data saved to {csv_file}")

Data Pickled
Data saved to df_5_final_t1.csv
Data saved to df_5_final_chk_t1.csv
