In [1]:
import pandas as pd
import numpy as np

# Crop and Soil Properties Data Processing

In [2]:
# Locations of the two raw input tables, relative to this notebook
crop_data_path = '../data/data_raw/cpdata.csv'
soil_data_path = '../data/data_raw/soil_properties.csv'

# Load both CSVs in one pass; crop data first, soil data second
crop_data, soil_data = (
    pd.read_csv(csv_path) for csv_path in (crop_data_path, soil_data_path)
)

In [3]:
# Shuffle the crop dataframe.
# random_state pins the permutation so that a fresh "Restart & Run All"
# reproduces the same row order (and therefore the same derived crop CSV)
# instead of a different shuffle on every run.
crop_data_shuffled = crop_data.sample(frac=1, random_state=42)

# Displaying the crop data
crop_data_shuffled.head()

Unnamed: 0,temperature,humidity,ph,rainfall,label
2717,24.121887,90.723516,6.945563,102.835632,orange
2072,18.729877,61.331862,5.001039,139.871004,Pigeon Peas
1790,27.811328,97.484106,6.465906,154.062122,Coconut
695,23.052764,60.424786,7.011121,52.602853,Lentil
1319,25.748369,84.941183,5.212362,104.949326,Sugarcane


In [4]:
# Shuffle the soil dataframe.
# random_state pins the permutation: a later cell keeps only the first row per
# crop from this shuffled frame, so without a fixed seed the soil values that
# end up in the exported file would change from run to run.
soil_data_shuffled = soil_data.sample(frac=1, random_state=42)

# Displaying the soil data
soil_data_shuffled.head()

Unnamed: 0.1,Unnamed: 0,Crop,N,P,K,pH
1238,1238,Orange,20,10,10,3.94
1752,1752,French Beans(Farasbi),90,125,60,4.82
158,158,Banana,100,75,50,6.52
823,823,Spinach,60,60,50,6.16
1728,1728,Black Pepper,100,60,50,4.84


In [5]:
# Function for lowering the cases

def change_case(crop):
    """Normalise a crop name so names from different datasets can be compared.

    Every space character is removed and the result is lower-cased, e.g.
    ``"Pigeon Peas"`` becomes ``"pigeonpeas"``.

    Parameters
    ----------
    crop : str
        Raw crop name. Non-string values (such as ``None`` or the float NaN
        used for missing cells) are returned unchanged rather than raising
        ``AttributeError``.

    Returns
    -------
    str
        The normalised name, or the input itself when it is not a string.
    """
    if not isinstance(crop, str):
        # A single missing cell should not abort normalisation of the whole
        # column, so anything that is not text is passed straight through.
        return crop
    return crop.replace(" ", "").lower()

# Normalise the crop-name column of each frame with change_case so the two
# datasets use the same spelling convention (no spaces, lower case).
crop_data_shuffled['label'] = crop_data_shuffled['label'].map(change_case)
soil_data_shuffled['Crop'] = soil_data_shuffled['Crop'].map(change_case)

In [6]:
# Cleaning up the soil data

# Soil-dataset spellings (already normalised by change_case) mapped to the
# spelling used by the crop dataset's labels. One mapping replaces five
# copy-pasted replace() calls; only exact whole-value matches are rewritten.
soil_crop_aliases = {
    'mungbeans': 'mungbean',
    'lentils(masoordal)': 'lentil',
    'pigeonpeas(toordal)': 'pigeonpeas',
    'mothbean(matki)': 'mothbeans',
    'chickpeas(channa)': 'chickpea',
}
soil_data_shuffled['Crop'] = soil_data_shuffled['Crop'].replace(soil_crop_aliases)

# Drop the leftover 'Unnamed: 0' column. errors='ignore' keeps this cell
# re-runnable: a plain `del` raises KeyError once the column is already gone.
soil_data_shuffled = soil_data_shuffled.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# Preview the first five rows of the shuffled crop data after label normalisation
crop_data_shuffled.head(n=5)

Unnamed: 0,temperature,humidity,ph,rainfall,label
2717,24.121887,90.723516,6.945563,102.835632,orange
2072,18.729877,61.331862,5.001039,139.871004,pigeonpeas
1790,27.811328,97.484106,6.465906,154.062122,coconut
695,23.052764,60.424786,7.011121,52.602853,lentil
1319,25.748369,84.941183,5.212362,104.949326,sugarcane


In [8]:
# Distinct normalised crop labels, in order of first appearance in the shuffled frame
crop_names = pd.unique(crop_data_shuffled['label'])
crop_names

array(['orange', 'pigeonpeas', 'coconut', 'lentil', 'sugarcane', 'maize',
       'pomegranate', 'coffee', 'jute', 'tobacco', 'watermelon', 'rice',
       'grapes', 'groundnut', 'rubber', 'peas', 'papaya', 'blackgram',
       'banana', 'mungbean', 'cotton', 'tea', 'adzukibeans', 'chickpea',
       'mothbeans', 'muskmelon', 'mango', 'millet', 'apple',
       'kidneybeans', 'wheat'], dtype=object)

In [9]:
# Preview the first five rows of the shuffled soil data after clean-up
soil_data_shuffled.head(n=5)

Unnamed: 0,Crop,N,P,K,pH
1238,orange,20,10,10,3.94
1752,frenchbeans(farasbi),90,125,60,4.82
158,banana,100,75,50,6.52
823,spinach,60,60,50,6.16
1728,blackpepper,100,60,50,4.84


In [10]:
# Distinct normalised crop names found in the soil data, in order of first appearance
crop_names_from_soil_data = pd.unique(soil_data_shuffled['Crop'])
crop_names_from_soil_data

array(['orange', 'frenchbeans(farasbi)', 'banana', 'spinach',
       'blackpepper', 'cardamom', 'blackeyedbeans(chawli)',
       'clusterbeans(gavar)', 'rice', 'horsegram(kulthi)', 'lemongrass',
       'mungbean', 'pomegranate', 'ridgegourd', 'gooseberry(amla)',
       'cloves', 'ladyfinger', 'grapes', 'corianderseeds', 'turmeric',
       'pumpkin', 'coconut', 'fenugreekleaf(methi)', 'garlic',
       'almondnut', 'pistachionut', 'asafoetida', 'jackfruit',
       'ziziphusmauritiana(bor)', 'dates', 'pineapple',
       'garciniaindica(kokam)', 'cauliflower', 'papaya', 'apricot',
       'barley(jav)', 'lemon', 'drumstick–moringa', 'apple', 'greenpeas',
       'brinjal', 'jute', 'cashewnuts', 'cucumber', 'ginger',
       'ragi(naachnnii)', 'blackgram', 'soyabean', 'mango',
       'custardapple', 'lentil', 'chickoo', 'mushroom', 'jaiphal(nutmeg)',
       'cuminseeds', 'maize', 'potato', 'tapioca(suran)', 'figs',
       'jambun(syzygiumcumini)', 'radish', 'watermelon',
       'favabeans(papd

In [11]:
# Labels present in both datasets: walk the soil-data crop names (keeping their
# order) and retain those that also occur among the crop-data labels.
extract_labels = [
    soil_crop
    for soil_crop in crop_names_from_soil_data
    if soil_crop in crop_names
]

In [17]:
# Empty accumulators that share the column layout of the shuffled crop and soil
# frames; the rows for the labels in extract_labels are collected into them below.
new_crop_data, new_soil_data = (
    pd.DataFrame(columns=source_frame.columns)
    for source_frame in (crop_data_shuffled, soil_data_shuffled)
)


In [18]:

# Gather every crop row whose label is shared with the soil data, grouped in
# extract_labels order, and concatenate once.
# The previous loop called pd.concat on the growing frame each iteration, which
# re-copies all accumulated rows every time, and it appended to whatever
# new_crop_data already held, so re-running the cell duplicated every row.
# Rebuilding from scratch makes the cell idempotent.
crop_rows_by_label = [
    crop_data_shuffled[crop_data_shuffled['label'] == label]
    for label in extract_labels
]
new_crop_data = (
    pd.concat(crop_rows_by_label, ignore_index=True)
    if crop_rows_by_label
    else crop_data_shuffled.iloc[:0].copy()  # no shared labels -> empty frame, same columns
)

 

In [19]:
# Keep one soil record per shared crop label. The soil frame was shuffled
# earlier, so "first" means the first matching row in the current shuffled order.
# The slices are gathered and concatenated once: the previous loop re-copied the
# growing frame on every iteration and appended to whatever new_soil_data
# already held, so re-running the cell duplicated rows.
soil_rows_by_label = [
    soil_data_shuffled[soil_data_shuffled['Crop'] == label].iloc[:1]
    for label in extract_labels
]
new_soil_data = (
    pd.concat(soil_rows_by_label, ignore_index=True)
    if soil_rows_by_label
    else soil_data_shuffled.iloc[:0].copy()  # no shared labels -> empty frame, same columns
)

    


In [20]:
# Inspect the crop rows retained for the labels found in both datasets
new_crop_data

Unnamed: 0,temperature,humidity,ph,rainfall,label
0,24.121887,90.723516,6.945563,102.835632,orange
1,13.208444,94.027694,6.354023,106.269616,orange
2,30.491838,90.458286,7.781989,113.330211,orange
3,11.866319,93.683946,6.976998,106.060149,orange
4,20.681852,90.915105,7.829507,109.751393,orange
...,...,...,...,...,...
2195,29.193787,91.462411,6.660955,26.482403,muskmelon
2196,29.480699,90.336987,6.640471,26.036577,muskmelon
2197,29.109683,92.435110,6.144109,27.956023,muskmelon
2198,27.996117,90.846603,6.630301,21.618938,muskmelon


In [21]:
# Inspect the soil records kept for the labels found in both datasets (one row per label)
new_soil_data

Unnamed: 0,Crop,N,P,K,pH
0,orange,20,10,10,3.94
1,banana,100,75,50,6.52
2,rice,80,40,40,5.58
3,mungbean,20,40,20,5.54
4,pomegranate,20,10,40,5.38
5,grapes,20,125,200,4.08
6,coconut,20,10,30,4.92
7,papaya,50,50,50,5.92
8,apple,20,125,200,6.58
9,jute,80,40,40,5.58


In [22]:
# Persist both filtered frames as CSV next to the raw inputs.
# NOTE(review): to_csv is called with its defaults, so the row index is written
# as an unnamed first column -- confirm downstream readers expect that.
for merged_frame, csv_destination in (
    (new_crop_data, '../data/data_raw/crop_data_merged.csv'),
    (new_soil_data, '../data/data_raw/soil_data_merged.csv'),
):
    merged_frame.to_csv(csv_destination)