In [2]:
# Mount Google Drive so the '/content/drive/MyDrive/...' paths used further down resolve.
# This cell only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
import os
from os import listdir
from os.path import isfile, join
import gdal

In [7]:
    def load_clean_yield_data(yield_data_filepath):
        """
        Read the yield csv and keep only the rows that are complete in the
        columns the pipeline relies on.

        Parameters
        ----------
        yield_data_filepath: path or file-like object accepted by pandas.read_csv

        Returns:
            A DataFrame without any row that has a NaN in 'Year', 'State ANSI',
            'County ANSI' or 'Value'. The original row index is kept.
        """
        raw_table = pd.read_csv(yield_data_filepath)
        required = ['Year', 'State ANSI', 'County ANSI', 'Value']
        return raw_table.dropna(how='any', subset=required)

    def get_tif_files(image_path):
        """
        Get all the .tif files in the image folder.

        Parameters
        ----------
        image_path: pathlib Path
            Directory to search for tif files
        Returns:
            A list of .tif filenames (names only, without the directory part),
            in the order the directory listing yields them.
        """
        # Match on the '.tif' extension of the entry name; a bare 'tif' suffix
        # would also accept unrelated names such as 'motif'.
        return [entry.name for entry in image_path.iterdir() if entry.name.endswith('.tif')]
    def get_tif_files_12(image_path):#name length less than 12
        """
        List the tif files in a folder whose file name is shorter than 12 characters.

        NOTE: a second function with the same name is defined right after this
        one in the same cell, so this single-folder version is replaced as soon
        as the cell has run.

        Parameters
        ----------
        image_path: pathlib Path
            Directory to search
        Returns:
            A list of file names (without the directory part)
        """
        tif_names = (str(entry.parts[-1]) for entry in image_path.iterdir()
                     if str(entry).endswith('tif'))
        # max possible name is ss_ccc.tif, len=10
        return [name for name in tif_names if len(name) < 12]
    def get_tif_files_12(mask_path,temperature_path,image_path,weather_path):
        """
        List the county tif files that are present in all four input folders.

        A file is kept when it sits in `image_path`, ends in '.tif', has a name
        shorter than 12 characters (max possible name is ss_ccc.tif, len=10) and
        a regular file with the same name exists in the mask, temperature and
        weather folders.

        Parameters
        ----------
        mask_path, temperature_path, weather_path: path-like
            Folders that must also contain the file
        image_path: pathlib Path
            Folder that is iterated to produce the candidates
        Returns:
            A list of file names (without the directory part), in directory order
        """
        def _plain_files(folder):
            # regular files only; sub-directories are ignored
            return {f for f in listdir(folder) if isfile(join(folder, f))}

        # One set intersection instead of three list scans per candidate.
        present_everywhere = (_plain_files(mask_path)
                              & _plain_files(temperature_path)
                              & _plain_files(weather_path))

        files = []
        for dir_file in image_path.iterdir():
            name = dir_file.name
            if name.endswith('.tif') and len(name) < 12 and name in present_everywhere:
                files.append(name)
        return files


In [8]:
from pathlib import Path
import numpy as np
import gdal
import math
from itertools import repeat
from concurrent.futures import ProcessPoolExecutor
np.set_printoptions(threshold=np.inf)

In [9]:
class DataCleaner:
    """Take the exported, downloaded data
    and clean it.

    Specifically:
    - split the image collections into years
    - merge the reflectance, temperature and weather images
    - apply the mask, so only the farmland pixels are considered

    Parameters
    -----------
    mask_path: pathlib Path
        Folder to which the mask tif files have been saved
    temperature_path: pathlib Path
        Folder to which the temperature tif files have been saved
    image_path: pathlib Path
        Folder to which the image tif files have been saved
    weather_path: pathlib Path
        Folder to which the weather tif files have been saved
    yield_data_path: pathlib Path
        Path to the yield data csv file
    savedir: pathlib Path
        Folder to save the .npy arrays to; created when it does not exist
    multiprocessing: boolean, default=False
        Whether to use multiprocessing
    processes: int, default=4
    parallelism: int, default=6
        Together with `processes`, used to size the chunks handed to the
        process pool.

    The defaults of the path arguments point at Google Drive folders, see the
    signature of __init__.
    """

    # Files that are dropped from the work list before processing. The reason
    # for excluding them is not recorded in this notebook.
    EXCLUDED_FILES = ('17_195.tif', '19_195.tif', '20_195.tif', '21_93.tif', '29_195.tif')

    def __init__ (self,mask_path=Path('/content/drive/MyDrive/crop_yield-data_mask_aditya'),
                  temperature_path=Path('/content/drive/MyDrive/crop_yield-data_temperature_aditya'),
                  image_path=Path('/content/drive/MyDrive/crop_yield-data_image_aditya'),
                  weather_path = Path('/content/drive/MyDrive/Daymet_data'),
                  yield_data_path=Path('SOP_download/yield_data.csv'),
                  savedir=Path('/content/drive/MyDrive/img_output'),
                  multiprocessing=False, processes=4, parallelism=6):
        self.mask_path = mask_path
        self.temperature_path = temperature_path
        self.image_path = image_path
        self.weather_path = weather_path

        candidates = get_tif_files_12(self.mask_path, self.temperature_path,
                                      self.image_path, self.weather_path)
        # Filtering (instead of list.remove) keeps construction from failing
        # when one of the excluded files is not part of the export.
        self.tif_files = sorted(f for f in candidates if f not in self.EXCLUDED_FILES)

        self.multiprocessing = multiprocessing
        self.processes = processes
        self.parallelism = parallelism

        self.savedir = savedir
        self.savedir.mkdir(parents=True, exist_ok=True)

        # .astype(int) added to avoid floating values read
        self.yield_data = load_clean_yield_data(yield_data_path)[
            ['Year', 'State ANSI', 'County ANSI']].values.astype(int)

    def process(self, num_years=14, delete_when_done=False):
        """
        Process all the data.

        Parameters
        ----------
        num_years: int, default=14
            How many years of data to create.
        delete_when_done: boolean, default=False
            Whether or not to delete the original .tif files once the .npy array
            has been generated.
        """
        print("#counties available: ")
        print(len(self.tif_files))

        if delete_when_done:
            print('Warning!! delete_when_done=True will delete the .tif files')

        if not self.multiprocessing:
            for filename in self.tif_files:
                process_county(filename, self.savedir, self.image_path, self.mask_path,
                               self.temperature_path, self.weather_path, self.yield_data,
                               num_years=num_years, delete_when_done=delete_when_done)
        else:
            length = len(self.tif_files)
            files_iter = iter(self.tif_files)
            # turn all other arguments to iterator
            savedir_iter = repeat(self.savedir)
            im_path_iter = repeat(self.image_path)
            mask_path_iter = repeat(self.mask_path)
            temp_path_iter = repeat(self.temperature_path)
            weat_path_iter = repeat(self.weather_path)
            yd_iter = repeat(self.yield_data)
            num_years_iter = repeat(num_years)
            delete_when_done_iter = repeat(delete_when_done)

            with ProcessPoolExecutor() as executor:
                chunksize = int(max(length/(self.processes*self.parallelism), 1))
                # NOTE: the iterator returned by map() is not consumed, so an
                # exception raised inside a worker is not re-raised here.
                executor.map(process_county, files_iter, savedir_iter, im_path_iter, mask_path_iter,
                             temp_path_iter, weat_path_iter, yd_iter, num_years_iter,
                             delete_when_done_iter, chunksize=chunksize)

def process_county(filename, savedir, image_path, mask_path, temperature_path, weather_path,yield_path,
                    num_years, delete_when_done):
    """
    Process and save county level data.

    Reads the image, temperature, mask and weather rasters of one county,
    rescales them to the 0..5000 range, splits them into years, interleaves
    them per time step, applies the farmland mask and writes one
    '<year>_<state>_<county>.npy' file per year into `savedir`.

    Parameters
    ----------
    filename: str
        Raster file name of the form '<state>_<county>.tif'; the same name is
        looked up in each of the four input folders.
    savedir: pathlib Path
        Folder the .npy files are written to
    image_path, mask_path, temperature_path, weather_path: pathlib Path
        Input folders
    yield_path:
        Not used by this function (DataCleaner passes its yield array here).
    num_years: int
        Number of yearly arrays to write, starting at 2006
    delete_when_done: bool
        If True, the four input tif files are deleted after saving.
    """
    print("process_county \t",filename)
    # file names look like "<state>_<county>.tif", so the last 4 characters
    # are always the extension
    locations = filename[:-4].split("_")
    state, county = int(locations[0]), int(locations[1])

    def _read_bands(folder):
        # Read every band as uint16 and move the band axis last.
        dataset = gdal.Open(str(folder/filename))
        if dataset is None:
            raise OSError('could not open raster {}'.format(folder/filename))
        return np.transpose(np.array(dataset.ReadAsArray(), dtype='uint16'), axes=(1,2,0))

    image = _read_bands(image_path)
    temp = _read_bands(temperature_path)
    mask = _read_bands(mask_path)
    weather = _read_bands(weather_path)

    # All rescaling is done in float64. In uint16 a subtraction below zero
    # wraps around to a large positive number, so the lower clamp never fired
    # and such pixels ended up at 5000 instead of 0.
    # temperature: 12400 -> 0, 15600 -> 5000
    temp = np.clip((temp.astype(np.float64) - 12400) * 1.5625, 0, 5000)

    image = np.clip(image.astype(np.float64) - 1, 0, 5000)

    # weather bands alternate between two variables:
    # even bands are scaled from 0..35, odd bands from 0..3200, to 0..5000
    weather = weather.astype(np.float64)
    weather[:, :, 0::2] *= 5000/35
    weather[:, :, 1::2] *= 5000/3200
    weather = np.clip(weather, 0, 5000)

    # a values of 12 indicates farmland: everything else, we want to ignore
    mask = (mask == 12).astype('uint16')

    # when exporting the image, we append bands from many years into a single image for efficiency.
    # We want to split it back now
    img_list = divide_into_years(image, bands=7, composite_period=8, num_years=num_years)
    mask_list = divide_into_years(mask, bands=1, composite_period=365, num_years=num_years, extend=True)
    temp_list = divide_into_years(temp, bands=2, composite_period=8, num_years=num_years)
    weat_list = divide_into_years_daymet(weather, bands=2, composite_period=1, num_years=num_years)
    img_temp_weat_merge = merge_image_lists(img_list, 7, temp_list, 2, weat_list, 2)
    masked_img_temp_weat = mask_image(img_temp_weat_merge, mask_list)

    start_year = 2006 #start year from the modis website
    for i in range(0, num_years):
        year = i + start_year
        save_filename = '{}_{}_{}'.format(int(year), int(state), int(county))
        np.save(savedir/save_filename, masked_img_temp_weat[i])
        print("saving file",save_filename)

    if delete_when_done:
        (image_path/filename).unlink()
        (temperature_path/filename).unlink()
        (mask_path/filename).unlink()
        (weather_path/filename).unlink()

    print(f'{filename} array written')


# helper methods for the data cleaning

def divide_into_years(img, bands, composite_period, num_years=10, extend=False):
    """
    Cut a band-stacked image collection into consecutive one-year pieces.

    Parameters
    ----------
    img: the appended image collection to split up (band axis is axis 2)
    bands: the number of bands in an individual image
    composite_period: length of the composite period, in days
    num_years: how many years of data to create.
    extend: boolean, default=False
        If true, and num_years > number of years for which we have data, then the extend the image
        collection by copying over the last image.
        NOTE: This is different from the original code, where the 2nd to last image is copied over

    Returns:
    ----------
    im_list: a list of appended image collections, where each element in the list is a year's worth
        of data. Slices that run past the available bands come back shorter
        (possibly empty). At least one slice is always returned.
    """
    year_width = bands*math.ceil(365/composite_period)

    # if necessary, pad the image collection with the final images
    if extend:
        target_depth = year_width*num_years
        while img.shape[2] < target_depth:
            img = np.concatenate((img, img[:, :, -bands:]), axis=2)

    year_count = max(num_years, 1)
    return [img[:, :, k*year_width:(k + 1)*year_width] for k in range(year_count)]

def divide_into_years_daymet(img, bands, composite_period, num_years=10, extend=False, sample_period=8):
    """
    Split a daily, band-interleaved weather stack into years and keep one
    sample every `sample_period` days.

    The band axis is expected to be laid out step by step:
    d1b1 d1b2 d2b1 d2b2 d3b1 d3b2 ...
    so the bands of time step s start at index s*bands.

    Parameters
    ----------
    img: the appended image collection to split up (band axis is axis 2)
    bands: the number of bands in an individual image
    composite_period: length of one time step of `img`, in days
    num_years: how many years of data to create.
    extend: boolean, default=False
        If true, pad the collection by repeating its last image until it
        holds num_years full years.
    sample_period: int, default=8
        Spacing, in days, of the time steps that are kept. With daily data
        this keeps days 0, 8, ..., 360, i.e. 46 steps per year.

    Returns:
    ----------
    A list with one array per year; each holds all bands of the sampled steps.
    """
    steps_per_year = math.ceil(365/composite_period)
    bands_per_year = bands*steps_per_year

    # if necessary, pad the image collection with the final images
    if extend:
        num_bands_necessary = bands_per_year*num_years
        while img.shape[2] < num_bands_necessary:
            img = np.concatenate((img, img[:, :, -bands:]), axis=2)

    # Offsets inside one year of every band that belongs to a sampled step.
    # The step index has to be multiplied by `bands`; indexing with the raw
    # day number only covered the first half of the year at half the spacing.
    stride = max(1, int(round(sample_period/composite_period)))
    sampled_steps = np.arange(0, steps_per_year, stride)
    required = (sampled_steps[:, None]*bands + np.arange(bands)).ravel()

    image_list = []
    cur_idx = 0
    for _ in range(max(num_years, 1)):
        image_list.append(img[:, :, cur_idx + required])
        cur_idx += bands_per_year

    return image_list

def merge_image_lists(MODIS_img_list,num_bands_1,MODIS_temperature_img_list, num_bands_2, MODIS_weather_img_list, num_bands_3):
    """
    Interleave the yearly image, temperature and weather stacks per time step.

    For every year the output band axis is laid out as
    [image bands of step 0, temperature bands of step 0, weather bands of step 0,
     image bands of step 1, ...].

    Parameters
    ----------
    MODIS_img_list, MODIS_temperature_img_list, MODIS_weather_img_list:
        Lists with one (rows, cols, bands) array per year
    num_bands_1, num_bands_2, num_bands_3: int
        Number of bands per time step in the image, temperature and weather
        stacks respectively

    Returns:
    ----------
    A list with one float64 array per year. The number of time steps is taken
    from the weather stack; output bands that no step fills are left at 0.
    """
    group = num_bands_1 + num_bands_2 + num_bands_3
    MODIS_list = []
    for i in range(0, len(MODIS_img_list)):
        img_year = MODIS_img_list[i]
        temperature_year = MODIS_temperature_img_list[i]
        weather_year = MODIS_weather_img_list[i]

        total_bands = img_year.shape[2] + temperature_year.shape[2] + weather_year.shape[2]
        # zeros instead of empty: nothing uninitialised can leak into the output
        merged_tensor = np.zeros((img_year.shape[0], img_year.shape[1], total_bands))

        for j in range(0, weather_year.shape[2] // num_bands_3):
            img = img_year[:, :, j*num_bands_1:(j + 1)*num_bands_1]
            temperature = temperature_year[:, :, j*num_bands_2:(j + 1)*num_bands_2]
            weat = weather_year[:, :, j*num_bands_3:(j + 1)*num_bands_3]
            merged_tensor[:, :, j*group:(j + 1)*group] = np.concatenate((img, temperature, weat), axis=2)

        MODIS_list.append(merged_tensor)
    return MODIS_list



def mask_image(im_list, mask_list):
    """
    Multiply every image by the mask at the same list position.

    The mask is repeated along the band axis (axis 2) once per image band
    before the element-wise product. Pairs are formed with zip, so the result
    is as long as the shorter of the two lists.
    """
    def _apply(picture, farmland):
        stretched = np.tile(farmland, (1, 1, picture.shape[2]))
        return picture*stretched

    return [_apply(picture, farmland) for picture, farmland in zip(im_list, mask_list)]



In [None]:

# Run configuration for the image / temperature / weather pipeline.
# Input folders (one '<state>_<county>.tif' per county in each of them):
mask_path = Path('/content/drive/MyDrive/data_Lakshya/crop_yield-data_mask_Lakshya')
temperature_path=Path('/content/drive/MyDrive/data_Lakshya/crop_yield-data_temperature')
image_path=Path('/content/drive/MyDrive/data_Lakshya/crop_yield-data_image_Lakshya')
weather_path = Path('/content/drive/MyDrive/data_Lakshya/Daymet_data')
yield_data_path=Path('/content/drive/MyDrive/cleanedData/Shared/corn_yield_Lakshya_2006_2015.csv')
# Output folder for the yearly .npy arrays:
cleaned_data_path=Path('/content/drive/MyDrive/cleanedData/histImageComposite')
multiprocessing=False
processes=4 # prev = 4
parallelism=6 # prev = 6
delete_when_done=False
num_years = 10  #number of years of data = 10

# Build the cleaner and write one array per county and year.
cleaner = DataCleaner(mask_path, temperature_path, image_path, weather_path, yield_data_path,
                        savedir=cleaned_data_path, multiprocessing=multiprocessing,
                        processes=processes, parallelism=parallelism)
cleaner.process(delete_when_done=delete_when_done, num_years=num_years)


In [None]:
# Names (directory stripped) of the .npy composites written by the image pipeline.
imageComposite_files=[
    str(entry.parts[-1])
    for entry in Path('/content/drive/MyDrive/cleanedData/histImageComposite').iterdir()
    if str(entry).endswith('npy')
]
print("imageComposite_files: ")
print(len(imageComposite_files))


In [23]:
# Scratch check on one county: count the mask pixels equal to 12 after subtracting 1.
# Read the mask raster as uint16 and move the band axis last.
image=np.transpose(np.array(gdal.Open(str('/content/drive/MyDrive/data_Lakshya/crop_yield-data_mask_Lakshya/17_101.tif')).ReadAsArray(), dtype='uint16' ),
                        axes=(1,2,0))
print(image.shape)
# print(image)
count=0
values=[]
# NOTE(review): the array is uint16, so a 0 wraps around here instead of
# becoming -1. Also, process_county compares the unshifted mask with 12,
# while this cell compares the shifted one; confirm which is intended.
image = image-1
# NOTE(review): the bounds 89 x 75 are hard-coded; the recorded output shows a
# shape of (121, 130, 10), so only part of the raster is visited.
for i in range(0,89):
    for j in range(0,75):
        for k in range(0,10):  
            if image[i][j][k]==12:
                count=count+1
                values.append(image[i][j][k])
                # print(image[i][j][k])
print(values)
print(count)
print()
# image[image<0]=0
# image[image>5000]=5000
# print(np.count_nonzero(image==65535))

(121, 130, 10)
[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 

In [None]:
# Dumps the whole array: np.set_printoptions(threshold=np.inf) was set earlier,
# so nothing is abbreviated. `image` is whatever the last executed cell left behind.
print(image)

In [None]:
# Scratch check on one county: collect the distinct raw temperature values,
# then apply the same rescaling steps that process_county uses.
temp=np.transpose(np.array(gdal.Open(str('/content/drive/MyDrive/data_Lakshya/crop_yield-data_temperature/17_101.tif')).ReadAsArray(), dtype='uint16' ),
                        axes=(1,2,0))
# NOTE(review): this prints the shape of `image` from an earlier cell, not of `temp`.
print(image.shape)
# print(image)
count=0
values=[]
# NOTE(review): `image` is not used below; this line shifts the leftover
# array again every time the cell is re-run.
image = image-1
# NOTE(review): hard-coded bounds; only rows 0..88, columns 0..74 and the
# first 10 bands are visited.
for i in range(0,89):
    for j in range(0,75):
        for k in range(0,10):  
            # if image[i][j][k]==65535:
                count=count+1
                if temp[i][j][k] not in values:
                    values.append(temp[i][j][k])
                # print(image[i][j][k])
print(values)
print(len(values))
print(count)
# NOTE(review): `temp` is uint16 here, so raw values below 12400 (the recorded
# output contains 0) wrap around to large numbers; after the scaling they are
# caught by the upper clamp instead of the lower one.
temp = temp-12400
temp = temp*1.5625  #considering 12400 (0) to 15600 (5000)
temp[temp<0]=0
temp[temp>5000]=5000
print(temp)

(121, 130, 7282)
[0, 14353, 14288, 13928, 13846, 14392, 14293, 14003, 13849, 14375, 14295, 13840, 14393, 14303, 13978, 13838, 14346, 14291, 13929, 13834, 14379, 14302, 13949, 13808, 14314, 14304, 13950, 13827, 14199, 14301, 13966, 13830, 14366, 13986, 14406, 14322, 13954, 13819, 14423, 14318, 13917, 13811, 14386, 14311, 13884, 13805, 14263, 14305, 13804, 14194, 13989, 13853, 14323, 14001, 14391, 14308, 13970, 14313, 13974, 13839, 14167, 13985, 13844, 14104, 14310, 14006, 13860, 14279, 14011, 13858, 14356, 14307, 13981, 13825, 14407, 13957, 13802, 14444, 13912, 13800, 14452, 13892, 13797, 14294, 13798, 14141, 13820, 14343, 13979, 13845, 14232, 13984, 13843, 14092, 13835, 14178, 14034, 13828, 14276, 14320, 13983, 13803, 14370, 14325, 13794, 14426, 14324, 13793, 14446, 13807, 14416, 13817, 14351, 13822, 14381, 13944, 14348, 13997, 13857, 14327, 14290, 13851, 14298, 13915, 13789, 14225, 14306, 13913, 13770, 14273, 14319, 13771, 14326, 13778, 14400, 14433, 13801, 14445, 14421, 14315, 14382,

In [None]:
# Scratch check on one county: print the shifted image values of a fixed window.
image=np.transpose(np.array(gdal.Open(str('/content/drive/MyDrive/data_Lakshya/crop_yield-data_image_Lakshya/17_101.tif')).ReadAsArray(), dtype='uint16' ),
                        axes=(1,2,0))
print(image.shape)
# print(image)
count=0
values=[]
# NOTE(review): uint16 arithmetic, a raw 0 wraps around instead of becoming -1.
image = image-1
# NOTE(review): hard-coded window of 89 x 75 pixels and 10 bands; every
# visited value is printed on its own line, which makes the output very long.
for i in range(0,89):
    for j in range(0,75):
        for k in range(0,10):  
            count=count+1
            values.append(image[i][j][k])
            print(image[i][j][k])
print(values)
# print(count)
# print()
# On an unsigned array the first clamp can never match anything.
image[image<0]=0
image[image>5000]=5000
# print(np.count_nonzero(image==65535))
# print(np.count_nonzero(image==65535))

In [None]:
# Load one saved array and show the indices of the non-zero entries of its first row.
# NOTE(review): the file sits in 'img_output', the default savedir of DataCleaner,
# not in the cleaned_data_path used by the run cell; confirm it is still produced.
a = np.load('/content/drive/MyDrive/img_output/2004_17_147.npy')
np.nonzero(a[0])



In [None]:
class DataCleanerSoil:
    """Take the exported, downloaded soil rasters and clean them.

    Specifically:
    - rescale the clay, sand, water, ph, bulk-density and carbon rasters
    - merge them into one array per county
    - apply the yearly mask, so only the farmland pixels are considered

    Parameters
    -----------
    mask_path: pathlib Path
        Folder holding the mask tif files
    clay_path, sand_path, water_path, ph_path, bulk_path, carbon_path: pathlib Path
        Folders holding the tif files of the respective soil property
    yield_data_path: pathlib Path
        Path to the yield data csv file
    savedir: pathlib Path
        Folder to save the .npy arrays to; created when it does not exist
    multiprocessing: boolean, default=False
        Whether to use multiprocessing
    processes: int, default=4
    parallelism: int, default=6
        Together with `processes`, used to size the chunks handed to the
        process pool.

    NOTE: the defaults of the soil folders are the same Drive folders that the
    image cleaner uses; pass the real soil folders explicitly.
    """

    # Files that are dropped from the work list before processing. The reason
    # for excluding them is not recorded in this notebook.
    EXCLUDED_FILES = ('17_185.tif', '19_185.tif', '20_185.tif',
                      '21_185.tif', '29_185.tif', '31_185.tif')

    def __init__ (self,mask_path=Path('/content/drive/MyDrive/crop_yield-data_mask_aditya'),
                  clay_path=Path('/content/drive/MyDrive/crop_yield-data_temperature_aditya'),
                  sand_path=Path('/content/drive/MyDrive/crop_yield-data_image_aditya'),
                  water_path = Path('/content/drive/MyDrive/Daymet_data'),
                  ph_path = Path('/content/drive/MyDrive/Daymet_data'),
                  bulk_path = Path('/content/drive/MyDrive/Daymet_data'),
                  carbon_path = Path('/content/drive/MyDrive/Daymet_data'),
                  yield_data_path=Path('SOP_download/yield_data.csv'),
                  savedir=Path('/content/drive/MyDrive/img_output'),
                  multiprocessing=False, processes=4, parallelism=6):
        self.mask_path = mask_path
        self.clay_path = clay_path
        self.sand_path = sand_path
        self.water_path = water_path
        self.ph_path = ph_path
        self.bulk_path = bulk_path
        self.carbon_path = carbon_path

        # get_tif_files_12 walks the sand folder and checks mask, clay and water
        candidates = get_tif_files_12(self.mask_path, self.clay_path, self.sand_path, self.water_path)
        # ... the remaining three folders are checked here, so that a county
        # missing one raster is skipped instead of failing half-way through.
        remaining_dirs = (self.ph_path, self.bulk_path, self.carbon_path)
        self.tif_files = sorted(
            f for f in candidates
            if f not in self.EXCLUDED_FILES
            and all((folder/f).is_file() for folder in remaining_dirs)
        )

        self.multiprocessing = multiprocessing
        self.processes = processes
        self.parallelism = parallelism

        self.savedir = savedir
        self.savedir.mkdir(parents=True, exist_ok=True)

        # .astype(int) added to avoid floating values read
        self.yield_data = load_clean_yield_data(yield_data_path)[
            ['Year', 'State ANSI', 'County ANSI']].values.astype(int)

    def process(self, num_years=14, delete_when_done=False):
        """
        Process all the data.

        Parameters
        ----------
        num_years: int, default=14
            How many years of data to create.
        delete_when_done: boolean, default=False
            Whether or not to delete the original .tif files once the .npy array
            has been generated.
        """
        # number of tif files found in the clay folder of this instance
        files = [str(dir_file.parts[-1]) for dir_file in self.clay_path.iterdir()
                 if str(dir_file).endswith('tif')]
        print(len(files))
        if delete_when_done:
            print('Warning!! delete_when_done=True will delete the .tif files')

        if not self.multiprocessing:
            for filename in self.tif_files:
                process_county(filename, self.savedir, self.clay_path, self.mask_path,
                               self.sand_path, self.water_path, self.ph_path, self.bulk_path,
                               self.carbon_path, self.yield_data, num_years=num_years,
                               delete_when_done=delete_when_done)
        else:
            length = len(self.tif_files)
            files_iter = iter(self.tif_files)
            savedir_iter = repeat(self.savedir)
            clay_path_iter = repeat(self.clay_path)
            mask_path_iter = repeat(self.mask_path)
            sand_path_iter = repeat(self.sand_path)
            water_path_iter = repeat(self.water_path)
            ph_path_iter = repeat(self.ph_path)
            bulk_path_iter = repeat(self.bulk_path)
            carbon_path_iter = repeat(self.carbon_path)
            yd_iter = repeat(self.yield_data)
            num_years_iter = repeat(num_years)
            delete_when_done_iter = repeat(delete_when_done)

            with ProcessPoolExecutor() as executor:
                chunksize = int(max(length/(self.processes*self.parallelism), 1))
                # NOTE: the iterator returned by map() is not consumed, so an
                # exception raised inside a worker is not re-raised here.
                executor.map(process_county, files_iter, savedir_iter, clay_path_iter, mask_path_iter,
                             sand_path_iter, water_path_iter, ph_path_iter, bulk_path_iter,
                             carbon_path_iter, yd_iter, num_years_iter, delete_when_done_iter,
                             chunksize=chunksize)

def process_county(filename, savedir, clay_path, mask_path, sand_path, water_path,
                   ph_path, bulk_path, carbon_path,yield_path,
                    num_years, delete_when_done):
    """
    Process and save county level soil data.

    Reads the six soil rasters and the mask of one county, rescales every soil
    property to the 0..5000 range, stacks them, applies the yearly farmland
    mask and writes one '<year>_<state>_<county>.npy' file per year.

    NOTE: a function with the same name is defined earlier in the notebook for
    the image pipeline. Running this cell replaces it, so DataCleaner.process
    would call this soil version afterwards.

    Parameters
    ----------
    filename: str
        Raster file name of the form '<state>_<county>.tif'; the same name is
        looked up in each input folder.
    savedir: pathlib Path
        Folder the .npy files are written to
    clay_path, mask_path, sand_path, water_path, ph_path, bulk_path, carbon_path: pathlib Path
        Input folders
    yield_path:
        Not used by this function (DataCleanerSoil passes its yield array here).
    num_years: int
        Number of yearly arrays to write, starting at 2006
    delete_when_done: bool
        If True, the seven input tif files are deleted after saving.
    """
    print("process_county \t",filename)
    # file names look like "<state>_<county>.tif", so the last 4 characters
    # are always the extension
    locations = filename[:-4].split("_")
    state, county = int(locations[0]), int(locations[1])

    def _read_bands(folder):
        # Read every band as uint16 and move the band axis last.
        dataset = gdal.Open(str(folder/filename))
        if dataset is None:
            raise OSError('could not open raster {}'.format(folder/filename))
        return np.transpose(np.array(dataset.ReadAsArray(), dtype='uint16'), axes=(1,2,0))

    def _rescale(raw, low, span):
        # Map [low, low + span] linearly onto [0, 5000] and clamp the rest.
        # Done in float64: a uint16 subtraction below zero would wrap around
        # and be clamped to 5000 instead of 0.
        return np.clip((raw.astype(np.float64) - low) * (5000/span), 0, 5000)

    clay = _rescale(_read_bands(clay_path), 2, 98)          # 2 (0) to 100 (5000)
    sand = _rescale(_read_bands(sand_path), 1, 99)          # 1 (0) to 100 (5000)
    water = _rescale(_read_bands(water_path), 0, 52.974)    # 0 (0) to 52.974 (5000)
    ph = _rescale(_read_bands(ph_path), 42, 68)             # 42 (0) to 110 (5000)
    carbon = _rescale(_read_bands(carbon_path), 0, 120)     # 0 (0) to 120 (5000)
    bulk = _rescale(_read_bands(bulk_path), 5, 180)         # 5 (0) to 185 (5000)

    # a values of 12 indicates farmland: everything else, we want to ignore
    mask = (_read_bands(mask_path) == 12).astype('uint16')

    mask_list = divide_into_years(mask, bands=1, composite_period=365, num_years=num_years, extend=True)
    soil_merge_list = merge_images(clay, 6, sand, 6, water, 6, ph, 6, carbon, 6, bulk, 6)
    masked_soil = mask_soil(soil_merge_list, mask_list)

    start_year = 2006 #start year from the modis website
    for i in range(0, num_years):
        year = i + start_year
        save_filename = '{}_{}_{}'.format(int(year), int(state), int(county))
        np.save(savedir/save_filename, masked_soil[i])
        print("saving file",save_filename)

    if delete_when_done:
        (clay_path/filename).unlink()
        (sand_path/filename).unlink()
        (water_path/filename).unlink()
        (ph_path/filename).unlink()
        (carbon_path/filename).unlink()
        (bulk_path/filename).unlink()
        (mask_path/filename).unlink()

    print(f'{filename} array written')


# helper methods for the data cleaning
# this function is now working well in comparison to older one
def merge_images(clay, num_bands_1, sand, num_bands_2, water, num_bands_3, ph, num_bands_4, carbon, num_bands_5, bulk, num_bands_6):
    """
    Stack the six soil rasters along the band axis.

    Parameters
    ----------
    clay, sand, water, ph, carbon, bulk: arrays of shape (rows, cols, bands)
    num_bands_1 .. num_bands_6: int
        Band counts of the respective raster. They are accepted for interface
        compatibility; the output depth is taken from the arrays themselves.

    Returns:
    ----------
    A list holding a single float64 array whose bands are, in order, those of
    clay, sand, water, ph, carbon and bulk.
    """
    # The depth follows from the inputs, so rasters with a band count other
    # than 6 are merged as well instead of failing on a fixed depth of 36.
    merged_tensor = np.concatenate((clay, sand, water, ph, carbon, bulk), axis=2).astype(np.float64)
    return [merged_tensor]

def mask_soil(soil_merge_list, mask_list):
    """
    Apply every yearly mask to the (static) merged soil image.

    Parameters
    ----------
    soil_merge_list: list whose first element is the merged soil array;
        only that first element is used, there is only one image as soil is static
    mask_list: list with one mask per year; each mask must broadcast against
        the soil array

    Returns:
    ----------
    A list with one masked soil array per entry of mask_list.
    """
    print(len(soil_merge_list)," ",len(mask_list))

    img = soil_merge_list[0]
    # Build the whole list before returning; returning from inside the loop
    # produced a single entry, whatever the number of years.
    return [img*mask for mask in mask_list]

In [None]:
# Run configuration for the soil pipeline.
# Input folders (one '<state>_<county>.tif' per county in each of them):
mask_path=Path('/content/drive/MyDrive/data_Lakshya/crop_yield-data_mask_Lakshya')
clay_path=Path('/content/drive/MyDrive/data_Lakshya/soil_data_clay_content')
sand_path=Path('/content/drive/MyDrive/data_Lakshya/soil_data_sand')
water_path = Path('/content/drive/MyDrive/data_Lakshya/soil_data_water_content')
ph_path = Path('/content/drive/MyDrive/data_Lakshya/soil_data_ph_value')
bulk_path = Path('/content/drive/MyDrive/data_Lakshya/soil_data_bulk_density')
carbon_path = Path('/content/drive/MyDrive/data_Lakshya/soil_data_carbon_content')
yield_data_path=Path('/content/drive/MyDrive/data_Lakshya/corn_yield_Lakshya.csv')
# Output folder for the yearly .npy arrays:
cleaned_data_path=Path('/content/drive/MyDrive/cleanedData/histSoilComposite')
multiprocessing=False#author's note: running with False raised an error
processes=4 # prev = 4
parallelism=6 # prev = 6
delete_when_done=False
num_years = 10  #number of years of data = 10

# Positional order must match DataCleanerSoil.__init__:
# mask, clay, sand, water, ph, bulk, carbon, yield csv.
cleaner = DataCleanerSoil(mask_path, clay_path, sand_path, water_path,  ph_path,
                        bulk_path, carbon_path, yield_data_path,
                        savedir=cleaned_data_path, multiprocessing=multiprocessing,
                        processes=processes, parallelism=parallelism)
cleaner.process(delete_when_done=delete_when_done, num_years=num_years)

In [None]:
# Spot check of one saved array: print its shape and the indices of all non-zero entries.
# With the print threshold set to np.inf earlier, the index arrays are printed in full.
a=np.load('/content/drive/MyDrive/singleData/2011_55_141.npy')
print(a.shape)
print(np.nonzero(a))

In [None]:
# Names (directory stripped) of the .npy files written by the image pipeline.
imageComposite_files=[
    str(entry.parts[-1])
    for entry in Path('/content/drive/MyDrive/cleanedData/histImageComposite').iterdir()
    if str(entry).endswith('npy')
]
print("imageComposite_files: ")
print(len(imageComposite_files))
# 5480

# Same listing for the soil pipeline.
soil_files=[
    str(entry.parts[-1])
    for entry in Path('/content/drive/MyDrive/cleanedData/histSoilComposite').iterdir()
    if str(entry).endswith('npy')
]

print("soilComposite_files: ")
print(len(soil_files))

# Files for which both pipelines produced an array, in sorted order.
global_processed_files=sorted(name for name in imageComposite_files if name in soil_files)
print("global_processed_files: ")
print(len(global_processed_files))
