In [1]:
# Install pandas if not already installed
import pandas as pd

#Gathering & Cleaning Data
**EXTRACT DATA**

In [2]:
# Load the two CSV files into DataFrames
yield_path = './Resources/yield.csv'       # Replace with actual path
yield_df_path = './Resources/yield_df.csv'  # Replace with actual path
rainfall_path = './Resources/rainfall.csv'  # Replace with actual path
temp_path = './Resources/temp.csv'          # Replace with actual path
pesticide_path = './Resources/pesticides.csv'    # Replace with actual path

# Read the CSV files into DataFrames
yield_data = pd.read_csv(yield_path)
yield_df = pd.read_csv(yield_df_path)
rainfall_df = pd.read_csv(rainfall_path)
temp_df = pd.read_csv(temp_path)
pesticide_df = pd.read_csv(pesticide_path)


**START CLEAN UP yield_data**

In [3]:
# Inspect the shape and preview the first few rows of yield_data
print("Shape of yield_data:", yield_data.shape)
print("\nFirst few rows of yield_data:")
yield_data.head()


Shape of yield_data: (56717, 12)

First few rows of yield_data:


Unnamed: 0,Domain Code,Domain,Area Code,Area,Element Code,Element,Item Code,Item,Year Code,Year,Unit,Value
0,QC,Crops,2,Afghanistan,5419,Yield,56,Maize,1961,1961,hg/ha,14000
1,QC,Crops,2,Afghanistan,5419,Yield,56,Maize,1962,1962,hg/ha,14000
2,QC,Crops,2,Afghanistan,5419,Yield,56,Maize,1963,1963,hg/ha,14260
3,QC,Crops,2,Afghanistan,5419,Yield,56,Maize,1964,1964,hg/ha,14257
4,QC,Crops,2,Afghanistan,5419,Yield,56,Maize,1965,1965,hg/ha,14400


In [4]:
# Renamed Value to hg/ha_yield to easily recognize and 'crops yields production value'
yield_data = yield_data.rename(index=str, columns={"Value": "hg/ha_yield"}) 
yield_data.head()

Unnamed: 0,Domain Code,Domain,Area Code,Area,Element Code,Element,Item Code,Item,Year Code,Year,Unit,hg/ha_yield
0,QC,Crops,2,Afghanistan,5419,Yield,56,Maize,1961,1961,hg/ha,14000
1,QC,Crops,2,Afghanistan,5419,Yield,56,Maize,1962,1962,hg/ha,14000
2,QC,Crops,2,Afghanistan,5419,Yield,56,Maize,1963,1963,hg/ha,14260
3,QC,Crops,2,Afghanistan,5419,Yield,56,Maize,1964,1964,hg/ha,14257
4,QC,Crops,2,Afghanistan,5419,Yield,56,Maize,1965,1965,hg/ha,14400


**FINAL CLEANED UP yield_data**

In [5]:
# Drop columns that are not needed for the analysis
yield_data = yield_data.drop(['Year Code','Element Code','Element','Year Code','Area Code','Domain Code','Domain','Unit','Item Code'], axis=1)
yield_data.head()
#I will remove Year values older than 1990 later

Unnamed: 0,Area,Item,Year,hg/ha_yield
0,Afghanistan,Maize,1961,14000
1,Afghanistan,Maize,1962,14000
2,Afghanistan,Maize,1963,14260
3,Afghanistan,Maize,1964,14257
4,Afghanistan,Maize,1965,14400


**START CLEAN UP yield_df**

In [6]:
# Inspect the shape of all DataFrames
print("Shape of yield_df:", yield_df.shape)
# Preview the first few rows of all DataFrames
print("\nFirst few rows of yield_data:")
yield_df.head()

Shape of yield_df: (28242, 8)

First few rows of yield_data:


Unnamed: 0.1,Unnamed: 0,Area,Item,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
0,0,Albania,Maize,1990,36613,1485,121.0,16.37
1,1,Albania,Potatoes,1990,66667,1485,121.0,16.37
2,2,Albania,"Rice, paddy",1990,23333,1485,121.0,16.37
3,3,Albania,Sorghum,1990,12500,1485,121.0,16.37
4,4,Albania,Soybeans,1990,7000,1485,121.0,16.37


**FINAL CLEANED UP yield_df**

In [7]:
# Drop the 'Unnamed: 0' column from yield_data
yield_df = yield_df.drop(columns=['Unnamed: 0'])

# # Verify the change
# print("\nUpdated yield_data after dropping 'Unnamed: 0':")
yield_df.head()
#I will remove Year values older than 1990 later

Unnamed: 0,Area,Item,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
0,Albania,Maize,1990,36613,1485,121.0,16.37
1,Albania,Potatoes,1990,66667,1485,121.0,16.37
2,Albania,"Rice, paddy",1990,23333,1485,121.0,16.37
3,Albania,Sorghum,1990,12500,1485,121.0,16.37
4,Albania,Soybeans,1990,7000,1485,121.0,16.37


**START CLEAN UP rainfall_df**

In [8]:
# Inspect the shape and preview the first few rows of rainfall_df
print("Shape of rainfall_df:", rainfall_df.shape)
print("\nFirst few rows of rainfall_df:")
rainfall_df.head()


Shape of rainfall_df: (6727, 3)

First few rows of rainfall_df:


Unnamed: 0,Area,Year,average_rain_fall_mm_per_year
0,Afghanistan,1985,327
1,Afghanistan,1986,327
2,Afghanistan,1987,327
3,Afghanistan,1989,327
4,Afghanistan,1990,327


In [9]:
rainfall_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6727 entries, 0 to 6726
Data columns (total 3 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0    Area                          6727 non-null   object
 1   Year                           6727 non-null   int64 
 2   average_rain_fall_mm_per_year  5953 non-null   object
dtypes: int64(1), object(2)
memory usage: 157.8+ KB


**FINAL CLEANED UP rainfall_df**

In [10]:
# Correct the column name in rainfall_df ( extra space in ' Area')
rainfall_df.rename(columns={' Area': 'Area'}, inplace=True)

#  Drop rows with NaN values
rainfall_df = rainfall_df.dropna() 
rainfall_df.info()
#I will remove Year values older than 1990 later

<class 'pandas.core.frame.DataFrame'>
Index: 5953 entries, 0 to 6726
Data columns (total 3 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Area                           5953 non-null   object
 1   Year                           5953 non-null   int64 
 2   average_rain_fall_mm_per_year  5953 non-null   object
dtypes: int64(1), object(2)
memory usage: 186.0+ KB


**START CLEAN UP temp_df**

In [11]:
# Inspect the shape and preview the first few rows of temp_df
print("Shape of temp_df:", temp_df.shape)
print("\nFirst few rows of temp_df:")
temp_df.head()

Shape of temp_df: (71311, 3)

First few rows of temp_df:


Unnamed: 0,year,country,avg_temp
0,1849,Côte D'Ivoire,25.58
1,1850,Côte D'Ivoire,25.52
2,1851,Côte D'Ivoire,25.67
3,1852,Côte D'Ivoire,
4,1853,Côte D'Ivoire,


**FINAL CLEANED UP temp_df**

In [12]:
# Change column names
temp_df = temp_df.rename(columns={"year": "Year", "country": 'Area'})

#drop the rows with NaN values
temp_df = temp_df.dropna()
temp_df.head()

#I will remove Year values older than 1990 later

Unnamed: 0,Year,Area,avg_temp
0,1849,Côte D'Ivoire,25.58
1,1850,Côte D'Ivoire,25.52
2,1851,Côte D'Ivoire,25.67
7,1856,Côte D'Ivoire,26.28
8,1857,Côte D'Ivoire,25.17


**START CLEAN UP pesticide_df**

In [13]:
# Inspect the shape and preview the first few rows of pesticide_df
print("Shape of pesticide_df:", pesticide_df.shape)
print("\nFirst few rows of pesticide_df:")
pesticide_df.head()

Shape of pesticide_df: (4349, 7)

First few rows of pesticide_df:


Unnamed: 0,Domain,Area,Element,Item,Year,Unit,Value
0,Pesticides Use,Albania,Use,Pesticides (total),1990,tonnes of active ingredients,121.0
1,Pesticides Use,Albania,Use,Pesticides (total),1991,tonnes of active ingredients,121.0
2,Pesticides Use,Albania,Use,Pesticides (total),1992,tonnes of active ingredients,121.0
3,Pesticides Use,Albania,Use,Pesticides (total),1993,tonnes of active ingredients,121.0
4,Pesticides Use,Albania,Use,Pesticides (total),1994,tonnes of active ingredients,201.0


**FINAL CLEANED UP pesticide_df**

In [14]:
pesticide_df = pesticide_df.rename(index=str, columns={"Value": "pesticides_tonnes"})

# For knowledge -> axis=0 is for rows, axis=1 is for columns
pesticide_df = pesticide_df.drop(['Element', 'Domain', 'Unit', 'Item'], axis=1) 
pesticide_df.head()
# I will remove Year values older than 1990 later

Unnamed: 0,Area,Year,pesticides_tonnes
0,Albania,1990,121.0
1,Albania,1991,121.0
2,Albania,1992,121.0
3,Albania,1993,121.0
4,Albania,1994,201.0


**MERGING yield_data & yield_df**

In [15]:
# Display the columns for both datasets
print("Columns in yield_df.csv:", yield_df.columns.tolist())
print("Columns in yield.csv:", yield_data.columns.tolist())

Columns in yield_df.csv: ['Area', 'Item', 'Year', 'hg/ha_yield', 'average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp']
Columns in yield.csv: ['Area', 'Item', 'Year', 'hg/ha_yield']


In [16]:

# Merge yield_df and yield_data on 'Area' and 'Year', specifying suffixes to avoid duplicate column names
yield_df = pd.merge(yield_df, yield_data, on=['Area', 'Year'], how='left', suffixes=('_df', '_data'))
yield_df.head()

Unnamed: 0,Area,Item_df,Year,hg/ha_yield_df,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,Item_data,hg/ha_yield_data
0,Albania,Maize,1990,36613,1485,121.0,16.37,Maize,36613
1,Albania,Maize,1990,36613,1485,121.0,16.37,Potatoes,66667
2,Albania,Maize,1990,36613,1485,121.0,16.37,"Rice, paddy",23333
3,Albania,Maize,1990,36613,1485,121.0,16.37,Sorghum,12500
4,Albania,Maize,1990,36613,1485,121.0,16.37,Soybeans,7000


In [17]:
# Remove duplicates while keeping the first occurrence for each Area and Year
yield_df = yield_df.drop_duplicates(subset=['Area', 'Year', 'Item_df'])

# Optionally, if you need only one 'Item_data' per Area and Year, you can further simplify:
yield_df = yield_df.groupby(['Area', 'Year'], as_index=False).first()

# Drop redundant columns if needed
yield_df = yield_df.drop(columns=['Item_data', 'hg/ha_yield_data'])

# Rename columns for clarity
yield_df = yield_df.rename(columns={
    'Item_df': 'Item',
    'hg/ha_yield_df': 'hg/ha_yield'
})

# Display the cleaned DataFrame
print("\nCleaned DataFrame:")
yield_df.head()



Cleaned DataFrame:


Unnamed: 0,Area,Year,Item,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
0,Albania,1990,Maize,36613,1485,121.0,16.37
1,Albania,1991,Maize,29068,1485,121.0,15.36
2,Albania,1992,Maize,24876,1485,121.0,16.06
3,Albania,1993,Maize,24185,1485,121.0,16.05
4,Albania,1994,Maize,25848,1485,201.0,16.96


In [18]:
# Check for missing data
print("Missing values in the merged DataFrame:")
print(yield_df.isnull().sum())

Missing values in the merged DataFrame:
Area                             0
Year                             0
Item                             0
hg/ha_yield                      0
average_rain_fall_mm_per_year    0
pesticides_tonnes                0
avg_temp                         0
dtype: int64


In [19]:
# Save the yield_df DataFrame to a CSV file
yield_df.to_csv('./Resources/crop_yields.csv', index=False)

print("DataFrame successfully saved as 'crop_yields.csv!'")
# Inspect the shape and preview the first few rows of pesticide_df
print("Shape of yield_df:", yield_df.shape)

DataFrame successfully saved as 'crop_yields.csv!'
Shape of yield_df: (2250, 7)


**MERGING rainfall_df, temp_df & pesticide_df**

In [26]:
# Assuming rainfall_df, temp_df, and pesticide_df are already loaded
# Merge rainfall_df and temp_df on 'Area' and 'Year'
merged_df = pd.merge(rainfall_df, temp_df, on=['Area', 'Year'], how='inner')

# Merge the result with pesticide_df on 'Area' and 'Year'
merged_df = pd.merge(merged_df, pesticide_df, on=['Area', 'Year'], how='inner')
merged_df.head()


Unnamed: 0,Area,Year,average_rain_fall_mm_per_year,avg_temp,pesticides_tonnes
0,Albania,1990,1485,16.37,121.0
1,Albania,1991,1485,15.36,121.0
2,Albania,1992,1485,16.06,121.0
3,Albania,1993,1485,16.05,121.0
4,Albania,1994,1485,16.96,201.0


In [28]:
# Check for missing data
print("Missing values in the merged DataFrame:")
print(merged_df.isnull().sum())

Missing values in the merged DataFrame:
Area                             0
Year                             0
average_rain_fall_mm_per_year    0
avg_temp                         0
pesticides_tonnes                0
dtype: int64


In [27]:
# Save the merged DataFrame to a CSV file
merged_df.to_csv('./Resources/environmental_factors.csv', index=False)

print("DataFrame successfully saved as 'environmental_factors.csv'")
print("Shape of yield_df:", merged_df.shape)

DataFrame successfully saved as 'environmental_factors.csv'
Shape of yield_df: (4437, 5)
