In [12]:
# Libraries
import pandas as pd

# Clean the Datasets
## Crops Dataset

In [13]:
# Load the raw crops table (wide layout: one column per year) and preview
# the first five rows.
crops = pd.read_csv('Crops.csv')
crops.head(5)

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Unit,Y1961,...,Y2012,Y2013,Y2014,Y2015,Y2016,Y2017,Y2018,Y2019,Y2020,Y2021
0,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,ha,,...,13490.0,14114.0,13703.0,14676.0,19481.0,19793.0,20053.0,29203.0,22134.0,21685.0
1,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5419,Yield,hg/ha,,...,45960.0,29910.0,19996.0,16521.0,16859.0,13788.0,17161.0,13083.0,17759.0,18748.0
2,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5510,Production,tonnes,,...,62000.0,42215.0,27400.0,24246.0,32843.0,27291.0,34413.0,38205.0,39307.0,40655.23
3,2,'004,Afghanistan,711,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",5312,Area harvested,ha,,...,18500.0,18500.0,30000.0,25000.0,24500.0,26160.0,25220.0,27387.0,26255.0,26287.0
4,2,'004,Afghanistan,711,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",5419,Yield,hg/ha,,...,6757.0,6757.0,7167.0,7200.0,7075.0,6970.0,7866.0,6902.0,7409.0,7379.0


### Pivot df to long format

In [14]:
# The per-year columns of the wide table sit at positions 9 through 69.
years_list = crops.columns[9:70].tolist()

# Reshape to one row per (Area, Item, Element, Unit, year), then keep only
# the production figures that are reported in tonnes.
crops_long = (
    crops
    .melt(id_vars=['Area', 'Item', 'Element', 'Unit'],
          value_vars=years_list, var_name="year", value_name="production")
    # Year labels look like "Y1961"; keep just the digits.
    .assign(year=lambda frame: frame['year'].str.extract(r'(\d+)', expand=False))
    .loc[lambda frame: frame['Element'] == 'Production']
    .loc[lambda frame: frame['Unit'] == 'tonnes']
    # Both columns are constant after the two filters above.
    .drop(columns=['Element', 'Unit'])
    .assign(year=lambda frame: frame['year'].astype(int))
    .drop_duplicates()
)

crops_long.head(143)

Unnamed: 0,Area,Item,year,production
2,Afghanistan,"Almonds, in shell",1961,
5,Afghanistan,"Anise, badian, coriander, cumin, caraway, fenn...",1961,
8,Afghanistan,Apples,1961,15100.0
11,Afghanistan,Apricots,1961,32000.0
15,Afghanistan,Barley,1961,378000.0
...,...,...,...,...
346,Albania,Groundnut oil,1961,
349,Albania,"Hen eggs in shell, fresh",1961,2745.0
354,Albania,Hop cones,1961,
355,Albania,"Horse meat, fresh or chilled",1961,


## Fertilizer
### We'll keep doing the same pivoting method

In [15]:
fert = pd.read_csv('Fertilizer.csv')

# Select the year columns by NAME ("Y" followed by four digits) rather than by
# position. A fixed positional slice silently skips or adds columns whenever a
# file has a different number of leading id columns.
# NOTE(review): the previous positional slice (9:70) produced 1962 as the first
# year, which suggests the first year column was being skipped -- confirm
# against Fertilizer.csv.
years_list = fert.filter(regex=r'^Y\d{4}$').columns.tolist()

# Melt to get one row per (Area, Item, Element, Unit, year)
fert_long = pd.melt(fert, id_vars=['Area', 'Item', 'Element', 'Unit'],
                    value_vars=years_list, var_name="year")

# Get rid of Y before year
fert_long['year'] = fert_long['year'].str.extract(r'(\d+)', expand=False)

fert_long = fert_long[fert_long['Element'] == 'Agricultural Use']

# Source item name -> output column name, in the output column order.
nutrient_cols = {
    "Nutrient nitrogen N (total)": "nitrogen",
    "Nutrient phosphate P2O5 (total)": "phosphate",
    "Nutrient potash K2O (total)": "potassium",
}

# Keep only the three nutrients, at most one row per (Area, year, Item) so the
# reshape below has unique keys.
fert_use = fert_long[fert_long['Item'].isin(list(nutrient_cols))]
fert_use = fert_use.drop_duplicates(subset=['Area', 'year', 'Item'], keep='first')

# Create one column per fertilizer type, aligned on the (Area, year) KEY.
# Aligning the three nutrients by row position is only correct if every
# (Area, year) has all three rows in the same order; a single missing row
# would shift every later value onto the wrong country/year. Keyed alignment
# leaves NaN where a nutrient is missing instead.
fert_com = (
    fert_use
    .set_index(['Area', 'year', 'Item'])['value']
    .unstack('Item')
    # Guarantee all three columns exist even if a nutrient is absent entirely.
    .reindex(columns=list(nutrient_cols))
    .rename(columns=nutrient_cols)
    .rename_axis(columns=None)
    .reset_index()
)
fert_com['year'] = fert_com['year'].astype(int)

# Keep the column layout downstream merges and the exported CSV already use,
# and order rows by year then Area.
fert_com = (
    fert_com[['nitrogen', 'phosphate', 'potassium', 'Area', 'year']]
    .sort_values(['year', 'Area'])
    .reset_index(drop=True)
)

# Each (Area, year) must appear once, otherwise the merge with the crops
# table would multiply rows.
assert not fert_com.duplicated(subset=['Area', 'year']).any()

fert_com.head()


Unnamed: 0,nitrogen,phosphate,potassium,Area,year
0,1000.0,100.0,,Afghanistan,1962
1,2000.0,2500.0,700.0,Albania,1962
2,15000.0,25000.0,15000.0,Algeria,1962
3,500.0,300.0,200.0,Angola,1962
4,,,,Antigua and Barbuda,1962


## Prices

In [16]:
prices = pd.read_csv('Prices.csv')
prices.head(5)

# In this table the per-year columns begin at position 11.
years_list = prices.columns[11:75].tolist()

# Reshape to long format and keep only producer prices in USD per tonne.
# NOTE(review): 'Months' is dropped without being filtered; if the table holds
# more than one Months value per (Area, Item, year), several price rows per
# key survive drop_duplicates -- confirm against Prices.csv.
prices_long = (
    prices
    .melt(id_vars=['Area', 'Item', 'Element', 'Months'],
          value_vars=years_list, var_name="year")
    # Year labels carry a letter prefix; keep just the digits.
    .assign(year=lambda frame: frame['year'].str.extract(r'(\d+)', expand=False))
    .loc[lambda frame: frame['Element'] == 'Producer Price (USD/tonne)']
    .drop(columns=['Element', 'Months'])
    .assign(value=lambda frame: frame['value'].astype(float))
    .assign(year=lambda frame: frame['year'].astype(int))
    .drop_duplicates()
)

prices_long.head()

Unnamed: 0,Area,Item,year,value
125,Albania,Apples,1991,
135,Albania,Apricots,1991,
151,Albania,Barley,1991,
167,Albania,"Beans, dry",1991,
184,Albania,"Broad beans and horse beans, green",1991,


## Rain Data

In [17]:
# Load rainfall, shorten the column names, and keep one row per (Area, year).
# The " Area" key (leading space) is part of the rename map on purpose.
rain = (
    pd.read_csv("rainfall.csv")
    .rename(columns={"average_rain_fall_mm_per_year": "avg_rain",
                     "Year": "year",
                     " Area": "Area"})
    .drop_duplicates(subset=['Area', 'year'], keep="first")
)
rain.head()

Unnamed: 0,Area,year,avg_rain
0,Afghanistan,1985,327
1,Afghanistan,1986,327
2,Afghanistan,1987,327
3,Afghanistan,1989,327
4,Afghanistan,1990,327


## Temp Data

In [18]:
# Load temperatures, rename "country" to "Area" so it shares a key name with
# the other tables, and keep one row per (Area, year).
# NOTE(review): the preview shows mis-encoded names (e.g. "CÃ´te D'Ivoire");
# such rows may not match any Area in the later merges -- confirm the file's
# encoding.
temp = (
    pd.read_csv("temp.csv")
    .rename(columns={"country": "Area"})
    .drop_duplicates(subset=['Area', 'year'], keep="first")
)
temp.head()

Unnamed: 0,year,Area,avg_temp
0,1849,CÃ´te D'Ivoire,25.58
1,1850,CÃ´te D'Ivoire,25.52
2,1851,CÃ´te D'Ivoire,25.67
3,1852,CÃ´te D'Ivoire,
4,1853,CÃ´te D'Ivoire,


## Look at Countries not Matching

In [19]:
# Distinct Area names in the crops table (the trailing ';' hides the output).
crops.Area.unique();

In [20]:
# Distinct Area names in the rainfall table (the trailing ';' hides the output).
rain.Area.unique();

# Combine UN DFs

In [21]:
# Crops + fertilizer share (Area, year); prices additionally match on Item.
# Inner joins: only keys present on both sides survive.
merge_df = crops_long.merge(fert_com, how='inner', on=['Area', 'year'])
merge_df2 = merge_df.merge(prices_long, how='inner', on=['Area', 'year', 'Item'])

# Fix UN Country Names
Many country (Area) names don't match between the UN and Kaggle data, so we need to rename them so that they match.



In [22]:
# UN spellings that need to be changed
UN_country = [
    "United States of America",
    "United Kingdom of Great Britain and Northern Ireland",
    "Venezuela (Bolivarian Republic of)",
    "Türkiye",
    "Iran (Islamic Republic of)",
    "Russian Federation",
    "United Republic of Tanzania",
    "Democratic Republic of the Congo",
    "Congo",
    "Côte d'Ivoire",
    "China, mainland",
]

# Names we want instead, paired with UN_country by position
country_want = [
    "United States",
    "United Kingdom",
    "Venezuela, RB",
    "Turkey",
    "Iran",
    "Russia",
    "Tanzania",
    "Congo, Dem. Rep.",
    "Congo, Rep.",
    "Cote d'Ivoire",
    "China",
]

# The lists are matched up by position, so their lengths have to agree
print("Lists are same length, proceed to changing names"
      if len(UN_country) == len(country_want)
      else "Check lists! Not the same length!")

Lists are same length, proceed to changing names


In [23]:
# Changing Names
# Build the cleaned frame as a NEW object. Plain assignment
# (merge_df2_clean = merge_df2) only creates an alias, so renaming through it
# would also rewrite merge_df2, and a loop that reads merge_df2['Area'] on each
# pass would keep only the last replacement if a real copy were ever used.
# One dict-based replace applies every rename at once; it matches whole cell
# values, so "Congo" does not touch "Democratic Republic of the Congo".
name_map = dict(zip(UN_country, country_want))
merge_df2_clean = merge_df2.assign(Area=merge_df2['Area'].replace(name_map))


## Merge in kaggle data

In [24]:
# Attach rainfall and temperature; inner joins keep only (Area, year) pairs
# present in every table.
merge_df3 = pd.merge(merge_df2_clean, rain, how='inner', left_on=['year', 'Area'], right_on=['year','Area'])
merge_df4 = pd.merge(merge_df3, temp, how="inner", left_on=['Area', 'year'], right_on=['Area', 'year'])

# production is in tonnes and value is the producer price in USD/tonne,
# so the product is the total production value in USD.
merge_df4['total_val_usd'] = merge_df4['production']*merge_df4['value']

# Remove Special characters, causing problems
# Use a raw string and request regex matching explicitly: without regex=True,
# pandas 2.x treats the pattern as the literal text "\W" and replaces nothing,
# and '\W' in a non-raw string is an invalid escape sequence.
merge_df4['Item'] = merge_df4['Item'].str.replace(r'\W', ' ', regex=True)
merge_df4 = merge_df4.drop_duplicates()

  merge_df4['Item'] = merge_df4['Item'].str.replace('\W', ' ')


In [25]:
# Write the merged table to disk without the index column.
# NOTE(review): this is an absolute path on one user's machine; the cell will
# fail elsewhere unless the directory exists -- consider a relative data dir.
output_path = 'C:/Users/HunterBlum/OneDrive/School/SanDiego/Courses/Cloud/CloudFinal-Team1/Data/df_merged.csv'
merge_df4.to_csv(output_path, index=False)