Skip to content

Commit 81ca330

Browse files
committed
Changes
1. Adding write CSV output to functions. 2. Changing structure of files. 3. Adding function to create earliest and most recent columns. 4. Adding time calculations.
1 parent 496aff8 commit 81ca330

File tree

3 files changed

+294
-143
lines changed

3 files changed

+294
-143
lines changed

DataUpdate.py

Lines changed: 65 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -144,23 +144,16 @@ def IHMEHistoricalDeathsFile():
144144
writer.save()
145145

146146

147-
def FAOFBS():
148-
import requests
147+
def FAOFBS(write_csv_output=False):
149148
import numpy as np
150-
import matplotlib.pyplot as plt
151149
import pandas as pd
152-
import csv
153-
import xlrd
154-
import matplotlib.lines as mlines
155-
import matplotlib.transforms as mtransforms
156-
import xlsxwriter
157-
import dask.dataframe as dd
158-
import statsmodels.api as sm
159150

160-
Country_Concord = pd.read_csv('input\CountryConcordFAO.csv', encoding="ISO-8859-1")
161-
concord_table = pd.read_csv('input\Aggregation for crop type.csv')
162-
series_concord_table = pd.read_csv('input\FAOSeriesConcordance.csv')
163-
data = pd.read_csv(r'input\FoodBalanceSheets_E_All_Data_(Normalized).csv',
151+
152+
153+
Country_Concord = pd.read_csv('input\FAO\CountryConcordFAO.csv', encoding="ISO-8859-1")
154+
concord_table = pd.read_csv('input\FAO\Aggregation for crop type.csv')
155+
series_concord_table = pd.read_csv('input\FAO\FAOSeriesConcordance.csv')
156+
data = pd.read_csv(r'input\FAO\FoodBalanceSheets_E_All_Data_(Normalized).csv',
164157
encoding="ISO-8859-1",chunksize=100000)
165158

166159
chunk_list=[]
@@ -184,27 +177,13 @@ def FAOFBS():
184177
data = pd.DataFrame(data.to_records())
185178
data.columns = [hdr.replace("('sum', 'Value',", "").replace(")", "").replace("'", "") \
186179
for hdr in data.columns]
187-
return (data)
180+
data=data.rename(columns={"Area":"CountryName"})
188181

182+
if write_csv_output==True:
183+
data.to_csv('output\CSV\FAOFBS.csv')
189184

190-
def FAOFBSFile():
191-
import requests
192-
import numpy as np
193-
import matplotlib.pyplot as plt
194-
import pandas as pd
195-
import csv
196-
import xlrd
197-
import matplotlib.lines as mlines
198-
import matplotlib.transforms as mtransforms
199-
import xlsxwriter
200-
import dask.dataframe as dd
201-
import statsmodels.api as sm
202-
203-
p = FAOFBS()
204-
writer = pd.ExcelWriter('FAOFBS2019.xlsx', engine='xlsxwriter')
205-
p.to_excel(writer, sheet_name='FAOFBS')
185+
return (data)
206186

207-
writer.save()
208187

209188

210189
def FAOFBSFish():
@@ -217,7 +196,6 @@ def FAOFBSFish():
217196
import matplotlib.lines as mlines
218197
import matplotlib.transforms as mtransforms
219198
import xlsxwriter
220-
import dask.dataframe as dd
221199
import statsmodels.api as sm
222200

223201
Country_Concord = pd.read_csv('input\CountryConcordFAO.csv', encoding="ISO-8859-1")
@@ -264,7 +242,6 @@ def FAOFBSFishFile():
264242
import matplotlib.lines as mlines
265243
import matplotlib.transforms as mtransforms
266244
import xlsxwriter
267-
import dask.dataframe as dd
268245
import statsmodels.api as sm
269246

270247
p = FAOFBSFish()
@@ -364,53 +341,33 @@ def UISDataFile():
364341

365342
writer.save()
366343

367-
368-
def AQUASTATData():
369-
import requests
344+
def AQUASTATData(write_csv_output=False):
370345
import numpy as np
371-
import matplotlib.pyplot as plt
372346
import pandas as pd
373-
import csv
374-
import xlrd
375-
import matplotlib.lines as mlines
376-
import matplotlib.transforms as mtransforms
377-
import xlsxwriter
378-
import dask.dataframe as dd
379-
import statsmodels.api as sm
380347

381-
data = pd.read_excel('input\AQUASTAT.xlsx')
348+
data = pd.read_csv(r'input\AQUASTAT\AQUASTAT.csv', encoding="ISO-8859-1")
382349

383-
country_concordance = pd.read_excel('input\CountryConcordanceAQUASTAT.xlsx')
350+
country_concordance = pd.read_csv(r'input\AQUASTAT\CountryConcordanceAQUASTAT.csv', encoding="ISO-8859-1")
384351

385352
data = pd.merge(data, country_concordance, how="left", left_on="Area", right_on="Area name")
386353

387-
series_concordance = pd.read_excel('input\SeriesConcordanceAQUASTAT.xlsx')
388-
389-
# print (series_concordance.head())
390-
# print(data.head())
354+
series_concordance = pd.read_csv(r'input\AQUASTAT\SeriesConcordanceAQUASTAT.csv', encoding="ISO-8859-1")
391355

392356
data = pd.merge(data, series_concordance, how="left", left_on="Variable Name", right_on="Series name in Aquastat")
393-
# print(data.head())
394-
# print(data['Country Name in IFs'].unique)
357+
395358
data = data.drop(['Variable Id', 'Area Id', 'Symbol', 'Md'], axis=1)
396359

397360
data = data.dropna(how='any')
398-
data = pd.pivot_table(data, index=['Country Name in IFs', 'Year'], columns=['Series name in IFs'], values=['Value'],
361+
data = pd.pivot_table(data, index=['CountryName', 'Year'], columns=['SeriesName'], values=['Value'],
399362
aggfunc=[np.sum])
400363
data = pd.DataFrame(data.to_records())
401364
data.columns = [hdr.replace("('sum', 'Value',", "").replace(")", "").replace("'", "") \
402365
for hdr in data.columns]
403-
return (data)
404-
405366

406-
def AQUASTATDataFile():
407-
import pandas as pd
408-
p = AQUASTATData()
409-
p = p.reset_index()
410-
writer = pd.ExcelWriter('AQUASTAT.xlsx', engine='xlsxwriter')
411-
p.to_excel(writer, sheet_name='1', merge_cells=False)
412-
writer.save()
367+
if write_csv_output==True:
368+
data.to_csv('output/CSV/AQUASTATOutput.csv')
413369

370+
return (data)
414371

415372
def IMFGFSRevenueData():
416373
import numpy as np
@@ -465,58 +422,33 @@ def IMFGFSRevenueDataFile():
465422
writer.save()
466423

467424

468-
def IMFGFSExpenditureData():
469-
import requests
425+
def IMFGFSExpenditureData(write_csv_output=False):
470426
import numpy as np
471-
import matplotlib.pyplot as plt
472427
import pandas as pd
473-
import csv
474-
import xlrd
475-
import matplotlib.lines as mlines
476-
import matplotlib.transforms as mtransforms
477-
import xlsxwriter
478-
import statsmodels.api as sm
479428

480-
data = pd.read_csv('input\GFSCOFOG_02-23-2018 23-30-06-28.csv',chunksize=10000)
481-
concord_table = pd.read_excel('input\CountryConcordanceIMF.xlsx')
429+
data = pd.read_csv('input\IMFGFS\GFSCOFOG_02-23-2018 23-30-06-28.csv',chunksize=10000)
430+
concord_table = pd.read_csv('input\IMFGFS\CountryConcordanceIMF.csv', encoding="ISO-8859-1")
482431
chunk_list=[]
483432
for chunk in data:
484433
chunk['FuncSector'] = chunk[str('COFOG Function Name')] + chunk[str('Sector Name')]
485434
chunk = chunk.merge(concord_table, on="Country Name", how='left')
486-
chunk=chunk[chunk['Country name in IFs'].notnull()]
435+
chunk=chunk[chunk['CountryName'].notnull()]
487436
chunk_list.append(chunk)
488437

489438
data=pd.concat(chunk_list)
490439

491-
492-
493-
data = pd.pivot_table(data, index=["Country name in IFs", "Unit Name", 'Time Period'], values=['Value'],
440+
data = pd.pivot_table(data, index=["CountryName", "Unit Name", 'Time Period'], values=['Value'],
494441
columns=['FuncSector'], aggfunc=[np.sum])
495442
data = pd.DataFrame(data.to_records())
496443
data.columns = [hdr.replace("('sum', 'Value',", "").replace(")", "").replace("'", "") \
497444
for hdr in data.columns]
498-
print("IMF GFS Expenditure data done")
499-
return (data)
500445

446+
data=data.rename(columns={"Time Period":"Year"})
501447

502-
def IMFGFSExpenditureDataFile():
503-
import requests
504-
import numpy as np
505-
import matplotlib.pyplot as plt
506-
import pandas as pd
507-
import csv
508-
import xlrd
509-
import matplotlib.lines as mlines
510-
import matplotlib.transforms as mtransforms
511-
import xlsxwriter
512-
import statsmodels.api as sm
513-
514-
p = IMFGFSExpenditureData()
515-
writer = pd.ExcelWriter('IMFGFSEXP.xls', engine='xlsxwriter')
516-
p.to_excel(writer, sheet_name='EXP')
517-
518-
writer.save()
448+
if write_csv_output==True:
449+
data.to_csv('output\CSV\IMFGFSExpOutput.csv')
519450

451+
return (data)
520452

521453
def WDIData():
522454
import numpy as np
@@ -618,4 +550,39 @@ def WDIDataFile():
618550
data = WDIData()
619551
writer = pd.ExcelWriter('WDISeries.xlsx', engine='xlsxwriter')
620552
data.to_excel(writer, sheet_name='WDIData', merge_cells=False)
621-
writer.save()
553+
writer.save()
554+
555+
#A function to write tables to SQL
556+
def Write_to_SQL(df,con):
557+
import pandas as pd
558+
import numpy as np
559+
df2 = df.drop(["CountryName", "Year"], axis=1)
560+
for col in (df2):
561+
data = df[["CountryName", "Year", str(col)]]
562+
data1 = pd.pivot_table(data, values=[str(col)], index=["CountryName"], columns=["Year"],
563+
aggfunc=[np.sum])
564+
565+
# Get Most Recent Year
566+
dataRecent = data.dropna()
567+
dataRecent = dataRecent.sort_values(["CountryName", "Year"])
568+
dataRecent = dataRecent.drop_duplicates("CountryName", keep='last')
569+
dataRecent.reset_index()
570+
dataRecent.columns = ["CountryName", "MostRecentYear", "MostRecentValue"]
571+
572+
# Get Earliest Year
573+
dataEarliest = data.dropna()
574+
dataEarliest = dataEarliest.sort_values(["CountryName", "Year"])
575+
dataEarliest = dataEarliest.drop_duplicates("CountryName", keep='first')
576+
dataEarliest.reset_index()
577+
dataEarliest.columns = ["CountryName", "EarliestYear", "EarliestValue"]
578+
579+
data1 = pd.DataFrame(data1.to_records())
580+
val = "('sum', '" + str(col) + "', "
581+
data1.columns = [hdr.replace(val, "").replace(")", "").replace("'", "") \
582+
for hdr in data1.columns]
583+
584+
data1 = pd.merge(data1, dataRecent, how='left', left_on="CountryName", right_on="CountryName")
585+
data1 = pd.merge(data1, dataEarliest, how='left', left_on="CountryName", right_on="CountryName")
586+
587+
588+
data1.to_sql(name=str("Series").strip() + str(col).strip(), con=con, if_exists="replace", index=False)

GenerateHistSeriesFile.py

Lines changed: 28 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,36 @@
11
import DataUpdate
22
import sqlite3
3-
import pandas as pd
4-
import numpy as np
3+
import time
54

6-
connection=sqlite3.connect("output/IFsHistSeries.db")
5+
#Generate database
6+
connection=sqlite3.connect("output/Database/IFsHistSeries.db")
7+
print('Database created at output/Database/IFsHistSeries.db')
78
cursor=connection.cursor()
89

9-
Aquastat=DataUpdate.AQUASTATData()
10-
Aquastat2=Aquastat.drop(["Country Name in IFs","Year"],axis=1)
11-
12-
13-
for col in (Aquastat2):
14-
data=Aquastat[["Country Name in IFs","Year",str(col)]]
15-
data1=pd.pivot_table(data,values=[str(col)],index=["Country Name in IFs"], columns=["Year"],aggfunc=[np.sum])
16-
17-
#Get Most Recent Year
18-
dataRecent=data.dropna()
19-
dataRecent=dataRecent.sort_values(["Country Name in IFs","Year"])
20-
dataRecent=dataRecent.drop_duplicates("Country Name in IFs",keep='last')
21-
dataRecent.reset_index()
22-
dataRecent.columns=["Country Name in IFs","MostRecentYear","MostRecentValue"]
23-
24-
#Get Earliest Year
25-
dataEarliest = data.dropna()
26-
dataEarliest= dataEarliest.sort_values(["Country Name in IFs", "Year"])
27-
dataEarliest = dataEarliest.drop_duplicates("Country Name in IFs", keep='first')
28-
dataEarliest.reset_index()
29-
dataEarliest.columns = ["Country Name in IFs", "EarliestYear", "EarliestValue"]
30-
31-
data1 = pd.DataFrame(data1.to_records())
32-
val="('sum', '"+str(col)+"', "
33-
data1.columns = [hdr.replace(val,"").replace(")", "").replace("'", "") \
34-
for hdr in data1.columns]
35-
36-
data1=pd.merge(data1,dataRecent,how='left',left_on="Country Name in IFs",right_on="Country Name in IFs")
37-
data1=pd.merge(data1,dataEarliest,how='left',left_on="Country Name in IFs",right_on="Country Name in IFs")
38-
print(data1.head())
39-
40-
data1.to_sql(name=str("Series").strip()+str(col).strip(), con=connection, if_exists="replace", index=False)
41-
42-
43-
44-
45-
IMFExp=DataUpdate.IMFGFSExpenditureData()
46-
IMFExp.to_sql(name="IMFExpenditure", con=connection, if_exists="replace", index=False)
47-
48-
49-
FAO=DataUpdate.FAOFBS()
50-
FAO.to_sql(name="FAOFoodBalanceSheetAggregated", con=connection, if_exists="replace", index=False)
10+
#AQUASTAT Data
11+
print('Starting AQUASTAT data')
12+
start=time.time()
13+
Aquastat=DataUpdate.AQUASTATData(write_csv_output=True)
14+
DataUpdate.Write_to_SQL(Aquastat,connection)
15+
end=time.time()
16+
print(end-start)
17+
18+
19+
#IMFGFS Expenditure data
20+
print('Starting IMF GFS expenditure data')
21+
start=time.time()
22+
IMFExp=DataUpdate.IMFGFSExpenditureData(write_csv_output=True)
23+
DataUpdate.Write_to_SQL(IMFExp,connection)
24+
end=time.time()
25+
print(end-start)
26+
27+
#FAO Data
28+
print('Starting FAO Food balance sheet data')
29+
start=time.time()
30+
FAO=DataUpdate.FAOFBS(write_csv_output=True)
31+
DataUpdate.Write_to_SQL(FAO,connection)
32+
end=time.time()
33+
print(end-start)
5134

5235
WDI=DataUpdate.WDIData()
5336
WDI.to_sql(name="WorldDevelopmentIndicators",con=connection,if_exists="replace",index=False)

0 commit comments

Comments (0)