In [None]:
#Dependencies
import datetime
import os
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser
from sqlalchemy import create_engine
from sqlalchemy import inspect

In [None]:
#INFLATION DATAFRAME PROCESS

In [None]:
# Extract: scrape every HTML table from the inflation.eu page for Mexico.
url = 'https://www.inflation.eu/inflation-rates/mexico/historic-inflation/cpi-inflation-mexico.aspx'

# A timeout keeps the cell from hanging forever if the site does not answer.
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')

# read_html downloads the page itself and returns one DataFrame per <table> found.
table = pd.read_html(url)

# Show only the position and shape of each parsed table: enough to locate the
# one needed downstream without flooding the output with every table's content.
for position, frame in enumerate(table):
    print(position, frame.shape)

In [None]:
#Transform
# Position 8 selects the table of interest among everything read_html returned.
# NOTE(review): the position was presumably found by inspection - re-check it if the page layout changes.
df = table[8]

# Row 0 is discarded (presumably the header row, since the columns are still numbered).
df2 = df.drop(0)

# Columns 0-1 and 3-4 hold two label/value pairs side by side. The second pair is
# relabelled to 0/1 and stacked under the first so each row carries one label and one value.
df3 = df2[[0, 1]].copy()
df4 = df2[[3, 4]].rename(columns={3: 0, 4: 1})

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df5 = pd.concat([df3, df4], ignore_index=True)

# Strip the 'CPI Mexico ' prefix from the label column, and blank out any cell that is
# exactly "ba" followed by one character.
df5[0] = df5[0].replace(regex=[r'^ba.$', 'CPI Mexico '], value='')
df5 = df5.rename(columns={0: 'year', 1: 'inflation'})

# Remove the percent sign (same "ba." clean-up as above) so the values can be cast to float.
df5['inflation'] = df5['inflation'].replace(regex=[r'^ba.$', '%'], value='')
df5['inflation'] = df5['inflation'].astype('float64')

print(df5.dtypes)
df5.head()


In [None]:
#INPC DATAFRAME PROCESS

In [None]:
# Extract: load the Excel workbook (first sheet, pandas defaults) into a DataFrame
excel_file = "Indicadores20200416194631.xls"
df = pd.read_excel(io=excel_file)
df

In [None]:
#Transform data
# Index labels of the rows removed before parsing: the first four and 607-609.
# NOTE(review): presumably header/footer lines of the export - confirm against the file.
non_data_rows = [0, 1, 2, 3, 607, 608, 609]
df2 = df.drop(index=non_data_rows)

# Give the two columns short names.
df3 = df2.rename(columns={
    'Instituto Nacional de Estadística y Geografía (INEGI)': 'period',
    'Unnamed: 1': 'inpc',
})

# inpc is cast to float; year is the first four characters of the period label
# and month is its last two characters (both stay as text).
period_text = df3['period'].str
df3 = df3.assign(
    inpc=df3['inpc'].astype('float64'),
    year=period_text[:4],
    month=period_text[-2:],
)
df3.head()


In [None]:
#BARREL PRICE DATAFRAME PROCESS

In [None]:
# Extract: read the barrel-price CSV into a DataFrame and preview the first rows
barrelPrice_csv = "Consulta_Precio_Barril_Petroleo.csv"
barrelPrice_df = pd.read_csv(filepath_or_buffer=barrelPrice_csv)
barrelPrice_df.head()

In [None]:
#Clean and Transform Dataframe
#1. Parse "Fecha" as datetime. dayfirst=True tells pandas the first number is the
#   day; without it ambiguous dates would be read month-first.
barrelPrice_df['Fecha'] = pd.to_datetime(barrelPrice_df['Fecha'], dayfirst=True)

#2. Remove the rows whose price is the "N/E" placeholder.
#   A filtered copy is used instead of drop(..., inplace=True) so the later column
#   assignments act on an independent frame.
barrelPrice_df = barrelPrice_df[barrelPrice_df['SI744'] != "N/E"].copy()

#3. With the placeholder gone, the price column can be cast to float.
barrelPrice_df['SI744'] = barrelPrice_df['SI744'].astype('float64')

#4. Split the date into zero-padded month ('01'..'12') and four-digit year, both as text.
barrelPrice_df["month"] = barrelPrice_df['Fecha'].dt.strftime('%m')
barrelPrice_df["year"] = barrelPrice_df['Fecha'].dt.strftime('%Y')

#5. Average price per (year, month): one row per month in the final frame.
barrelPrice_df2 = barrelPrice_df.copy()
# numeric_only=True keeps the datetime "Fecha" column out of the aggregation. Older pandas
# did this implicitly; pandas 2.x would otherwise average the dates and add that column
# to the result (and to the table loaded into the database).
barrelPrice_avg = barrelPrice_df2.groupby(["year", "month"]).mean(numeric_only=True)

# Turn the (year, month) group keys back into ordinary columns.
barrelPrice_wAVG_df = barrelPrice_avg.reset_index()

# Rename column 'SI744' to a descriptive name.
barrelPrice_wAVG_df = barrelPrice_wAVG_df.rename(columns={"SI744": "barrel_avg_price"})

# Round the average barrel price to 4 decimals.
barrelPrice_wAVG_df["barrel_avg_price"] = barrelPrice_wAVG_df["barrel_avg_price"].round(4)
barrelPrice_wAVG_df

In [None]:
#GDP  DATAFRAME PROCESS

In [None]:
# Path of the GDP extract
gdp_csv_path = "GDP2_file.csv"

# Load it, treating "," as the thousands separator so the figures parse as numbers
gdp_data = pd.read_csv(gdp_csv_path, thousands=',')
gdp_data.head()

In [None]:
#Transform Data
# Reshape from wide (one column per period) to long (one row per period).
df = pd.melt(gdp_data,
             id_vars=["Concepto"],
             var_name="Period",
             value_name="GDP Index")

# Split the period label: the first two characters are the quarter code,
# everything from position 3 onwards is the year.
df["Year"] = df["Period"].str.slice(start=3)
df["Quarter"] = df["Period"].str.slice(stop=2)

# Zeros become NaN, rows in which every column is NaN are dropped, then NaN goes back to 0.
# NOTE(review): dropna(how='all') only removes fully empty rows, so the net effect is
# presumably just "missing GDP values become 0" - confirm that this is the intent.
df = df.replace(0, np.nan)
df = df.dropna(how='all', axis=0)
df = df.replace(np.nan, 0)

df = df.rename(columns={'Period': 'period', 'GDP Index': 'gdp', 'Year': 'year', 'Quarter': 'quarter'})
del df["Concepto"]
del df["period"]

# Convert quarter codes to numbers with an explicit assignment. The previous
# df["quarter"].replace(..., inplace=True) was a chained in-place call, which no longer
# updates df under pandas Copy-on-Write and would make the lookup below fail.
quarter_numbers = {"T1": 1, "T2": 2, "T3": 3, "T4": 4}
df["quarter"] = df["quarter"].map(quarter_numbers)

# Each quarter covers three calendar months; attach the list, then explode to one row per month.
dict_q = {1: [1, 2, 3], 2: [4, 5, 6], 3: [7, 8, 9], 4: [10, 11, 12]}
monthList = [dict_q[quarter] for quarter in df["quarter"]]
df["month"] = monthList

#List values displays into multiple rows
df = df.explode("month")

print(df.dtypes)
df.head()

In [None]:
#Connection to DB
# The password is read from the PROJECT2_DB_PASSWORD environment variable when it is set,
# so the real credential never has to be typed into the notebook. The literal placeholder
# remains as the fallback.
db_password = os.environ.get("PROJECT2_DB_PASSWORD", "PASSWORD")
connection_string = f"postgres:{db_password}@localhost:5432/Project2"
engine = create_engine(f'postgresql://{connection_string}')

# Confirm tables
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0; the inspector
# returns the same list of table names.
inspect(engine).get_table_names()

In [None]:
#Load information DF into DB
# Target table -> DataFrame, in load order:
#   inflation    <- inflation dataframe
#   indice       <- INPC dataframe
#   barrel_price <- monthly average barrel price dataframe
#   gdp          <- GDP dataframe
frames_by_table = {
    'inflation': df5,
    'indice': df3,
    "barrel_price": barrelPrice_wAVG_df,
    "gdp": df,
}

# Rows are appended to the existing tables; the DataFrame index is not written.
for table_name, frame in frames_by_table.items():
    frame.to_sql(name=table_name, con=engine, if_exists='append', index=False)

In [None]:
# Check the upload: read back every row of the INFLATION table
inflation_rows = pd.read_sql_query(sql='select * from inflation', con=engine)
inflation_rows



In [None]:
# Check the upload: read back every row of the INDICE table (INPC)
indice_rows = pd.read_sql_query(sql='select * from indice', con=engine)
indice_rows


In [None]:
# Check the upload: read back every row of the BARREL_PRICE table
barrel_price_rows = pd.read_sql_query(sql='select * from barrel_price', con=engine)
barrel_price_rows


In [None]:
# Check the upload: read back every row of the GDP table
gdp_rows = pd.read_sql_query(sql='select * from gdp', con=engine)
gdp_rows