In [1]:
import pymongo
import pandas as pd
from pymongo import MongoClient


import numpy as np
import pandas as pd
import scipy.stats as stats

import datetime

import matplotlib.pyplot as plt
%matplotlib inline

plt.style.use('ggplot')

In [2]:
# Connect with pymongo's default connection settings (no host/port given).
client = MongoClient()

# Subscript access selects the same database / collection as attribute access.
db_eur = client['eur_scraper']
collection_eur = db_eur['rates']

In [3]:
# Fetch every document in the collection (find() with no filter) and load
# them into a DataFrame, one row per document.
df_eur = pd.DataFrame(list(collection_eur.find()))

In [4]:
# Inspect column dtypes and non-null counts of the raw scrape.
df_eur.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   _id        53 non-null     object
 1   year       51 non-null     object
 2   day_month  50 non-null     object
 3   eur_rate   49 non-null     object
dtypes: object(4)
memory usage: 1.8+ KB


In [5]:
# The Mongo-generated `_id` key is not needed for the analysis; discard it.
df_eur.drop('_id', axis = 'columns', inplace = True)

In [6]:
# Discard every row in which at least one field is missing
# (axis 0 = rows, how 'any' = a single NaN is enough).
df_eur.dropna(axis = 0, how = 'any', inplace = True)

In [7]:
# Preview the remaining rows after removing `_id` and incomplete records.
df_eur.head()

Unnamed: 0,year,day_month,eur_rate
3,2019,18 Sep.,−0.50
4,2016,16 Mar.,−0.40
5,2015,9 Dec.,−0.30
6,2014,10 Sep.,−0.20
7,2014,11 Jun.,−0.10


In [8]:
# Build a single 'day month year' string per row, separated by one space.
df_eur['date'] = df_eur['day_month'].str.cat(df_eur['year'], sep = ' ')

In [9]:
def date_converter(date):
    
    """
    Convert a 'day month year' date string to 'YYYY-MM-DD' form.

    PARAMETERS
    date - string of whitespace-separated tokens where
           * the first token is the day of month (1 or 2 digits),
           * the second token starts with a 3-letter English month
             abbreviation ('Sep', 'Sep.', 'sep.', 'September', ...),
           * the last token is the 4-digit year.
           Any tokens between the month and the year (e.g. a scraped
           footnote marker) are ignored.
    
    RETURNS
    date in new string format '4-digit-year-2-digit-month-2-digit-day'

    RAISES
    IndexError - if the string has fewer than two tokens
    KeyError   - if the month token is not a recognised abbreviation
    """
    
    dct_month = {
        'Jan': '01',
        'Feb': '02',
        'Mar': '03',
        'Apr': '04',
        'May': '05',
        'Jun': '06',
        'Jul': '07',
        'Aug': '08',
        'Sep': '09',
        'Oct': '10',
        'Nov': '11',
        'Dec': '12'
    }
    
    lst_date = date.split()
        
    # year is always the last token, even when extra tokens precede it
    str_yr = lst_date[-1]
    
    # only the first three letters matter; capitalize() makes the lookup
    # case-insensitive and drops any trailing '.' via the slice
    str_mo = dct_month[lst_date[1][:3].capitalize()]
    
    # left-pad single-digit days with a zero
    str_d = lst_date[0].zfill(2)
    
    return f'{str_yr}-{str_mo}-{str_d}'

In [10]:
# Rewrite each 'day month year' string as 'YYYY-MM-DD'.
df_eur['date'] = df_eur['date'].map(date_converter)

In [11]:
# Parse the 'YYYY-MM-DD' strings produced by date_converter into pandas
# datetime values (yearfirst tells the parser the year leads the string).
df_eur['date'] = pd.to_datetime(df_eur['date'], yearfirst = True)

In [12]:
# Order rows chronologically, oldest first. The original index labels are
# kept, so the index is no longer monotonically increasing after this.
df_eur.sort_values(by = 'date', inplace = True)

In [13]:
# Check the parsed dates and the chronological ordering.
df_eur.head()

Unnamed: 0,year,day_month,eur_rate,date
51,1999,1 Jan.,2.0,1999-01-01
50,1999,4 Jan. 1\n,2.75,1999-01-04
49,1999,22 Jan.,2.0,1999-01-22
48,1999,9 Apr.,1.5,1999-04-09
47,1999,5 Nov.,2.0,1999-11-05


In [14]:
# The separate year / day_month strings are now redundant with `date`.
df_eur.drop(['year', 'day_month'], axis = 1, inplace = True)

In [15]:
# Check dtypes after the date conversion and column removal.
df_eur.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49 entries, 51 to 3
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   eur_rate  49 non-null     object        
 1   date      49 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 1.1+ KB


In [16]:
def _parse_rate(raw):
    """
    Convert one scraped rate string to a float.

    The scraped negative rates use the Unicode minus sign (U+2212), which
    float() does not accept. Dash-like characters are therefore mapped to
    an ASCII '-' first. Surrounding whitespace is stripped, so values such
    as ' 2.0' or '+2.0' keep their real sign instead of being treated as
    negative merely because the first character is not a digit.

    Raises ValueError if the cleaned text is not a valid number.
    """
    text = str(raw).strip()
    for dash in ('\u2212', '\u2013', '\u2014'):
        text = text.replace(dash, '-')
    return float(text)


# Parsed rates, in the same row order as df_eur['eur_rate'].
new_r = [_parse_rate(r) for r in df_eur['eur_rate']]

In [17]:
# Overwrite the string rates with the parsed floats. new_r was built by
# iterating this same column, so the positional list assignment lines up
# row for row.
df_eur['eur_rate'] = new_r

In [18]:
# Check the dtype of eur_rate after the conversion.
df_eur.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49 entries, 51 to 3
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   eur_rate  49 non-null     float64       
 1   date      49 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(1)
memory usage: 1.1 KB


In [19]:
# Rate of the previous row (rows are in date order after the earlier sort).
# The first row has no predecessor, so its lag is NaN.
df_eur['eur_rate_lag1'] = df_eur['eur_rate'].shift(1)

In [20]:
# Absolute change versus the previous observation.
df_eur['eur_rate_dif'] = df_eur['eur_rate'] - df_eur['eur_rate_lag1']

# Relative change in percent. The denominator is the magnitude of the
# previous rate so that the sign always matches eur_rate_dif: with a negative
# previous rate, dividing by the signed value would report a cut
# (e.g. -0.10 -> -0.20) as a positive change. A previous rate of exactly 0
# still produces +/-inf here (NaN for 0/0).
df_eur['eur_rate_change%'] = 100*(df_eur['eur_rate_dif']/df_eur['eur_rate_lag1'].abs())

In [21]:
# Display the frame with the lag, difference and percent-change columns.
df_eur

Unnamed: 0,eur_rate,date,eur_rate_lag1,eur_rate_dif,eur_rate_change%
51,2.0,1999-01-01,,,
50,2.75,1999-01-04,2.0,0.75,37.5
49,2.0,1999-01-22,2.75,-0.75,-27.272727
48,1.5,1999-04-09,2.0,-0.5,-25.0
47,2.0,1999-11-05,1.5,0.5,33.333333
46,2.25,2000-02-04,2.0,0.25,12.5
45,2.5,2000-03-17,2.25,0.25,11.111111
44,2.75,2000-04-28,2.5,0.25,10.0
43,3.25,2000-06-09,2.75,0.5,18.181818
42,3.25,2000-06-28,3.25,0.0,0.0


In [22]:
# Remove the earliest observation: it has no previous rate, so its lag (and
# therefore its change columns) is NaN. Selecting it by the missing lag rather
# than by a hard-coded index label keeps working if the source row order
# changes, and makes the cell safe to run more than once.
df_eur.dropna(subset = ['eur_rate_lag1'], inplace = True)

In [23]:
# Replace any remaining NaN with 0 (applies to every column of the frame).
# NOTE(review): presumably these are 0/0 results in eur_rate_change% where
# the rate stayed at 0 between two observations - confirm.
df_eur.fillna(0,inplace = True)

In [24]:
# A change away from a previous rate of exactly 0 divides by zero and yields
# +/-inf; substitute +100 / -100 instead (applies frame-wide).
# NOTE(review): 100 looks like an arbitrary placeholder magnitude - confirm
# it is appropriate for downstream use.
df_eur.replace([np.inf, -np.inf], [100.0,-100.0], inplace = True)

In [25]:
# Display the cleaned frame before it is stored.
df_eur

Unnamed: 0,eur_rate,date,eur_rate_lag1,eur_rate_dif,eur_rate_change%
50,2.75,1999-01-04,2.0,0.75,37.5
49,2.0,1999-01-22,2.75,-0.75,-27.272727
48,1.5,1999-04-09,2.0,-0.5,-25.0
47,2.0,1999-11-05,1.5,0.5,33.333333
46,2.25,2000-02-04,2.0,0.25,12.5
45,2.5,2000-03-17,2.25,0.25,11.111111
44,2.75,2000-04-28,2.5,0.25,10.0
43,3.25,2000-06-09,2.75,0.5,18.181818
42,3.25,2000-06-28,3.25,0.0,0.0
41,3.5,2000-09-01,3.25,0.25,7.692308


In [26]:
# Persist df_eur with IPython's %store magic so that another notebook can
# reload it with `%store -r df_eur`.
%store df_eur

Stored 'df_eur' (DataFrame)
