#### Implement `get_data_by_date()`, which logs in to MyFitnessPal and returns the food-diary page for a given date. BeautifulSoup is used to read the CSRF token from the login page.

In [1]:
from requests import Session
from bs4 import BeautifulSoup as bs
import pandas as pd
from authentication import username, password
from datetime import timedelta, date

# Addresses of the MyFitnessPal pages used below.
login_url = "https://www.myfitnesspal.com/account/login"
diary_url = "https://www.myfitnesspal.com/food/diary"
# `url` is the same address as the login form, kept under its own name.
url = login_url

# Get-data-by-date function: returns the food/diary page for the given date.
def get_data_by_date(date, *, timeout=30):
    """Log in to MyFitnessPal and fetch the food-diary page for one date.

    A fresh ``Session`` is opened on every call: the login page is fetched to
    read its CSRF token, the credentials imported from ``authentication`` are
    posted to the login form, and the diary page is then requested with the
    same session.

    Args:
        date: Value placed in the ``date`` query parameter of the diary URL;
            the callers in this file pass ``"YYYY-MM-DD"`` strings. Inside
            this function the name shadows the imported ``datetime.date``.
        timeout: Seconds to wait on each HTTP request. Without a timeout,
            requests can block indefinitely on an unresponsive server.

    Returns:
        The ``requests`` response object for the diary page.

    Raises:
        requests.HTTPError: If any of the three requests returns a 4xx/5xx
            status.
        RuntimeError: If the login page has no ``csrf-token`` meta tag.
    """
    with Session() as s:
        site = s.get(url, timeout=timeout)
        site.raise_for_status()
        bs_content = bs(site.content, "html.parser")

        # find() returns None when the tag is absent; fail with a clear
        # message instead of "'NoneType' object is not subscriptable".
        token_tag = bs_content.find("meta", {"name": "csrf-token"})
        token = token_tag.get("content") if token_tag is not None else None
        if token is None:
            raise RuntimeError(f"No csrf-token meta tag found on {url}")

        login_data = {
            "username": username,
            "password": password,
            "authenticity_token": token,
            "remember_me": 1,
            "utf8": "true",
        }

        login_response = s.post(login_url, data=login_data, timeout=timeout)
        login_response.raise_for_status()

        # Build the URL once so the printed progress line and the request
        # cannot drift apart.
        page_url = f"{diary_url}?date={date}"
        print(page_url)
        home_page = s.get(page_url, timeout=timeout)
        home_page.raise_for_status()
        return home_page

# Helper for datelist.
def daterange(start_date, end_date):
    """Yield each day from start_date up to, but not including, end_date."""
    span_days = int((end_date - start_date).days)
    yield from (start_date + timedelta(offset) for offset in range(span_days))

# Returns the dates from start_date up to (not including) end_date as strings.
def datelist(start_date=None, end_date=None):
    """Return the days in [start_date, end_date) as "YYYY-MM-DD" strings.

    Args:
        start_date: First day to include. Defaults to 2020-11-14, the start
            date previously hard-coded here.
        end_date: Day at which to stop; it is NOT included, because
            ``daterange`` iterates over ``range((end - start).days)``.
            Defaults to today, so the current day is left out.

    Returns:
        A list of date strings in ascending order; empty when end_date is
        not after start_date.
    """
    if start_date is None:
        start_date = date(2020, 11, 14)
    if end_date is None:
        end_date = date.today()
    return [day.strftime("%Y-%m-%d") for day in daterange(start_date, end_date)]



In [2]:
# Build the list of diary dates to scrape and print it as a sanity check.
dates = datelist()
print(dates)


['2020-11-14', '2020-11-15', '2020-11-16', '2020-11-17', '2020-11-18', '2020-11-19', '2020-11-20', '2020-11-21', '2020-11-22', '2020-11-23', '2020-11-24', '2020-11-25', '2020-11-26', '2020-11-27', '2020-11-28', '2020-11-29', '2020-11-30', '2020-12-01', '2020-12-02', '2020-12-03', '2020-12-04', '2020-12-05', '2020-12-06', '2020-12-07', '2020-12-08', '2020-12-09', '2020-12-10', '2020-12-11', '2020-12-12', '2020-12-13', '2020-12-14', '2020-12-15', '2020-12-16', '2020-12-17', '2020-12-18', '2020-12-19', '2020-12-20', '2020-12-21', '2020-12-22', '2020-12-23', '2020-12-24', '2020-12-25', '2020-12-26', '2020-12-27', '2020-12-28', '2020-12-29', '2020-12-30', '2020-12-31', '2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04']


#### Obtain the data in table form using pandas.read_html(), then append just the "Totals" row for each day to the df_totals DataFrame.

In [53]:
from io import StringIO

# Collect the "Totals" row from each day's diary page.
totals_rows = []
# The loop variable is `day`, not `date`: rebinding `date` at notebook level
# would replace the datetime.date class imported at the top and break any
# later call to datelist().
for day in dates:
    # Wrap the HTML in StringIO: newer pandas deprecates passing literal HTML
    # strings to read_html. Only the first table on the page is used.
    df_list = pd.read_html(StringIO(get_data_by_date(day).text), header=0)
    df = df_list[0]

    # Just access the row with the totals for the day.
    totals_rows.append(df.loc[df['Breakfast'] == "Totals"])

# Concatenate once at the end instead of re-copying the growing frame on
# every iteration. pd.concat rejects an empty list, so keep the original
# result (an empty DataFrame) when there were no dates.
if totals_rows:
    df_totals = pd.concat(totals_rows, ignore_index=True)
else:
    df_totals = pd.DataFrame()


https://www.myfitnesspal.com/food/diary?date=2020-11-14
https://www.myfitnesspal.com/food/diary?date=2020-11-15
https://www.myfitnesspal.com/food/diary?date=2020-11-16
https://www.myfitnesspal.com/food/diary?date=2020-11-17
https://www.myfitnesspal.com/food/diary?date=2020-11-18
https://www.myfitnesspal.com/food/diary?date=2020-11-19
https://www.myfitnesspal.com/food/diary?date=2020-11-20
https://www.myfitnesspal.com/food/diary?date=2020-11-21
https://www.myfitnesspal.com/food/diary?date=2020-11-22
https://www.myfitnesspal.com/food/diary?date=2020-11-23
https://www.myfitnesspal.com/food/diary?date=2020-11-24
https://www.myfitnesspal.com/food/diary?date=2020-11-25
https://www.myfitnesspal.com/food/diary?date=2020-11-26
https://www.myfitnesspal.com/food/diary?date=2020-11-27
https://www.myfitnesspal.com/food/diary?date=2020-11-28
https://www.myfitnesspal.com/food/diary?date=2020-11-29
https://www.myfitnesspal.com/food/diary?date=2020-11-30
https://www.myfitnesspal.com/food/diary?date=202

In [66]:
# Label every totals row with its day; `dates` must supply exactly one entry
# per row of df_totals.
df_totals = df_totals.assign(date=dates)

# Savepoint: persist the raw scrape so the cleaning steps can start from disk.
df_totals.to_csv(r'macros_raw.csv', index=False)


### Cleaning df_totals


In [200]:
# Reload the savepoint, treating '0 -', 'Nan' and '0' as missing values
# (in addition to the markers pandas recognizes by default).
# NOTE(review): '0' as an NA marker also blanks out any genuine zero
# (e.g. 0 g of sugar) - confirm that is intended.

NA_vals = ['0 -', 'Nan', '0']
df_raw = pd.read_csv('macros_raw.csv', na_values = NA_vals)
# Show the first five rows.
df_raw.head(5)

Unnamed: 0,Breakfast,Calories kcal,Carbs g,Fat g,Protein g,Sodium mg,Sugar g,Unnamed: 7,date
0,Totals,1921.0,202 42,65 30,133 28,1422.0,53.0,,2020-11-14
1,Totals,1944.0,239 48,48 22,149 30,755.0,94.0,,2020-11-15
2,Totals,1960.0,253 52,41 19,146 29,886.0,63.0,,2020-11-16
3,Totals,1837.0,221 48,41 20,147 32,2486.0,50.0,,2020-11-17
4,Totals,2038.0,251 48,50 22,157 30,1071.0,57.0,,2020-11-18


In [201]:
# Count the missing values in each column.
df_raw.isnull().sum()

Breakfast          0
Calories  kcal     2
Carbs  g           0
Fat  g             0
Protein  g         0
Sodium  mg         2
Sugar  g           2
Unnamed: 7        52
date               0
dtype: int64

In [202]:
# Remove every column that holds nothing but NA values (the unnamed one).
df_raw = df_raw.dropna(axis=1, how='all')

# Remove the "Breakfast" label column.
df_raw = df_raw.drop(columns=['Breakfast'])

# Remove the row labelled 47: the data was incomplete that day.
df_raw = df_raw.drop(index=47)


In [203]:
# Remove every row that still has a missing value in any column; this is
# what discards the days whose calories were read as NA (0).
df_raw = df_raw.dropna(axis=0, how='any')

In [204]:
# Rename the columns. The assignment is positional, so the seven names must
# match the current left-to-right column order of df_raw.
df_raw.columns = ['calories', 'carbs','fat', 'protein','sodium','sugar','date']

In [205]:
# Keep only the first whitespace-separated token of the carbs and protein
# values (e.g. "202 42" -> "202"), the same way the fat column is handled.
# Slicing off the last two characters assumed a two-character suffix: it
# left a trailing space behind and would keep part of a longer suffix.
df_raw['carbs'] = df_raw['carbs'].astype(str).str.split().str.get(0)
df_raw['protein'] = df_raw['protein'].astype(str).str.split().str.get(0)

In [206]:
# Keep only the part of each fat value before the first space, dropping the
# extra number that follows it.
fat_parts = df_raw['fat'].str.split(' ')
df_raw['fat'] = fat_parts.str[0]

In [215]:
# Cast the sugar, sodium and calories columns to int.
df_raw = df_raw.astype({column: int for column in ("sugar", "sodium", "calories")})

In [216]:
# Preview the first rows after cleaning.
df_raw.head()

Unnamed: 0,date,calories,carbs,fat,protein,sodium,sugar
0,2020-11-14,1921,202,65,133,1422,53
1,2020-11-15,1944,239,48,149,755,94
2,2020-11-16,1960,253,41,146,886,63
3,2020-11-17,1837,221,41,147,2486,50
4,2020-11-18,2038,251,50,157,1071,57


In [217]:
# Move the date column to the front, keeping the other columns in order.
dateDf = df_raw[['date']]
df_raw = pd.concat([dateDf, df_raw.drop(columns='date')], axis=1)

In [218]:
# Write the cleaned data to csv.
# df_cleaned is a second name for the same DataFrame object, not a copy.
df_cleaned = df_raw

df_cleaned.to_csv(r'macros_clean.csv', index = False)

### Final data frame

In [219]:
# Display the cleaned DataFrame.
df_cleaned


Unnamed: 0,date,calories,carbs,fat,protein,sodium,sugar
0,2020-11-14,1921,202,65,133,1422,53
1,2020-11-15,1944,239,48,149,755,94
2,2020-11-16,1960,253,41,146,886,63
3,2020-11-17,1837,221,41,147,2486,50
4,2020-11-18,2038,251,50,157,1071,57
5,2020-11-19,2241,251,65,157,1912,34
6,2020-11-20,1968,233,54,144,1161,39
7,2020-11-21,1979,229,51,152,1005,82
8,2020-11-22,2078,223,61,148,1777,49
10,2020-11-24,2160,259,65,146,751,105
