In [13]:
pip install lxml



`Extracting the infomation`

In [70]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_table_data(table):
    rows = table.find_all("tr")
    data = []

    for row in rows:
        cells = row.find_all(["th", "td"])
        row_data = [cell.get_text(strip=True) for cell in cells]
        if row_data:
            data.append(row_data)

    return data

url = "https://en.wikipedia.org/wiki/Table_of_food_nutrients"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

tables = soup.find_all("table", class_="wikitable")

all_data = []

for table in tables:
    table_data = get_table_data(table)
    all_data.extend(table_data)

# Determine the maximum number of columns
max_columns = max(len(row) for row in all_data)

# Fill missing values with empty strings
all_data = [row + [''] * (max_columns - len(row)) for row in all_data]

# Create a DataFrame
columns = ["Food", "Measure", "Grams", "Calories", "Protein", "Carb", "Fiber", "Fat", "Sat_fat"]
df = pd.DataFrame(all_data, columns=columns)

# Print the DataFrame
print(df)


                           Food  Measure  Grams  Calories  Protein  Carb  \
0                Dairy products                                            
1                          Food  Measure  Grams  Calories  Protein  Carb   
2             Cows' milk, whole    1 qt.    976       660       32    48   
3                          skim    1 qt.    984       360       36    52   
4          Buttermilk, cultured    1 cup    246       127        9    13   
..                          ...      ...    ...       ...      ...   ...   
414         Fruit-flavored soda   12 oz.    346       161        0    42   
415                  Ginger ale   12 oz.    346       105        0    28   
416                   Root beer   12 oz.    346       140        0    35   
417  Coffee, black, unsweetened    1 cup    230         3        t     1   
418     Tea, clear, unsweetened    1 cup    230         4        0     1   

     Fiber  Fat  Sat_fat  
0                         
1    Fiber  Fat  Sat_fat  
2     

In [45]:
df['Food'].unique()

array(['Dairy products', 'Food', "Cows' milk, whole", 'skim',
       'Buttermilk, cultured', 'Evaporated, undiluted', 'Fortified milk',
       'Powdered milk, whole', 'skim, instant', 'skim, non-instant',
       "Goats' milk, fresh", 'Malted milk', '(1/2 cup ice cream)',
       'Cocoa', 'Yogurt, of partially', 'skim. milk', 'Milk pudding',
       '(cornstarch)', 'Custard, baked', 'Ice cream, commercial',
       'Ice milk, commercial', 'Cream, light,', 'or half-and-half',
       'Cream, heavy,', 'or whipping', 'Cheese, cottage, creamed',
       'uncreamed', 'Cheddar, orAmerican', 'Cheddar, grated cup',
       'Cream cheese', 'Processed cheese', 'Roquefort type', 'Swiss',
       'Eggs, boiled, poached,', 'or raw', 'Scrambled,omelet,',
       'or fried', 'Yolksonly', 'Oils, fats and shortenings', 'Butter',
       'or', 'Hydrogenated cooking fat', 'Lard',
       'Margarine, 1/2 pound or', 'Margarine, 2 pat or', 'Mayonnaise',
       'Oils', 'Corn,soy,peanut', 'orcottonseed', 'Olive',
      

In [72]:
df

Unnamed: 0,Food,Measure,Grams,Calories,Protein,Carb,Fiber,Fat,Sat_fat
0,Dairy products,,,,,,,,
1,Food,Measure,Grams,Calories,Protein,Carb,Fiber,Fat,Sat_fat
2,"Cows' milk, whole",1 qt.,976,660,32,48,0,40,36
3,skim,1 qt.,984,360,36,52,0,t,t
4,"Buttermilk, cultured",1 cup,246,127,9,13,0,5,4
...,...,...,...,...,...,...,...,...,...
414,Fruit-flavored soda,12 oz.,346,161,0,42,0,0,0
415,Ginger ale,12 oz.,346,105,0,28,0,0,0
416,Root beer,12 oz.,346,140,0,35,0,0,0
417,"Coffee, black, unsweetened",1 cup,230,3,t,1,0,0,0


`Giving the product_type to each product that they belongs to.`

In [73]:
df.loc[df.index[range(2,37)], 'Product Type'] = "Dairy"

In [74]:
df.loc[df.index[range(40,57)], 'Product Type'] = "Oils, fats and shortenings"

In [75]:
df.loc[df.index[range(60,92)], 'Product Type'] = "Meat and poultry"

In [76]:
df.loc[df.index[range(95,116)], 'Product Type'] = "Fish and seafood"

In [77]:
df.loc[df.index[range(119,199)], 'Product Type'] = "Vegetables"

In [78]:
df.loc[df.index[range(202,273)], 'Product Type'] = "Fruits"

In [79]:
df.loc[df.index[range(276,332)], 'Product Type'] = "Breads, cereals, and grains"

In [80]:
df.loc[df.index[range(335,344)], 'Product Type'] = "Soups: canned and diluted"

In [81]:
df.loc[df.index[range(347,387)], 'Product Type'] = "Desserts and sweets"

In [82]:
df.loc[df.index[range(390,402)], 'Product Type'] = "Nuts and seeds"

In [83]:
df.loc[df.index[range(405,418)], 'Product Type'] = "Beverages"

In [84]:
df

Unnamed: 0,Food,Measure,Grams,Calories,Protein,Carb,Fiber,Fat,Sat_fat,Product Type
0,Dairy products,,,,,,,,,
1,Food,Measure,Grams,Calories,Protein,Carb,Fiber,Fat,Sat_fat,
2,"Cows' milk, whole",1 qt.,976,660,32,48,0,40,36,Dairy
3,skim,1 qt.,984,360,36,52,0,t,t,Dairy
4,"Buttermilk, cultured",1 cup,246,127,9,13,0,5,4,Dairy
...,...,...,...,...,...,...,...,...,...,...
414,Fruit-flavored soda,12 oz.,346,161,0,42,0,0,0,Beverages
415,Ginger ale,12 oz.,346,105,0,28,0,0,0,Beverages
416,Root beer,12 oz.,346,140,0,35,0,0,0,Beverages
417,"Coffee, black, unsweetened",1 cup,230,3,t,1,0,0,0,Beverages


`Removing uneccessary data`

`by just dropping the value which having Nan values`

In [87]:
df.dropna(inplace=True)

`After doing the above step we get our index in a non continuos form where which the value get's deleted`

In [89]:
df.reset_index(drop=True, inplace=True)

In [90]:
df

Unnamed: 0,Food,Measure,Grams,Calories,Protein,Carb,Fiber,Fat,Sat_fat,Product Type
0,"Cows' milk, whole",1 qt.,976,660,32,48,0,40,36,Dairy
1,skim,1 qt.,984,360,36,52,0,t,t,Dairy
2,"Buttermilk, cultured",1 cup,246,127,9,13,0,5,4,Dairy
3,"Evaporated, undiluted",1 cup,252,345,16,24,0,20,18,Dairy
4,Fortified milk,6 cups,1419,1373,89,119,1.4,42,23,Dairy
...,...,...,...,...,...,...,...,...,...,...
381,"Cola drinks,sweetened",12 oz.,346,137,0,38,0,0,0,Beverages
382,Fruit-flavored soda,12 oz.,346,161,0,42,0,0,0,Beverages
383,Ginger ale,12 oz.,346,105,0,28,0,0,0,Beverages
384,Root beer,12 oz.,346,140,0,35,0,0,0,Beverages


`Converting it into DataFrame`

In [91]:
df.to_csv('Food_Nutrients.csv')

`<Requests 200> means our request is successfully granted.`