## 1. Importing libraries :

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import re
import io
from PIL import Image
h = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36', "Upgrade-Insecure-Requests": "1","DNT": "1","Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8","Accept-Language": "en-US,en;q=0.5","Accept-Encoding": "gzip, deflate"}

## 2. Reading the .csv file :

In [3]:
df=pd.read_csv("WW_champagne.csv")

In [4]:
df.head()

Unnamed: 0,Product Title,Final Price,Reduction?,Price Without Reduction,URL Product,URL Image,Stock Status
0,Champagne Henriot Blanc de Blancs Brut NV (Wit...,598,no,,https://www.watsonswine.com/en/wine/champagne-...,https://www.watsonswine.com/medias/sys_master/...,In Stock
1,Champagne Palmer & Co Blanc de Blancs Brut NV-...,438,yes,548.0,https://www.watsonswine.com/en/wine/champagne-...,https://www.watsonswine.com/medias/sys_master/...,In Stock
2,Le Brun de Neuville Extra Brut Blanc de Blancs...,480,no,,https://www.watsonswine.com/en/wine/le-brun-de...,https://www.watsonswine.com/medias/sys_master/...,In Stock
3,Le Brun de Neuville Extra Brut Champagne Le Ch...,490,no,,https://www.watsonswine.com/en/wine/le-brun-de...,https://www.watsonswine.com/medias/sys_master/...,In Stock
4,Champagne Henriot Brut Souverain NV-BP_374019,450,no,,https://www.watsonswine.com/en/wine/champagne-...,https://www.watsonswine.com/medias/sys_master/...,In Stock


In [5]:
URLimage=df["URL Image"].tolist()
URLproduct=df["URL Product"].tolist()
productTitle=df["Product Title"].tolist()

## 3. Creating a function to get the type of product (glass or bottle?) :

In [6]:
def findProductType(url,productTitle,idx,h):
    r = requests.get(url,headers=h)
    soup = bs(r.text, 'lxml')
    
    aboutscrap = soup.find("div", {"class": "about-title"})
    if aboutscrap != None:
        aboutscrap=aboutscrap.text.lower().replace("about the ","").replace("\n","").replace("\t","")
    elif aboutscrap == None:
        aboutscrap=soup.find("div", {"class": "accessory-title"})
        if aboutscrap != None:
            aboutscrap=aboutscrap.text.lower().replace("about the ","").replace("\n","").replace("\t","")
        else :
            aboutscrap="?"
    else :
        aboutscrap="?"
    if "glass" in productTitle[idx].lower():
        isglass="glass"
    else :
        isglass="notglass"
    return [aboutscrap, isglass]

In [7]:
def ProductDescription(restype):
    description=""
    if restype[0]=="wine":
        description+="bottle of wine"
        if restype[1]=='glass':
            description+=" with glass"
    elif restype[0]=="accessory":
        if restype[1]=='glass':
            description+="glass"
        else :
            description+="glass?"
    elif restype[0]=="pack":
        description+="pack of wine"
        if restype[1]=='glass':
            description+=" with glass"
    return description

In [8]:
def getProductsDescription(URLproduct,productTitle,h):
    description_list=[]
    idxmax=len(URLproduct)
    for idx,url in enumerate(URLproduct):
        type_item=findProductType(url,productTitle,idx,h)
        description_list.append(ProductDescription(type_item))
        print(f"{idx+1} of {idxmax}",end="\r")
    return description_list

## 4. Creating a function to get the Height and width of the picture (in pixels) :

In [9]:
def findImgSize(url,h):
    r = requests.get(url,headers=h)
    bytes_im = io.BytesIO(r.content)
    cv_im = Image.open(bytes_im)
    height, width=cv_im.size

    return height, width

In [10]:
def getImagesSizes(URLimage,h):
    height_list=[]
    width_list=[]
    idxmax=len(URLimage)
    for idx,url in enumerate(URLimage):
        height,width = findImgSize(url,h)
        height_list.append(height)
        width_list.append(width)
        print(f"{idx+1} of {idxmax}",end="\r")
    return height_list,width_list

## 5. Creating a function to get the vintage of the champagne (year of production) :

In [11]:
def findYearInString(string, sp="-BP"):
    if sp != None :
        string=string.split(sp)[0]
    if string == None :
        return ""
    return re.match(r'.*([1-2][0-9]{3})', string)

In [12]:
def findStringInContent(url,h):
    r = requests.get(url,headers=h)
    soup = bs(r.text, 'lxml')
    
    aboutcontent = soup.find("div", {"class": "about-content"})
    
    if aboutcontent != None :
        aboutcontent = aboutcontent.text.replace("\n","").replace("\t","")
    
    return aboutcontent

In [13]:
def getProductYear(productTitle, URLproduct, h):
    year_list=[]
    idxmax=len(productTitle)
    for idx,t in enumerate(productTitle):
        year=findYearInString(t)
        if year is not None:
            year=year.group(1)
        else :
            url=URLproduct[idx]
            year = findStringInContent(url,h)
            if year != None :
                year = findYearInString(year,None)
                if year == None :
                    year = ""
                else :
                    year = "*"+year.group(1)+"*"
            else :
                year = ""
        year_list.append(year)
        print(f"{idx+1} of {idxmax}",end="\r")
    return year_list

## 6. Extracting the required informations :

In [14]:
year=getProductYear(productTitle, URLproduct, h)

89 of 89

In [15]:
height,width=getImagesSizes(URLimage,h)

89 of 89

In [16]:
description=getProductsDescription(URLproduct,productTitle,h)

89 of 89

## 7. Inserting them on the dataset

In [17]:
df['Vintage of the Product'] = year
df['Image Height (pixels)'] = height
df['Image Width (pixels)'] = width
df['Type of Product'] = description

In [18]:
df.head()

Unnamed: 0,Product Title,Final Price,Reduction?,Price Without Reduction,URL Product,URL Image,Stock Status,Vintage of the Product,Image Height (pixels),Image Width (pixels),Type of Product
0,Champagne Henriot Blanc de Blancs Brut NV (Wit...,598,no,,https://www.watsonswine.com/en/wine/champagne-...,https://www.watsonswine.com/medias/sys_master/...,In Stock,,417,600,bottle of wine with glass
1,Champagne Palmer & Co Blanc de Blancs Brut NV-...,438,yes,548.0,https://www.watsonswine.com/en/wine/champagne-...,https://www.watsonswine.com/medias/sys_master/...,In Stock,,340,488,bottle of wine
2,Le Brun de Neuville Extra Brut Blanc de Blancs...,480,no,,https://www.watsonswine.com/en/wine/le-brun-de...,https://www.watsonswine.com/medias/sys_master/...,In Stock,,417,600,bottle of wine
3,Le Brun de Neuville Extra Brut Champagne Le Ch...,490,no,,https://www.watsonswine.com/en/wine/le-brun-de...,https://www.watsonswine.com/medias/sys_master/...,In Stock,,417,600,bottle of wine
4,Champagne Henriot Brut Souverain NV-BP_374019,450,no,,https://www.watsonswine.com/en/wine/champagne-...,https://www.watsonswine.com/medias/sys_master/...,In Stock,,417,600,bottle of wine


In [25]:
df.to_csv('WW_champagne_plus.csv', index=False)

## 8. Data frame with hyperlinks :

Observation: Only the urls are hyperlinks.

In [28]:
def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val,val)

df.style.format(make_clickable)

Unnamed: 0,Product Title,Final Price,Reduction?,Price Without Reduction,URL Product,URL Image,Stock Status,Vintage of the Product,Image Height (pixels),Image Width (pixels),Type of Product
0,Champagne Henriot Blanc de Blancs Brut NV (With 2 Champagne Glasses) -BP_476241,598,no,,https://www.watsonswine.com/en/wine/champagne-henriot-blanc-de-blancs-brut-nv-with-2-champagne-glasses-/p/BP_476241,https://www.watsonswine.com/medias/sys_master/front/prd/9020718350366.png,In Stock,,417,600,bottle of wine with glass
1,Champagne Palmer & Co Blanc de Blancs Brut NV-BP_450340,438,yes,548.0,https://www.watsonswine.com/en/wine/champagne-palmer-co-blanc-de-blancs-brut-nv/p/BP_450340,https://www.watsonswine.com/medias/sys_master/front/prd/8918619783198.png,In Stock,,340,488,bottle of wine
2,Le Brun de Neuville Extra Brut Blanc de Blancs Champagne NV-BP_333228,480,no,,https://www.watsonswine.com/en/wine/le-brun-de-neuville-extra-brut-blanc-de-blancs-champagne-nv/p/BP_333228,https://www.watsonswine.com/medias/sys_master/front/prd/8994189901854.png,In Stock,,417,600,bottle of wine
3,Le Brun de Neuville Extra Brut Champagne Le Chemin Empreinte Blanc de Blancs NV-BP_415277,490,no,,https://www.watsonswine.com/en/wine/le-brun-de-neuville-extra-brut-champagne-le-chemin-empreinte-blanc-de-blancs-nv/p/BP_415277,https://www.watsonswine.com/medias/sys_master/front/prd/8995299950622.png,In Stock,,417,600,bottle of wine
4,Champagne Henriot Brut Souverain NV-BP_374019,450,no,,https://www.watsonswine.com/en/wine/champagne-henriot-brut-souverain-nv/p/BP_374019,https://www.watsonswine.com/medias/sys_master/front/prd/9026003599390.png,In Stock,,417,600,bottle of wine
5,Champagne Palmer & Co Brut Reserve NV-BP_384090,398,no,,https://www.watsonswine.com/en/wine/champagne-palmer-co-brut-reserve-nv/p/BP_384090,https://www.watsonswine.com/medias/sys_master/front/prd/8995341074462.png,Out of Stock,,417,600,bottle of wine
6,Champagne Henriot Blanc de Blancs Brut NV-BP_374020,558,no,,https://www.watsonswine.com/en/wine/champagne-henriot-blanc-de-blancs-brut-nv/p/BP_374020,https://www.watsonswine.com/medias/sys_master/front/prd/9016903761950.png,In Stock,,417,600,bottle of wine
7,Maison De Grand Marquis De La Mysteriale Champagne Brut NV-BP_480309,420,no,,https://www.watsonswine.com/en/wine/maison-de-grand-marquis-de-la-mysteriale-champagne-brut-nv/p/BP_480309,https://www.watsonswine.com/medias/sys_master/front/prd/8842541858846.jpg,In Stock,,340,488,bottle of wine
8,Le Brun de Neuville Brut Champagne Cuvee Authentique Assemblage NV-BP_333229,420,yes,558.0,https://www.watsonswine.com/en/wine/le-brun-de-neuville-brut-champagne-cuvee-authentique-assemblage-nv/p/BP_333229,https://www.watsonswine.com/medias/sys_master/front/prd/8994304851998.png,In Stock,,417,600,bottle of wine
9,Sophienwald Champagne Glass (With Gift Box)-BP_415041,495,no,,https://www.watsonswine.com/en/wine/sophienwald-champagne-glass-with-gift-box-/p/BP_415041,https://www.watsonswine.com/medias/sys_master/front/prd/8962365030430.png,In Stock,,340,488,glass
