## Scraping and Collecting Data

In [1]:
!pip -q install transformers

[K     |████████████████████████████████| 1.3MB 5.6MB/s 
[K     |████████████████████████████████| 890kB 19.0MB/s 
[K     |████████████████████████████████| 1.1MB 28.5MB/s 
[K     |████████████████████████████████| 2.9MB 31.4MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


### Import Libraries

In [3]:
import numpy as np
import pandas as pd 

from bs4 import BeautifulSoup
import requests
import csv
import json 


from transformers import MarianTokenizer, MarianMTModel
from typing import List

### Creation of a translator, to translate the data that are in French

In [4]:
model_name = f'Helsinki-NLP/opus-mt-{"fr"}-{"en"}'
model = MarianMTModel.from_pretrained(model_name)               #Loads the MarianMT model
tokenizer = MarianTokenizer.from_pretrained(model_name)

def translate_to_english(text): #Translation function
  if (not text):
    return [text]
  batch = tokenizer.prepare_seq2seq_batch(src_texts=[text])  #Creates word batches for the sequential passing to the model
  generate_model = model.generate(**batch)  
  translation: List[str] = tokenizer.batch_decode(generate_model, skip_special_tokens=True)  #Translation
  return translation

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1133.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=300827685.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=802397.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=778395.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1339166.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=42.0, style=ProgressStyle(description_w…




### Creation of Functions that take a URL of the site of a startup and return a small Description line

In [None]:
def get_Description(url,Translate):
    print(url)

    """
    Return the a small Description of the Startup, which we can find by extracting meta description from urls
    """
    #Send The Request
    try:
          try:
              page = requests.get(url) 
          except ConnectionError:
              page = requests.get(url.replace("https","http"))
      
          #Parse html code
          soup = BeautifulSoup(page.content, 'html.parser')  
          
          
          texts = soup.find("meta", property="og:description")  
          if (texts):      
            text_from_html=texts['content']

          else :
            metas = soup.find_all('meta') 
            meta= ([ meta.attrs['content'] for meta in metas if 'name' in meta.attrs and meta.attrs['name'] == 'description' ])
            if(len(meta)==0):
              return None
            text_from_html=meta[0]

          if(Translate==True):
              text_from_html=translate_to_english(text_from_html)[0] 
              
          return text_from_html  

    except Exception as e: 
          print("Unexpected error:", e)
          return None  


### Collect Data from lespepitestech.com site

#### Cookies and headers preparation

In [None]:
#cookies and headers that the request must contain when sent to the https server
cookies="__cfduid=d831ddcf3d9060346393db0cdfb62618f1602330544; _ga=GA1.2.826854543.1602330545; _fbp=fb.1.1602330545793.1970562287; hubspotutk=c6c0dad3f644023f04005954627b39ff; __smToken=EUHd4DvXWaAdOpYcYFFVjS4F; __smSmartbarShown=Sat%20Oct%2010%202020%2013:49:08%20GMT+0200%20(Central%20European%20Summer%20Time); cookie-agreed=2; __smVID=407637e5549f4681a9305b7ba344acbdd0397db4f22227020022ee44c7b9d66b; SSESSbfe8f364900a8edd3b0626d58c5cb830=tYh81i3x9sFIRbHb9Bgj_GoyCQ5Bvdb4bmf_49qajWA; has_js=1; _gid=GA1.2.163445558.1602615408; __hstc=138838316.c6c0dad3f644023f04005954627b39ff.1602330546137.1602522364578.1602615412109.3; __hssrc=1; __hssc=138838316.6.1602615412109; _gat_UA-58720480-1=1; _gat=1; _dc_gtm_UA-58720480-1=1"
headers="User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36*Content-Type: application/x-www-form-urlencoded; charset=UTF-8*Origin: https://lespepitestech.com*Sec-Fetch-Site: same-origin*Sec-Fetch-Mode: cors*Sec-Fetch-Dest: empty*Referer: https://lespepitestech.com/startup-collection/blockchain*Accept-Encoding: gzip, deflate*Accept-Language: en-GB,en-US;q=0.9,en;q=0.8,fr;q=0.7"

#pre-processing of cookies and headers in a dictionnary form 
cookies="{\""+cookies.replace(";","\",\"").replace("=","\":\"") +"\"}"
cookies = json.loads(cookies)

headers="{\""+headers.replace("*","\",\"").replace(": ","\":\"") +"\"}"
headers = json.loads(headers)

#### Get the url from the page on a one startup

In [None]:
def get_url(soup):
    s=soup.select_one('div.topWebsite a')
    return str(s['href'])

def get_keywords(soup):
    Domaine=[]
    for a in soup.select('div.lpt-dropdown-category a'):
      if(len(a.contents)!=0):
        Domaine.append(str(a.contents[0]))

    for a in soup.select('ul.lpt-dropdown-all-categories.dropdown-menu a'):
      if(len(a.contents)!=0):
        Domaine.append(str(a.contents[0]))

    return "/".join(Domaine)

def get_Description(soup):
    
    p=soup.select_one('h3.topDescr')
    return p.contents[0][1:]

#### Get url of startups from a specific page

In [None]:
def get_startups(urls):
  
    i=0
    k=0

    while(True):
        print(i,urls)
        url = requests.post(urls+"?page="+str(i),headers=headers,cookies=cookies)
        soup = BeautifulSoup(url.content, 'html.parser')
        links=soup.select('a.startup-entry-hitbox')
        if(links):
          for link in links:
                url="https://lespepitestech.com"+link['href']
                url = requests.post(url, headers=headers,cookies=cookies)
                soup = BeautifulSoup(url.content, 'html.parser')
                if (not soup.select_one('div.topWebsite a')):
                      print("Type error, Try again")
                      url = requests.post("https://lespepitestech.com"+link['href'], headers=headers,cookies=cookies)
                      soup = BeautifulSoup(url.content, 'html.parser')
                      if (not soup.select_one('div.topWebsite a')):
                        continue


                startups.append(link['href'].split("/")[-1])
                startups_urls.append(get_url(soup).split("?")[0])
                startups_keywords.append(get_keywords(soup))
                startups_descriptions.append(get_Description(soup))
                
                print(k,link['href'].split("/")[-1])
                k+=1
        else:
            break;

        i+=1



In [None]:
urls="https://lespepitestech.com/les-ecosystemes-de-la-french-tech"
url = requests.post(urls,headers=headers,cookies=cookies)
soup = BeautifulSoup(url.content, 'html.parser')

startups=[]
startups_urls=[]
startups_keywords=[]
startups_descriptions=[]
k=0;
links=soup.select('div.card a')
links=[links[i] for i in range(len(links)) if (i%2)==0]
for link in links[68:]:
      get_startups("https://lespepitestech.com"+link['href'])
      dataframe = pd.DataFrame(columns=['Name','urlwebsite']) 
      dataframe["Name"]=startups
      dataframe["urlwebsite"]=startups_urls
      dataframe["keywords"]=startups_keywords
      dataframe["Description"]=startups_descriptions

      dataframe.to_csv("data.csv")
dataframe

0 https://lespepitestech.com/french-tech-hub/french-tech-boston
0 pentalog 
1 jack-and-ferdi 
2 qubiq 
3 mapwize 
4 matchupbox 
5 openfield 
6 talentoday 
1 https://lespepitestech.com/french-tech-hub/french-tech-boston
0 https://lespepitestech.com/french-tech-hub/french-tech-nordics
0 moneezy 
1 matchbanker 
2 ecotree 
3 karma 
Type error, Try again
4 limouzik 
5 duo 
1 https://lespepitestech.com/french-tech-hub/french-tech-nordics
0 https://lespepitestech.com/french-tech-hub/french-tech-saint-etienne
0 mobishop 
1 bestofone 
2 everything 
3 meal-canteen 
4 neotess 
5 opencrea 
1 https://lespepitestech.com/french-tech-hub/french-tech-saint-etienne
0 https://lespepitestech.com/french-tech-hub/french-tech-philippines
0 heycaptain 
1 youday 
2 waza-education 
3 activity-lab 
4 talkpush 
1 https://lespepitestech.com/french-tech-hub/french-tech-philippines
0 https://lespepitestech.com/french-tech-hub/french-tech-tunis
0 tira-robots-sa 
1 pressing-en-ligne-tn 
2 sesamm 
3 orrlando 
4 avempac

Unnamed: 0,Name,urlwebsite,keywords,Description
0,pentalog,http://www.pentalog.com,,une équipe agile pour vos besoins en développe...
1,jack-and-ferdi,http://www.jackandferdi.com,Voyage,Redonnez du sens à vos voyages d'affaire avec ...
2,qubiq,http://www.qubiqaudio.com/,enregistrement / logiciel / Musique,Plateforme de creation de plugins audio.
3,mapwize,http://www.mapwize.io,carte,The indoor mapping platform
4,matchupbox,http://matchupbox.com/,CRM / FinTech / identités numériques / logiciel,"MatchUpBox développe Pikcio, une application c..."
...,...,...,...,...
102,sesamm,https://www.sesamm.com/,API / big data / Blockchain / finance / FinTe...,Big Data et Intelligence Artificielle pour l'i...
103,les-pepites-tech,http://www.lespepitestech.com,media,La communauté des entrepreneurs français dans ...
104,les-pepites-tech,http://www.lespepitestech.com,media,La communauté des entrepreneurs français dans ...
105,les-pepites-tech,http://www.lespepitestech.com,media,La communauté des entrepreneurs français dans ...


In [None]:
dataframe = pd.DataFrame(columns=['Name','urlwebsite']) 
dataframe["Name"]=startups[:len(startups_urls)]
dataframe["urlwebsite"]=startups_urls
dataframe.to_csv("data.csv")
dataframe.head(10)

Unnamed: 0,Name,urlwebsite
0,pandaloc,https://www.pandaloc.com
1,1food1me,https://1food1me.com
2,appines,https://www.appines.fr/
3,interstis,https://interstis.fr/
4,kshuttle,https://www.kshuttle.io/fr/
5,ticketin,http://www.ticketin.fr
6,citizenwave,https://www.citizenwave.com/
7,darlow,https://darlowparis.com
8,deepsight,https://www.deepsight.io/
9,teambrain,https://teambrain.fr/


In [None]:
import pandas as pd
dataframe=pd.read_csv("data_version_1.csv")
dataframe["Description"]=dataframe.apply(lambda row: get_Description(row["urlwebsite"],False),axis=1)
dataframe.to_csv("data.csv")
dataframe.head(10)

https://1food1me.com
https://www.appines.fr/
https://interstis.fr/
https://www.kshuttle.io/fr/
http://www.ticketin.fr
https://www.citizenwave.com/
https://darlowparis.com
https://www.deepsight.io/
https://teambrain.fr/
https://www.luxurynsight.com/
https://www.captainachat.com
https://trapeze-des-mascareignes.site/
http://www.meet-bs.com
https://www.serialtesteur.fr/
https://www.feelgoodvinyl.com
https://melotick.com
http://examin.eu
https://www.amabilis.fr/
https://crowdsec.net/
https://lici.fr/
http://datasoluce.com/
https://tomojo.co/
https://keeze.co/
https://www.koyeb.com/
https://spectraltms.com/
https://www.auto-ecole.net/
https://www.campings.com/fr/
http://www.avekapeti.com
http://www.skiptax.com
https://xval.fr/
https://digitalessence.fr/
https://deftsoftware.com/
https://spacesense.ai/
http://merca.team/
https://www.kurmi-software.com/fr/
https://www.sekoia.fr/
https://join-implement.com/
https://www.ekestrian.com/
Unexpected error: HTTPSConnectionPool(host='www.ekestrian.co

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://anakine.io/
https://hydrocool.fr
https://xlearn.fr/
https://audi-on.com/
https://www.tripitto.com/
http://www.livejourney.com
https://www.monlouetmoi.fr/
https://looztick.fr/
https://grannycharly.fr/
https://www.olybe.com/
https://www.panda-guide.fr/
https://www.lizee.co/
https://www.greenback.green/
https://www.slatch.io/
http://honing-biosciences.com/
https://www.mces.gg/
https://www.plume-app.co/
https://www.adyouneed.com/
https://ponicode.com/
https://www.luko.eu/fr/
https://neosilver.fr/
https://linktr.ee/le_cryptopolitain
http://www.adenlab.com
https://www.kionect.com
https://sommelierduparfum.com
http://on-board.co
https://www.flasheat.fr
https://greenvillage.io/
http://timees-coworking.fr
https://www.darewise.com/
http://www.ovinia.fr
http://www.james-bang.com
http://www.officeriders.com
http://www.mylfruision.com
https://www.twicpics.com/
https://a-renover.com
Unexpected error: HTTPSConnectionPool(host='a-renover.com', port=443): Max retries exceeded with url: / (Cause

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://www.feedsmartfood.com/fr/
https://www.sapiendo-retraite.fr/
https://weem.fr/
https://www.reelevant.com/
http://nunki.co/
https://qonto.eu/fr
https://www.perouse.paris/
https://www.watiz.io/
https://www.transparencyrights.com/
Unexpected error: HTTPSConnectionPool(host='www.transparencyrights.com', port=443): Max retries exceeded with url: / (Caused by SSLError(CertificateError("hostname 'www.transparencyrights.com' doesn't match either of 'cluster011.hosting.ovh.net', 'www.cluster011.hosting.ovh.net'",),))
https://www.mformoney.fr/
http://morphosense.com/
https://www.lok-iz.com/fr
Unexpected error: HTTPSConnectionPool(host='www.lok-iz.com', port=443): Max retries exceeded with url: /fr (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7faa1ac3cac8>: Failed to establish a new connection: [Errno 110] Connection timed out',))
https://atexio.fr/cybersecurite/
Unexpected error: HTTPSConnectionPool(host='atexio.fr', port=443): Max retries exceeded

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


http://www.lelapafund.com
Unexpected error: HTTPSConnectionPool(host='www.lelapafund.com', port=443): Max retries exceeded with url: / (Caused by SSLError(CertificateError("hostname 'www.lelapafund.com' doesn't match either of '*.netlify.com', 'netlify.com'",),))
http://www.coloroptical.com/
Unexpected error: HTTPConnectionPool(host='www.coloroptical.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7faa192d15f8>: Failed to establish a new connection: [Errno 110] Connection timed out',))
http://www.fiftyfor.com
http://www.donasafe.com/
http://www.semly.io
Unexpected error: HTTPConnectionPool(host='www.semly.io', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7faa1a9dd048>: Failed to establish a new connection: [Errno -2] Name or service not known',))
http://www.limber.io/
https://www.milesbooster.fr
http://www.firebnb.com
http://www.euri

  ' Beautiful Soup.' % markup)


https://younited-credit.com
http://www.pepper-site.com/
http://startuponly.com/
http://www.trankility.fr
http://pi-tracker.net
Unexpected error: HTTPConnectionPool(host='pi-tracker.net', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7faa1ae24400>: Failed to establish a new connection: [Errno -2] Name or service not known',))
https://lick.fr
http://www.umalis.fr
http://bit.ly/RoomsByHivy
Unexpected error: HTTPSConnectionPool(host='edenworkplace.com', port=443): Max retries exceeded with url: /?utm_source=lespepitestech&utm_medium=submit&utm_campaign=betarooms (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7faa18d40cf8>: Failed to establish a new connection: [Errno 110] Connection timed out',))
http://www.hellotrip.fr/
Unexpected error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))
http://captaincrowd.co
Unexpected error: HTTPC

Unnamed: 0.1,Unnamed: 0,Name,urlwebsite,Description
0,0,1food1me,https://1food1me.com,Votre corps sait ce dont il a besoin. Apprenez...
1,1,appines,https://www.appines.fr/,Télécharge la 1ère app de recrutement de rempl...
2,2,interstis,https://interstis.fr/,Formulaire de recrutement d'interstis
3,3,kshuttle,https://www.kshuttle.io/fr/,kShuttle.io offre une vision unifiée de l'effi...
4,4,ticketin,http://www.ticketin.fr,
5,5,citizenwave,https://www.citizenwave.com/,"Découvrez Citizenwave, la Plateforme analytiqu..."
6,6,darlow,https://darlowparis.com,"Agence fondée par une artiste, votre site sera..."
7,7,deepsight,https://www.deepsight.io/,
8,8,teambrain,https://teambrain.fr/,TeamBrain c'est automatiser les réponses ✔️ sa...
9,9,luxurynsight,https://www.luxurynsight.com/,Combining the best of data science and luxury ...


In [None]:
dataframe.loc

Unnamed: 0.1,Unnamed: 0,Name,urlwebsite,Description
0,0,1food1me,https://1food1me.com,Votre corps sait ce dont il a besoin. Apprenez...
1,1,appines,https://www.appines.fr/,Télécharge la 1ère app de recrutement de rempl...
2,2,interstis,https://interstis.fr/,Formulaire de recrutement d'interstis
3,3,kshuttle,https://www.kshuttle.io/fr/,kShuttle.io offre une vision unifiée de l'effi...
4,4,ticketin,http://www.ticketin.fr,
...,...,...,...,...
3882,3885,zenchef,http://zenchef.com/fr/,"Cahier de réservation digital pour restaurant,..."
3883,3886,la-pause-cinema,http://lapausecinema.co/,
3884,3887,glowee,http://www.glowee.fr/,Glowee est un système de lumière biologique pe...
3885,3888,sparingvision,https://sparingvision.com/en/,SparingVision is a biotechnology company focus...


In [19]:
import pandas as pd
df=pd.read_csv("Data_Startups.csv")
df[df.keywords.isna()]

Unnamed: 0,Name,urlwebsite,keywords,Description_French,Description
4,mylabel,https://www.mylabel.io/,,L'appli qui vous permet de consommer selon vos...,The app allows you to eat according to your va...
10,blueway,https://www.blueway.fr/,,"Editeur de logiciels BPM, MDB, ESB et API inté...","BPM Software publisher, MDB, BSE and integrate..."
15,swwitch,http://www.swwitch.eu,,Swwitch est la première plateforme d'échange d...,Swwitch is the first platform of exchange of h...
30,ohmytools,https://ohmy.tools,,Une liste d'outils pour des entrepreneurs mali...,A list of tools for smart entrepreneurs! To la...
34,ma-petite-gazette,https://mapetitegazette.fr,,La première newsletter collaborative !,The first collaborative newsletter!
...,...,...,...,...,...
8892,woozgo,http://www.woozgo.fr/,,Une nouvelle façon de faire des rencontres ave...,A new way to socialize with thousands of singles
8928,sortirtoday,http://sortirtoday.fr/,,Le site de rencontres amicales qui agite votre...,The friendly dating site that stirs your good ...
8930,vallib,http://www.vallib.com/,,Service de location de valise,suitcase hire
8946,sparingvision,https://sparingvision.com/en/,,Biotech qui développe un médicament contre la ...,Biotech develops a medicament against retiniti...


Unnamed: 0,Name,urlwebsite,keywords,Description_French,Description
0,medeo,https://www.medeo-health.com/,digital / e-santé / objets connectés / santé,Simplifier le quotidien des professionnels de ...,Simplifying the lives of health professionals ...
1,co-cto,https://www.co-cto.fr/,Accompagnement / application / Application eCo...,CO-CTO met au service des startups et PME des ...,CO-CTO at the service experienced startups and...
2,curvway,http://curvway.com,mobilité / SportTech,Conception de planches électriques qui permett...,Design of electrical boards that allow to reca...
3,ecosiag,http://www.ecosiag.com,B2B / Bpifrance / Alimentation saine et durabl...,Développement de fermes urbaines aquaponiques ...,Development of automated aquaponic urban farms
4,mylabel,https://www.mylabel.io/,,L'appli qui vous permet de consommer selon vos...,The app allows you to eat according to your va...
...,...,...,...,...,...
8943,zenchef,http://zenchef.com/fr/,creation de siteweb / / restauration,Le meilleur outil pour gagner et fidéliser de ...,The best tool to win and retain new customers ...
8944,la-pause-cinema,http://lapausecinema.co/,cinema,Du popcorn dans votre boîte mail,Popcorn in your mailbox
8945,glowee,http://www.glowee.fr/,Bio-Eclairage /,Le Bio-Eclairage Vivant,Bio-lighting Living
8946,sparingvision,https://sparingvision.com/en/,,Biotech qui développe un médicament contre la ...,Biotech develops a medicament against retiniti...
