In [1]:
import pandas as pd
import requests
import re
from lxml import html
import datetime

In [2]:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
}   

In [3]:
languages_link = 'https://en.wikipedia.org/wiki/List_of_programming_languages'
wiki_link = 'https://en.wikipedia.org'

In [4]:
links_requests = requests.get('https://en.wikipedia.org/wiki/List_of_programming_languages')
data = html.fromstring(links_requests.content)

In [7]:
l_links_to_scrape = data.xpath('//div[@class="div-col"]//li//a/@href')
l_links_to_scrape


['/wiki/A_Sharp_(.NET)',
 '/wiki/A-0_System',
 '/wiki/A%2B_(programming_language)',
 '/wiki/ABAP',
 '/wiki/ABC_(programming_language)',
 '/wiki/ABC_ALGOL',
 '/wiki/ACC_(programming_language)',
 '/wiki/Accent_(programming_language)',
 '/wiki/Distributed_Application_Specification_Language',
 '/wiki/Action!_(programming_language)',
 '/wiki/ActionScript',
 '/wiki/Actor_(programming_language)',
 '/wiki/Ada_(programming_language)',
 '/wiki/Adenine_(programming_language)',
 '/wiki/AdvPL',
 '/wiki/Agda_(theorem_prover)',
 '/wiki/Agilent_VEE',
 '/wiki/Agora_(programming_language)',
 '/wiki/AIMMS',
 '/wiki/Aldor',
 '/wiki/Alef_(programming_language)',
 '/wiki/Algebraic_Logic_Functional_programming_language',
 '/wiki/ALGOL_58',
 '/wiki/ALGOL_60',
 '/wiki/ALGOL_68',
 '/wiki/ALGOL_W',
 '/wiki/Alice_(programming_language)',
 '/wiki/Alma-0',
 '/wiki/AmbientTalk',
 '/wiki/Amiga_E',
 '/wiki/AMOS_(programming_language)',
 '/wiki/AMPL',
 '/wiki/%D0%90%D0%BD%D0%B0%D0%BB%D0%B8%D1%82%D0%B8%D0%BA',
 '/wiki/A

In [12]:
def wiki_scraper(url, data=data, headers=headers):
    """
    This function extract and clean the data languages from Wikipedia
    """
    # request
    r = requests.get(wiki_link+url, headers=headers)
    data = html.fromstring(r.content)    
    
    language_name = re.sub(' ((.*?)programming language).?', '', data.xpath('//h1//text()')[0])
    
    counter = 1
    length = len(data.xpath('//table[@class="infobox vevent"]//tr'))
    temp = {}

    while counter <= length:
        language_feature = data.xpath('//table[@class="infobox vevent"]//tr[{i}]//text()'.format(i=counter))
        
        # hyperlinks cleaner
        for num in range(20):
            for element in language_feature:  
                if element == f'[{num}]':
                    language_feature.remove(f'[{num}]') 
        
        # \n cleaner
        if language_feature != [] and language_feature[-1] == '\n':
            language_feature.pop(-1)   
        if language_feature != [] and len(language_feature) > 1 and language_feature[1] == '\n':
            language_feature.pop(1)            
        for element in language_feature:
            if element == '\n':
                idx = language_feature.index(element)
                language_feature[idx] = ', '
        
        # data extracters
        if language_feature == []:
            pass
        elif re.search('Designed', language_feature[0]):
            if len(language_feature[1:]) > 1:
                temp['designed_by'] = ''.join(language_feature[1:])
            else:
                temp['designed_by'] = language_feature[1:][0]
            
        elif re.search('Developer', language_feature[0]):
            temp['developer'] = language_feature[1:]
            temp['developer'] = temp['developer'][0]
            
        elif re.search('First.*appeared', language_feature[0]) or re.search('Initial.*release', language_feature[0]):
            temp['initial release'] = language_feature[1:][0]
            if re.search('\d{4}', temp['initial release']):
                temp['initial release'] = re.search('\d{4}', temp['initial release']).group(0)
        
        elif re.search('OS', language_feature[0]):
            if len(language_feature[1:]) > 1:
                temp['OS'] = ''.join(language_feature[1:])
            else:
                temp['OS'] = language_feature[1:]
        
        elif re.search('Platform', language_feature[0]):
            if len(language_feature[1:]) > 1:
                temp['Platform'] = ''.join(language_feature[1:])
            else:
                temp['Platform'] = language_feature[1:][0]
        
        elif re.search('License', language_feature[0]):
            if len(language_feature[1:]) > 1:
                temp['license'] = ''.join(language_feature[1:])
            else:
                temp['license'] = language_feature[1:][0]
        
        elif re.search('Typing.*discipline', language_feature[0]):
            if len(language_feature[1:]) > 1:
                temp['typing_discipline'] = ''.join(language_feature[1:])
            else:
                temp['typing_discipline'] = language_feature[1:][0]
        
        elif re.search('Written.*in', language_feature[0]):
            temp['written_in'] = language_feature[1:]
            temp['written_in'] = ''.join(temp['written_in'])
        
        elif re.search('Paradigm', language_feature[0]):
            temp['paradigm'] = language_feature[1:]
            temp['paradigm'] = ''.join(temp['paradigm'])
        counter += 1

    return language_name, temp


In [13]:
wiki_scraper('/wiki/Ada_(programming_language)')

('Ada',
 {'paradigm': 'Multi-paradigm: structured, imperative, object-oriented, concurrent, array, distributed, generic, procedural, meta',
  'designed_by': 'MIL-STD-1815, Ada 83: Jean Ichbiah, Ada 95: Tucker Taft, Ada 2005: Tucker Taft, Ada 2012: Tucker Taft',
  'initial release': '1980',
  'typing_discipline': 'static, strong, safe, nominative',
  'OS': 'Multi- or cross-platform'})

In [14]:
initial_time = datetime.datetime.now()

language_json = {}
for lk in l_links_to_scrape:
    l, t = wiki_scraper(lk)
    language_json[l] = t

final_time = datetime.datetime.now()
time_used = final_time - initial_time
print('The process has taken ' + str(time_used))

The process has taken 0:07:44.876681


In [15]:
language_json

{'A Sharp (.NET)': {'designed_by': 'Dr. Martin C. Carlisle, Lt Col Ricky Sward, Maj Jeff Humphries',
  'developer': 'AdaCore',
  'initial release': '2004',
  'Platform': 'Common Language Infrastructure',
  'OS': ['Cross-platform'],
  'license': 'GNU General Public License'},
 'A-0 System': {},
 'A+': {'paradigm': 'Array',
  'designed_by': 'Arthur Whitney',
  'developer': 'Morgan Stanley',
  'initial release': '1988',
  'typing_discipline': 'Dynamic, strong',
  'license': 'GNU General Public License'},
 'ABAP': {'paradigm': 'Object-oriented, structured, imperative',
  'designed_by': 'SAP SE',
  'initial release': '1983',
  'typing_discipline': 'Static, strong, safe, nominative',
  'OS': ['Cross-platform']},
 'ABC': {'paradigm': 'multi-paradigm: imperative, procedural, structured',
  'designed_by': 'Leo Geurts, Lambert Meertens, Steven Pemberton',
  'developer': 'Centrum Wiskunde & Informatica',
  'initial release': '1987',
  'typing_discipline': 'strong, polymorphic',
  'OS': 'Unix-like

In [16]:
#tt = pd.DataFrame.from_dict(language_json, orient='index')

In [17]:
#tt.to_csv('./languages_data.csv')