In [1]:
from PIL import Image as im
from tqdm import tqdm
import os
import time
import imageio as iio
import requests
import pandas as pd

# Download language data from https://www.worlddata.info/languages/index.php

In [2]:
# Fetch the worlddata.info language index page and parse its HTML.
from bs4 import BeautifulSoup

url = "https://www.worlddata.info/languages/index.php"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page flow into the table-parsing cell that follows.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content

# NOTE(review): no parser is named, so bs4 picks whichever one is installed;
# consider pinning one once the parsed output is confirmed to be unchanged.
soup = BeautifulSoup(html)
table = soup.select("td , th")  # every data and header cell on the page

# header cells only; their text becomes the DataFrame column names
th_all = soup.find_all('th')

In [3]:
# Column names are the text of the header cells.
col_names = [header.text for header in th_all]

# Every <tr> on the page; the first one is the header row.
ratio = soup.find_all('tr')

# Collect all rows first and build the frame once: enlarging a DataFrame with
# df.loc[i] = row inside a loop gets slower with every row added.
rows = []
for tr in ratio[1:]:
    # one value per child node of the row
    row = [cell.text for cell in tr]
    # a row that does not line up with the header is an error, as it was with
    # the row-by-row assignment
    if len(row) != len(col_names):
        raise ValueError(
            f"row has {len(row)} cells but the header has {len(col_names)} columns"
        )
    rows.append(row)

df = pd.DataFrame(rows, columns=col_names)

# df.to_csv('worlddata_main_table.csv')

In [4]:
# Keep an independent copy of the main language table.
# NOTE(review): the name df_total is rebound further down to the per-language
# totals produced by the groupby cell, so this copy is lost at that point.
df_total = df.copy()

In [5]:
# Show the distinct values of the 'Mother tongue' column.
df_total['Mother tongue'].unique()

array(['Chinese', 'Hindi', 'English', 'Spanish', 'Arabic', 'Bengali',
       'Portuguese', 'Russian', 'Punjabi', 'Japanese', 'Javanese',
       'Telugu', 'Marathi', 'French', 'German', 'Urdu', 'Tamil',
       'Vietnamese', 'Korean', 'Turkish', 'Gujarati', 'Italian',
       'Persian', 'Hausa', 'Malay', 'Kannada', 'Pashto', 'Yoruba',
       'Tagalog', 'Oriya'], dtype=object)

## Language code mapping

In [6]:
# Fetch the Wikimedia list of language names ordered by code and parse its HTML.
from bs4 import BeautifulSoup

url = "https://meta.wikimedia.org/wiki/Template:List_of_language_names_ordered_by_code"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content

# NOTE(review): no parser is named, so bs4 picks whichever one is installed;
# consider pinning one once the parsed output is confirmed to be unchanged.
soup = BeautifulSoup(html)
table = soup.select("td , th")  # every data and header cell on the page

# header cells; only the first six are used as column names later on
th_all = soup.find_all('th')

In [7]:
# Only the first six header cells are taken as column names.
col_names = [header.text for header in th_all[:6]]

In [8]:
# Every <tr> on the page; the first one is the header row.
ratio = soup.find_all('tr')

# Rows are collected keyed by their position and the frame is built once at
# the end. Skipped rows still leave a gap in the index, exactly as they did
# when rows were assigned one at a time.
kept_rows = {}
n_cols = len(col_names)

for ind, tr in enumerate(ratio[1:]):
    row = []
    for cell in tr:
        # str() turns a missing .string (None) into the text 'None'
        i_string = str(cell.string)
        # a bare newline between cells is formatting, not data
        if i_string != '\n':
            row.append(i_string.replace('\n', ''))

    # pad with up to two trailing None values, then cap the row at six cells
    row.append(None)
    if len(row) < 6:
        row.append(None)
    row = row[0:6]

    # Rows whose width still differs from the header cannot be placed in the
    # frame. They are skipped explicitly here instead of relying on a bare
    # `except: pass` around the assignment.
    if len(row) == n_cols:
        kept_rows[ind] = row

df = pd.DataFrame(
    list(kept_rows.values()),
    index=list(kept_rows.keys()),
    columns=col_names,
)

# df.to_csv('language_code_mapping.csv')

In [9]:
# Take an independent copy (as the other snapshot cells do) so that later
# work on `df` can never alter the code mapping through a shared reference.
language_mapping = df.copy()

In [10]:
# Preview the first rows of the code-to-language mapping.
language_mapping.head()

Unnamed: 0,code,Englishlanguage name,directionality,locallanguage name,local or EnglishWikipedia article,comment\n
0,aa,Afar,ltr,Afar,,
1,ab,Abkhazian,ltr,Аҧсуа,,
2,af,Afrikaans,ltr,Afrikaans,,
3,ak,Akan,ltr,Akana,,
4,als,Alemannic,ltr,Alemannisch,,


# Getting the individual language page tables

In [11]:
 # Collect the href of every anchor inside a first-child <td> of the parsed page.
# NOTE(review): in top-to-bottom order `soup` was last assigned from the
# Wikimedia language-code page, not worlddata.info, and `links` is rebuilt
# from the language names in a later cell before it is used. Confirm whether
# this cell is still needed.
urls = soup.select('td:nth-child(1) a')
links = []
for url in urls:
    # raises KeyError for an anchor without an href attribute
    links.append(url['href'])

In [12]:
def get_table(link, timeout=30):
    """Scrape the table on one worlddata.info language page into a DataFrame.

    Parameters
    ----------
    link : str
        Page name, inserted into
        ``https://www.worlddata.info/languages/<link>.php``.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot hang the scraping loop.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` after the first, columns named after the page's
        ``<th>`` cells, plus a ``Language`` column holding the capitalised
        page name. A page without table rows yields an empty frame.

    Raises
    ------
    ValueError
        If a row's cell count differs from the number of header cells.
    """
    url = 'https://www.worlddata.info/languages/' + link + '.php'
    print(url)

    # NOTE(review): the HTTP status is not checked, so a missing page is
    # parsed like any other response.
    html = requests.get(url, timeout=timeout).content
    soup = BeautifulSoup(html)

    # header cell text becomes the column names
    col_names = [header.text for header in soup.find_all('th')]

    # Gather every row after the header row, then build the frame in one go
    # rather than enlarging it one row at a time.
    rows = []
    for tr in soup.find_all('tr')[1:]:
        row = [cell.text for cell in tr]
        if len(row) != len(col_names):
            raise ValueError(
                f"{url}: row has {len(row)} cells but the header has "
                f"{len(col_names)} columns"
            )
        rows.append(row)
    df = pd.DataFrame(rows, columns=col_names)

    # adding in base page name
    lang = link.replace('/languages/', '').replace('.php', '').capitalize()
    df['Language'] = lang

    # short pause between consecutive page requests
    time.sleep(0.5)

    return df

In [13]:
 # Candidate page names: the distinct lower-cased English language names.
urls = language_mapping['Englishlanguage name'].str.lower().unique()
# names containing '/' are left out of the page list
links = [name for name in urls if '/' not in name]

In [14]:
# Scrape every candidate page and stack the per-language tables.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, so the frames are collected and concatenated once instead.
frames = [get_table(link) for link in links]
# pd.concat raises on an empty list; fall back to an empty frame in that case
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

https://www.worlddata.info/languages/afar.php
https://www.worlddata.info/languages/abkhazian.php
https://www.worlddata.info/languages/afrikaans.php
https://www.worlddata.info/languages/akan.php
https://www.worlddata.info/languages/alemannic.php
https://www.worlddata.info/languages/amharic.php
https://www.worlddata.info/languages/aragonese.php
https://www.worlddata.info/languages/angal.php
https://www.worlddata.info/languages/arabic.php
https://www.worlddata.info/languages/aramaic.php
https://www.worlddata.info/languages/egyptian arabic.php
https://www.worlddata.info/languages/assamese.php
https://www.worlddata.info/languages/asturian.php
https://www.worlddata.info/languages/avar.php
https://www.worlddata.info/languages/awadhi.php
https://www.worlddata.info/languages/aymara.php
https://www.worlddata.info/languages/azerbaijani.php
https://www.worlddata.info/languages/bashkir.php
https://www.worlddata.info/languages/bavarian.php
https://www.worlddata.info/languages/samogitian.php
https://

In [15]:
# Preview the first rows of the combined per-language tables.
df.head()

Unnamed: 0,Country,Region,Official language,Distribution,Total,Language
0,Ethiopia,Eastern Africa,no,1.7 %,2045000,Afar
1,Djibouti,Eastern Africa,no,31.5 %,348000,Afar
2,Eritrea,Eastern Africa,no,4.3 %,156000,Afar
3,South Africa,Southern Africa,yes,13.5 %,8018000,Afrikaans
4,Namibia,Southern Africa,no,10.4 %,263000,Afrikaans


In [16]:
# Work on a copy so the scraped frame `df` is not touched by the in-place
# conversion of the 'Total' column that follows.
df_language = df.copy()

## Join Tables

In [17]:
# Inspect the rows scraped for one language (Afar).
df_language[df_language['Language'] == 'Afar']

Unnamed: 0,Country,Region,Official language,Distribution,Total,Language
0,Ethiopia,Eastern Africa,no,1.7 %,2045000,Afar
1,Djibouti,Eastern Africa,no,31.5 %,348000,Afar
2,Eritrea,Eastern Africa,no,4.3 %,156000,Afar


In [18]:
# Strip thousands separators and convert 'Total' to integers.
# Casting to str first keeps the cell re-runnable: after one run the column is
# already int and no longer offers the .str accessor. regex=False makes the
# comma a literal regardless of the pandas version's default.
df_language['Total'] = (
    df_language['Total']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(int)
)

In [19]:
# Sum the 'Total' column per language and expose it as 'Worldwide total'.
df_total = (
    df_language
    .groupby(['Language'])['Total']
    .sum()
    .reset_index()
    .rename(columns={'Total': 'Worldwide total'})
)
# spot-check one language
df_total[df_total['Language'] == 'Chinese']

Unnamed: 0,Language,Worldwide total
27,Chinese,1362820200


In [20]:
# Step 1: attach each language's worldwide total to its per-country rows.
per_country = df_language[['Country', 'Language', 'Official language', 'Total']]
worldwide = df_total[['Language', 'Worldwide total']]
language_output = per_country.merge(worldwide, on=['Language'])

# Step 2: look up the language code through the English language name.
# NOTE(review): merge defaults to an inner join, so a language whose
# capitalised page name does not exactly match an 'Englishlanguage name'
# entry is dropped from the output.
codes = language_mapping[['code', 'Englishlanguage name']]
language_output = language_output.merge(
    codes,
    left_on='Language',
    right_on='Englishlanguage name',
)

In [21]:
# Spot-check the joined result for one language (Finnish).
language_output[language_output['Language'] == 'Finnish']

Unnamed: 0,Country,Language,Official language,Total,Worldwide total,code,Englishlanguage name
270,Finland,Finnish,yes,5164000,5424000,fi,Finnish
271,Sweden,Finnish,no,250000,5424000,fi,Finnish
272,Estonia,Finnish,no,9000,5424000,fi,Finnish
273,Åland Islands,Finnish,no,1000,5424000,fi,Finnish


In [22]:
# Keep the six output columns in a fixed order. The order matters because the
# Snowflake load further down reads the exported CSV by position ($1..$6).
language_output = language_output[['code', 'Language', 'Country', 'Official language', 'Total', 'Worldwide total']]

In [23]:
import boto3
import io

In [24]:
def to_s3(filename, output_bucket, content):
    """Upload ``content`` to S3 as object ``filename`` in ``output_bucket``.

    A new boto3 S3 client is created on every call; nothing is returned.
    """
    boto3.client('s3').put_object(
        Bucket=output_bucket,
        Key=filename,
        Body=content,
    )

In [25]:
# Destination bucket for the exported CSV.
output_bucket = 'hbo-outbound-datascience-content-dev'
# NOTE(review): `s3` and `bucket` are not referenced by any other visible
# cell; the upload goes through to_s3(), which creates its own client.
s3 = boto3.resource('s3')
bucket = s3.Bucket(output_bucket)



In [26]:
# Serialise the joined table to CSV text (no index column) and upload it.
filename = 'market_share_of_demand/Language_Country_Mapping_all.csv'
# to_csv returns the CSV as a string when no target is given
content = language_output.to_csv(index=False)
to_s3(filename, output_bucket, content)

In [27]:
# general
import boto3
import datetime as dt
import json
import numpy as np
import pandas as pd
import snowflake.connector

from scipy.spatial.distance import squareform
#from scipy.cluster.hierarchy import ward
from scipy.cluster.hierarchy import  linkage
from scipy.cluster.hierarchy import fcluster

In [28]:
# Let pandas display up to 1000 rows before truncating the output.
pd.set_option('display.max_rows', 1000)

### 0.2 : Connection to Snowflake

In [29]:
from abc import ABCMeta, abstractmethod

class Credentials(metaclass=ABCMeta):
    """Base class for credential providers; it declares no members of its own."""
    
    
class SSMPSCredentials(Credentials):
    """Credentials backed by an AWS Secrets Manager secret.

    Parameters
    ----------
    secretid : str
        Identifier passed as ``SecretId`` to Secrets Manager.
    region_name : str, optional
        AWS region of the Secrets Manager client. Defaults to ``'us-east-1'``,
        the value that used to be hard-coded.
    """

    def __init__(self, secretid: str, region_name: str = 'us-east-1'):
        self._secretid = secretid
        self._region_name = region_name
        # not populated by this class; kept so the attribute still exists
        self._secrets = {}

    def get_keys(self):
        """Fetch the secret and return the raw ``get_secret_value`` response."""
        secrets_client = boto3.client(
            service_name='secretsmanager',
            region_name=self._region_name,
        )
        return secrets_client.get_secret_value(SecretId=self._secretid)
    
    
class BaseConnector(metaclass=ABCMeta):
    """Abstract base for connectors; concrete subclasses must provide ``connect``."""

    @abstractmethod
    def connect(self):
        """Open a connection; this base implementation only raises."""
        raise NotImplementedError
    

class SnowflakeConnector(BaseConnector):
    """Opens Snowflake connections using secrets obtained from a credentials object."""

    def __init__(self, credentials: Credentials):
        # The secret payload is a JSON document; a response without a
        # 'SecretString' entry decodes to an empty dict.
        secret_string = credentials.get_keys().get('SecretString', "{}")
        self._secrets = json.loads(secret_string)

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        """Return a new Snowflake connection for ``dbname`` and ``schema``.

        Login name, password, account and warehouse are read from the stored
        secrets; a missing key raises ``KeyError``.
        """
        secrets = self._secrets
        connect_kwargs = {
            'user': secrets['login_name'],
            'password': secrets['login_password'],
            'account': secrets['account'],
            'warehouse': secrets['warehouse'],
            'database': dbname,
            'schema': schema,
        }
        return snowflake.connector.connect(**connect_kwargs)
    
## Credentials
# Secret id handed to SSMPSCredentials, which passes it to Secrets Manager.
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

## Snowflake connection
# NOTE: constructing the connector fetches the secret and connect() opens the
# session as soon as this cell runs.
# The session is opened on database MAX_DEV / schema WORKSPACE, while the
# queries further down address fully qualified max_prod.workspace objects.
conn=SnowflakeConnector(SSMPSCredentials(SF_CREDS))
ctx=conn.connect("MAX_DEV","WORKSPACE")

def run_query(query):
    """Execute ``query`` on the module-level connection ``ctx``.

    Parameters
    ----------
    query : str
        SQL text passed unchanged to ``cursor.execute``.

    Returns
    -------
    pandas.DataFrame
        All fetched rows, with column names taken from the cursor description
        and lower-cased.
    """
    cursor = ctx.cursor()
    try:
        cursor.execute(query)
        rows = cursor.fetchall()
        columns = [desc[0] for desc in cursor.description]
    finally:
        # release the cursor even when execute/fetch raises
        cursor.close()
    df = pd.DataFrame(rows, columns=columns)
    df.columns = df.columns.str.lower()
    return df



In [31]:
# Preview the frame that was exported to S3.
language_output.head()

Unnamed: 0,code,Language,Country,Official language,Total,Worldwide total
0,aa,Afar,Ethiopia,no,2045000,2549000
1,aa,Afar,Djibouti,no,348000,2549000
2,aa,Afar,Eritrea,no,156000,2549000
3,af,Afrikaans,South Africa,yes,8018000,8281000
4,af,Afrikaans,Namibia,no,263000,8281000


In [33]:
# (Re)create the target table. Its column order follows the exported CSV:
# code, Language, Country, Official language, Total, Worldwide total.
# NOTE(review): the table name ("LANGURAGE") and the column "Languag_name"
# look misspelled; renaming would affect anything that already reads this
# table, so confirm before changing.
run_query('''
create or replace table max_prod.workspace.MARKET_DEMAND_LANGURAGE_COUNTRY_MAPPING 
(
Language VARCHAR,
Languag_name	VARCHAR, 
Country VARCHAR,  
OFFICIAL_LANGUAGE	VARCHAR, 
POPULATION	bigint, 
TOTAL bigint
)
''')

Unnamed: 0,status
0,Table MARKET_DEMAND_LANGURAGE_COUNTRY_MAPPING ...


In [34]:
# Load the CSV that was uploaded to S3 into the table through the stage;
# $1..$6 select the file's columns by position, and the header row is skipped.
# NOTE(review): on_error = 'CONTINUE' lets the load proceed past rows that
# fail, so check errors_seen in the returned status rather than assuming
# every row arrived.
run_query('''
copy into max_prod.WORKSPACE.MARKET_DEMAND_LANGURAGE_COUNTRY_MAPPING
    from(
        select
              $1, $2, $3, $4, $5, $6
        from @HBO_OUTBOUND_DATASCIENCE_CONTENT_DEV/market_share_of_demand/Language_Country_Mapping_all.csv
        )
    file_format = (type = csv null_if=('') skip_header = 1  FIELD_OPTIONALLY_ENCLOSED_BY='"')
    on_error = 'CONTINUE';

''')

Unnamed: 0,file,status,rows_parsed,rows_loaded,error_limit,errors_seen,first_error,first_error_line,first_error_character,first_error_column_name
0,s3://hbo-outbound-datascience-content-dev/mark...,LOADED,769,769,769,0,,,,


In [35]:
# Rebuild the table with a PERCENTAGE column: POPULATION divided by TOTAL.
# The source columns are listed explicitly instead of SELECT *. Once
# PERCENTAGE exists, re-running `SELECT *, ... AS PERCENTAGE` would yield the
# column twice, so the cell could only ever be run once.
# NULLIF turns a zero TOTAL into NULL so the division gives NULL instead of a
# division-by-zero error.
run_query('''
create or replace table max_prod.workspace.MARKET_DEMAND_LANGURAGE_COUNTRY_MAPPING  as 
SELECT Language, Languag_name, Country, OFFICIAL_LANGUAGE, POPULATION, TOTAL,
       POPULATION / NULLIF(TOTAL, 0) AS PERCENTAGE
FROM max_prod.workspace.MARKET_DEMAND_LANGURAGE_COUNTRY_MAPPING
''')

Unnamed: 0,status
0,Table MARKET_DEMAND_LANGURAGE_COUNTRY_MAPPING ...
