Create a table for the app data

In [1]:
#%load_ext sql
import configparser
from urllib.parse import quote

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Read the database credentials from the local config file.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open, so fail early with a
# clear message instead of an obscure NoSectionError further down.
if not config.read('private.cfg'):
    raise FileNotFoundError("Could not read configuration file 'private.cfg'")
# NOTE(review): DB_NAME_DEFAULT is read but not used in the URL below, which
# targets the 'escorpus' database directly.
DB_NAME_DEFAULT = config.get('SQL', 'DB_NAME_DEFAULT')
DB_USER = config.get('SQL', 'DB_USER')
DB_PASSW = config.get('SQL', 'DB_PASSW')

# Percent-encode the credentials: characters such as '@', ':', '/' or '%' in a
# user name or password would otherwise corrupt the connection URL.
conn_string = "postgresql://{}:{}@127.0.0.1/escorpus".format(
    quote(DB_USER, safe=''), quote(DB_PASSW, safe='')
)


We wish to create a table of word frequencies, with one row per word (lemma) and one column per country.

In [3]:
# Pull the complete word_source table into a dataframe.
engine = create_engine(conn_string)
query = 'SELECT * FROM word_source;'
df = pd.read_sql(sql=query, con=engine)

In [4]:
# Preview the first ten rows of the loaded table.
df.head(n=10)

Unnamed: 0,textid,wid,word,lemma,country
0,124,11707993,@@124,,AR
1,124,7634,Gran,gran,AR
2,124,4419,convocatoria,convocatoria,AR
3,124,23,para,para,AR
4,124,7,el,el,AR
5,124,5962,docente,docente,AR
6,124,10,que,que,AR
7,124,15,se,se,AR
8,124,6,en,en,AR
9,124,4,la,la,AR


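Clean up the data: keep only lemmas made up entirely of lowercase letters of the Spanish alphabet, dropping any lemma that is empty or contains other characters (digits, punctuation, uppercase letters, etc.).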

In [5]:
# Keep only rows whose lemma consists solely of the characters listed here.
allowed_characters = r'abcdefghijklmnopqrstuvwxyzáéíñóúü'
# fullmatch() requires the whole string to match (a '$' anchor would also accept
# a trailing newline). na=False maps missing lemmas to False instead of NaN,
# which cannot be used as a boolean mask.
df = df[df['lemma'].str.fullmatch(f'[{allowed_characters}]+', na=False)]


First, count the global frequencies of each word (lemma) in the dataframe

In [6]:
# Number of occurrences of every lemma across all countries.
total_freq = df['lemma'].value_counts().to_dict()
# One row per lemma, in the same order as the counts above.
total_freq_df = pd.DataFrame(list(total_freq.items()), columns=['lemma', 'total_freq'])
total_freq_df

Unnamed: 0,lemma,total_freq
0,de,133829
1,el,110016
2,la,89173
3,que,70880
4,y,55363
...,...,...
50767,features,1
50768,indiewire,1
50769,marcy,1
50770,durkin,1


Next, compute the sample size (word count) and vocabulary size (number of distinct lemmas) for each country.

In [7]:
print(f"Total number of words in the database: {len(df)}")
# Per-country word count ('size') and distinct-lemma count ('vocab_size'),
# computed in a single groupby so both columns are aligned by country key
# rather than by the position of two separately built sequences.
country_data_size_df = (
    df.groupby('country')['lemma']
    .agg(size='count', vocab_size='nunique')
    .reset_index()
)
# Kept under their original names in case other cells read them.
country_data_size = country_data_size_df.set_index('country')['size'].to_dict()
country_vocab_size = country_data_size_df['vocab_size'].tolist()
print("Total words sourced by country: \n")
print(country_data_size_df)

Total number of words in the database: 1952238
Total words sourced by country: 

   country    size  vocab_size
0       AR  166593       12346
1       BO   32963        5009
2       CL   69729        7716
3       CO  146143       10852
4       CR   23405        3680
5       CU   42806        5056
6       DO   33436        5001
7       EC   49679        6175
8       ES  436715       21065
9       GT   96802        7952
10      HN   45695        5823
11      MX  238745       15048
12      NI   43334        6002
13      PA   26624        4317
14      PE  100338        9361
15      PR   20819        3534
16      PY   42104        5621
17      SV   32114        4936
18      US  172232       13506
19      UY   70696        7181
20      VE   61266        7060


Next, combining the previous two steps, create a dataframe with the frequency of each lemma within each country.

In [8]:
# Count each lemma within each country. Naming the value column explicitly
# avoids relying on the default name pandas gives to value_counts() output
# (the former rename of 'index' / 0 did not match any column).
lemma_freq_by_country = df.groupby("country")['lemma'].value_counts().reset_index(name='count')
# Attach each country's total size and vocabulary size to every row.
lemma_freq_by_country = lemma_freq_by_country.merge(country_data_size_df, on='country')
# Share of the country's words accounted for by this lemma.
lemma_freq_by_country['country_rel_freq'] = lemma_freq_by_country['count'] / lemma_freq_by_country['size']
lemma_freq_by_country

Unnamed: 0,country,lemma,count,size,vocab_size,country_rel_freq
0,AR,de,11614,166593,12346,0.069715
1,AR,el,9567,166593,12346,0.057427
2,AR,la,8412,166593,12346,0.050494
3,AR,que,6087,166593,12346,0.036538
4,AR,y,4584,166593,12346,0.027516
...,...,...,...,...,...,...
167236,VE,primaria,1,61266,7060,0.000016
167237,VE,dyer,1,61266,7060,0.000016
167238,VE,durán,1,61266,7060,0.000016
167239,VE,durch,1,61266,7060,0.000016


Finally, we want to calculate the frequency of each lemma in each country relative to its global relative frequency, expressed as a log2 ratio.

In [9]:
# Column for global relative frequency: each lemma's share of all words.
total_freq_df['glob_rel_freq'] = total_freq_df['total_freq'] / total_freq_df['total_freq'].sum()

# Merge the global statistics with the national statistics dataframe.
# Columns left over from a previous run of this cell are dropped first, so the
# cell can be re-run without producing suffixed duplicates (total_freq_x, ...).
lemma_freq_by_country = (
    lemma_freq_by_country
    .drop(columns=['total_freq', 'glob_rel_freq', 'country_glob_rel_freq'], errors='ignore')
    .merge(total_freq_df, on='lemma')
)

# National relative frequency divided by global relative frequency, on a log2
# scale: 0 means the country matches the global rate, +1 means twice the rate.
lemma_freq_by_country['country_glob_rel_freq'] = np.log2(
    lemma_freq_by_country['country_rel_freq'] / lemma_freq_by_country['glob_rel_freq']
)

lemma_freq_by_country

Unnamed: 0,country,lemma,count,size,vocab_size,country_rel_freq,total_freq,glob_rel_freq,country_glob_rel_freq
0,AR,de,11614,166593,12346,0.069715,133829,6.855158e-02,0.024275
1,BO,de,2585,32963,5009,0.078421,133829,6.855158e-02,0.194055
2,CL,de,4220,69729,7716,0.060520,133829,6.855158e-02,-0.179778
3,CO,de,10061,146143,10852,0.068844,133829,6.855158e-02,0.006131
4,CR,de,1628,23405,3680,0.069558,133829,6.855158e-02,0.021022
...,...,...,...,...,...,...,...,...,...
167236,VE,echenique,1,61266,7060,0.000016,1,5.122326e-07,4.993898
167237,VE,edelca,1,61266,7060,0.000016,1,5.122326e-07,4.993898
167238,VE,prevenar,1,61266,7060,0.000016,1,5.122326e-07,4.993898
167239,VE,dyer,1,61266,7060,0.000016,1,5.122326e-07,4.993898


Unwieldy. To make this dataframe a bit more manageable, create a new dataframe so that each row is a word, each column is a country, and the values are the relative frequencies.

In [10]:
# Reshape to one row per lemma and one column per country.
# Pivoting directly on lemma avoids the earlier pivot-then-transpose route,
# which looked up a column literally named 'country' among the lemma columns
# (so it only worked when the word "country" was in the corpus) and turned
# every numeric column into object dtype through the transpose.
# NOTE(review): fillna(0) stores 0 for lemmas never seen in a country, which is
# indistinguishable from a log2 ratio of 0 ("same as the global rate") - confirm
# this is what the app expects.
glob_rel_freq_df = (
    lemma_freq_by_country
    .pivot(index='lemma', columns='country', values='country_glob_rel_freq')
    .fillna(0)
    .rename_axis(columns=None)
    .reset_index()
)
glob_rel_freq_df

Unnamed: 0,lemma,AR,BO,CL,CO,CR,CU,DO,EC,ES,...,MX,NI,PA,PE,PR,PY,SV,US,UY,VE
0,a,-0.012839,0.090283,0.003787,-0.026677,-0.099502,-0.061563,-0.208675,0.079204,-0.022195,...,-0.027155,-0.062993,-0.085114,0.008588,-0.009442,0.0962,-0.114239,-0.004776,0.11669,0.060052
1,aa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,aaa,0.0,0.0,0.0,0.0,6.382168,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,aaaaaaaaaaaaaaa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.160365,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,aaaaaaaaaay,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.502704,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50767,útimo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.160365,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50768,útlimo,0.0,0.0,3.807226,0.0,0.0,0.0,0.0,0.0,1.160365,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50769,über,0.0,0.0,4.807226,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50770,übermensch,3.550729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Nice. Now add this table to our database.

In [177]:
# Write the lemma-by-country table to the database, replacing any existing
# table of the same name.
table_name = 'frequency'
glob_rel_freq_df.to_sql(name=table_name, con=engine, if_exists='replace', index=False)

772