In [187]:
# !pip3 install openpyxl
import pandas as pd
import numpy as np
import re


### Data source is currently a pair of local Excel workbooks (.xlsx)
### Later on this will be replaced by a web scraper or a data dump to an S3 bucket

In [188]:
# Read the two input workbooks: the UPR records and the sheet whose column
# headers are later used as tags.
df = pd.read_excel(io=r'UPR_cycle2.xlsx')
tags_matrix = pd.read_excel(io=r'themes_to_tags.xlsx')

In [189]:
# Shift the default 0-based row labels so that they start at 1.
df.index = df.index + 1
# Correct the misspelled source headers ("Reccomending" -> "Recommending").
df.rename(
    columns={
        "Reccomending Body": "Recommending Body",
        "UPR Reccomending States": "UPR Recommending States",
    },
    inplace=True,
)

# Data Cleaning

##### - Check the number of unique values of Text and OHCHR Annotation Id to establish a primary key
##### - Both have 625 unique entries, which equals the total number of database entries
##### - Normalise by removing one of them: two identifiers are unnecessary because the second adds no further information

In [190]:
# Number of distinct values in the Text column.
text_count = len(df["Text"].unique())
print(text_count)

625


In [191]:
# Number of distinct values in the OHCHR Annotation Id column.
annotation_id_count = len(pd.unique(df["OHCHR Annotation Id"]))
print(annotation_id_count)

625


##### Columns to be dropped
##### Annotation ID, Recommending body, Text, Affected persons, Sdgs, Document publication date, session, date
##### Check that every entry is a recommendation by testing whether its text contains a number, then drop the Type column.

In [192]:
# Columns removed from the raw frame in the next cell.
drop_columns = [
    "OHCHR Annotation Id",
    "Recommending Body",
    "Document Publication Date",
    "Sdgs",
    "UPR Session",
    "Date of publication on UHRI",
    "Affected Persons",
]

In [193]:
# Remove the columns listed in drop_columns from df (modifies df in place).
df.drop(columns=drop_columns, inplace=True)

In [194]:
def has_numbers(inputString):
    """Return True when ``inputString`` contains at least one digit, else False."""
    first_digit = re.search(r'\d', inputString)
    return first_digit is not None

In [195]:
# Keep only the rows whose Text contains a digit (see has_numbers), then
# discard the 'Type' column.
# The drop is chained and non-inplace: an in-place drop would act on the
# filtered slice of the previous frame, a pattern pandas can flag with
# SettingWithCopyWarning.
df = df[df['Text'].apply(has_numbers)].drop(columns=['Type'])

In [196]:
# Label the row index so that it becomes the 'Recommendation_id' column once the index is reset.
df.index = df.index.rename('Recommendation_id')

#### Standardize the naming convention, by removing hyphens and spaces, of all data entries.

In [197]:
def remove_leading_char(data_entry):
    """Strip the leading bullet/numbering characters from a data entry.

    Everything that precedes the first ASCII uppercase letter is removed,
    e.g. ``"- Uganda"`` becomes ``"Uganda"`` and ``"114.1 Promptly ratify"``
    becomes ``"Promptly ratify"``.

    Parameters
    ----------
    data_entry : str
        The raw cell text.

    Returns
    -------
    str
        The cleaned text. An entry that contains no uppercase letter is
        returned unchanged instead of falling through to an implicit ``None``:
        the notebook wraps every cell in ``str`` before calling this function,
        so a ``None`` produced here would turn into the literal text
        ``"None"`` when the cleaning is applied a second time.
    """
    cleaned_entry = re.sub(r'^[^A-Z]*', '', data_entry)
    # An empty result means the entry had no uppercase letter at all; keep the
    # original value rather than discarding it.
    return cleaned_entry if cleaned_entry else data_entry

In [198]:
def add_empty_cell_string(df):
    """Fill every missing cell of ``df`` with the placeholder 'Missing data'.

    The frame is modified in place and the very same object is returned, so
    the caller's variable and the return value refer to one DataFrame.
    """
    placeholder = 'Missing data'
    df.fillna(value=placeholder, inplace=True)
    return df

In [199]:
# add_empty_cell_string fills the missing cells in place and returns the same
# object, so df2 and df refer to one DataFrame from here on.
df2 = add_empty_cell_string(df)

In [200]:
# Split each newline-separated Themes cell into a list, then give every list
# element its own row.
# Note: the column assignment goes through df2, which is still the same object
# as df at this point, so df's Themes column is changed as well. Re-running
# this cell without re-running the earlier ones will not reproduce the result.
df2['Themes'] = df2['Themes'].str.split('\n')
df2 = df2.explode('Themes')

In [201]:
# Clean every cell: convert it to text, then strip whatever precedes the first uppercase letter.
df3 = df2.applymap(lambda cell: remove_leading_char(str(cell)))

In [202]:
# df4 starts out as another name for df3 (no copy is made), so the two column
# assignments below also change df3.
df4 = df3
# Split the newline-separated region and state cells into lists.
df4['Recommending Regions'] = df4['Recommending Regions'].str.split('\n')
df4['UPR Recommending States'] = df4['UPR Recommending States'].str.split('\n')
# Explode both list columns together so each state/region pair gets its own row.
# NOTE(review): exploding several columns at once expects the lists in a row to
# have matching lengths — confirm this holds for the source data.
df4 = df4.explode(['UPR Recommending States', 'Recommending Regions'])
# Second cleaning pass over every cell (each value is converted with str first).
df4 = df4.applymap(lambda x: remove_leading_char(str(x)))

In [203]:
# Recommendations table: cleaned Text and UPR Position, with the row index
# moved into an ordinary column.
Recommendations_df = (
    df[['Text', 'UPR Position']]
    .applymap(lambda cell: remove_leading_char(str(cell)))
    .reset_index(level=0)
)

In [204]:
# Link table between recommendations and countries. Only the first row per
# Recommendation_id is kept after the index has been turned into a column.
Country_to_recommendations = (
    df4[['Countries Concerned', 'UPR Recommending States']]
    .rename_axis('Recommendation_id')
    .reset_index(level=0)
    .drop_duplicates(subset=['Recommendation_id'], keep='first')
)

In [205]:
# Build the country lookup table from both roles a country can play:
# c1 holds the countries concerned, c2 the recommending states, each paired
# with its region and renamed to a common (Country name, Regions) layout.
c1 = df4[['Countries Concerned','Regions Concerned']].rename(columns={"Countries Concerned":"Country name", "Regions Concerned": "Regions"})
c2 = df4[['UPR Recommending States', 'Recommending Regions']].rename(columns={"UPR Recommending States":"Country name", "Recommending Regions": "Regions"})
# Stack both and keep the first occurrence of each country name.
Countries_df = pd.concat([c1,c2], ignore_index=True).drop_duplicates(subset=['Country name'], keep='first')
# `drop` takes a boolean; the string 'True' used before only worked because
# any non-empty string is truthy.
Countries_df = Countries_df.reset_index(drop=True)
# Number the countries from 1 and expose that number as the 'Country id' column.
Countries_df.index = Countries_df.index.set_names(['Country id'])
Countries_df.index+=1
Countries_df = Countries_df.reset_index()

In [206]:
# Replace the two country-name columns with the matching Country id values.
# Each step joins on the name, drops the name columns and gives the id column
# the name of the column it replaces.
Country_to_recommendations = (
    Country_to_recommendations
    .merge(Countries_df[["Country name", "Country id"]], left_on='Countries Concerned', right_on='Country name')
    .drop(columns=['Countries Concerned', 'Country name'])
    .rename(columns={"Country id": "Countries Concerned"})
    .merge(Countries_df[["Country name", "Country id"]], left_on='UPR Recommending States', right_on='Country name')
    .drop(columns=['UPR Recommending States', 'Country name'])
    .rename(columns={"Country id": "UPR Recommending States"})
)

In [207]:
# Region lookup table: one row per distinct region, numbered from 1 in the
# 'Region id' column.
Regions_df = pd.DataFrame(Countries_df['Regions'].unique(), columns=['Regions'])
Regions_df.index = (Regions_df.index + 1).rename('Region id')
Regions_df = Regions_df.reset_index()

In [208]:
# Attach the Region id to every country, then discard the region name.
Countries_df = (
    Countries_df
    .merge(Regions_df, left_on="Regions", right_on="Regions")
    .drop(columns=["Regions"])
)

In [209]:
# Link table between recommendations and themes; the row index becomes the
# 'Recommendation id' column.
Recommendation_to_theme = (
    df4[['Themes']]
    .rename_axis('Recommendation id')
    .reset_index(level=0)
)

In [210]:
# Theme lookup table: one row per distinct theme, numbered from 1 in the
# 'Theme id' column.
Themes_df = pd.DataFrame(df4['Themes'].unique(), columns=['Themes'])
Themes_df.index = (Themes_df.index + 1).rename('Theme id')
Themes_df = Themes_df.reset_index()


In [211]:
# Swap each theme name for its Theme id; the id column then takes over the
# 'Themes' column name.
Recommendation_to_theme = (
    Recommendation_to_theme
    .merge(Themes_df, left_on='Themes', right_on='Themes')
    .drop(columns=['Themes'])
    .rename(columns={'Theme id': 'Themes'})
)

In [None]:
# Tag lookup table: the column headers of tags_matrix, numbered from 1 in the
# 'Tag id' column.
tags_df = pd.DataFrame(tags_matrix.columns, columns=['Tags'])
tags_df.index = (tags_df.index + 1).rename('Tag id')
tags_df = tags_df.reset_index()

In [None]:

# Unpivot tags_matrix into (Tag, Theme) pairs, then replace both names with
# their ids by joining against the tag and theme lookup tables.
tags_to_themes = tags_matrix.melt(
    value_vars=tags_matrix.columns.tolist(),
    var_name='Tag',
    value_name='Theme',
)
tags_to_themes = (
    tags_to_themes
    .merge(tags_df[["Tags", "Tag id"]], left_on='Tag', right_on='Tags')
    .drop(columns=['Tags', 'Tag'])
    .merge(Themes_df[["Themes", "Theme id"]], left_on='Theme', right_on='Themes')
    .drop(columns=['Themes', 'Theme'])
)

### Normalized Dataframes

In [212]:
# Show the recommendations table (Recommendation_id, Text, UPR Position).
Recommendations_df

Unnamed: 0,Recommendation_id,Text,UPR Position
0,1,Promptly ratify the Optional Protocol to the C...,Noted
1,2,Ensure full and equal access to modern contrac...,Supported
2,3,Protect the rights of indigenous peoples throu...,Supported
3,4,Continue enhancing the school infrastructure f...,Supported
4,5,Continue implementing mechanisms for the prior...,Supported
...,...,...,...
620,621,Adopt legislation to guarantee the fulfilment ...,Supported
621,622,Establish clear consultation procedures in ord...,Noted
622,623,Consider ratifying ILO Convention N° 169 ( Nor...,Supported
623,624,Extend an invitation to the Working Group on e...,Supported


In [213]:
# Show the recommendation-to-country link table; after the merges both country
# columns hold Country id values instead of names.
Country_to_recommendations

Unnamed: 0,Recommendation_id,Countries Concerned,UPR Recommending States
0,1,1,70
1,29,11,70
2,154,28,70
3,164,30,70
4,206,33,70
...,...,...,...
620,523,65,147
621,533,67,148
622,590,73,148
623,548,70,68


In [214]:
# Show the country lookup table (Country id, Country name, Region id).
Countries_df

Unnamed: 0,Country id,Country name,Region id
0,1,Uganda,1
1,6,United Republic of Tanzania,1
2,7,Eswatini,1
3,15,Namibia,1
4,16,Sierra Leone,1
...,...,...,...
144,132,Saudi Arabia,5
145,133,Viet Nam,5
146,135,State of Palestine*,5
147,148,Oman,5


In [215]:
# Show the region lookup table (Region id, Regions).
Regions_df

Unnamed: 0,Region id,Regions
0,1,Africa
1,2,Latin America and the Caribbean
2,3,Eastern Europe
3,4,Western Europe & Others
4,5,Asia-Pacific
5,6,Missing data


In [216]:
# Show the recommendation-to-theme link table; the Themes column holds Theme id
# values. Note the key column here is named 'Recommendation id' (with a space),
# unlike 'Recommendation_id' in the other tables.
Recommendation_to_theme

Unnamed: 0,Recommendation id,Themes
0,1,1
1,9,1
2,11,1
3,16,1
4,35,1
...,...,...
1534,525,65
1535,543,66
1536,544,66
1537,589,67


In [217]:
# Show the theme lookup table (Theme id, Themes).
Themes_df

Unnamed: 0,Theme id,Themes
0,1,Labour rights and right to work
1,2,Ratification of & accession to international i...
2,3,Prohibition of torture & ill-treatment (includ...
3,4,Sexual & reproductive health and rights
4,5,Right to participate in public affairs & right...
...,...,...
63,64,Good governance & corruption
64,65,Private life & privacy
65,66,Right to be recognized as a person before the law
66,67,Trade union rights


In [221]:
# Show the tag lookup table (Tag id, Tags).
tags_df

Unnamed: 0,Tag id,Tags
0,1,Scope of international obligations and coopera...
1,2,National human rights framework
2,3,Development
3,4,Environment
4,5,Business and human rights
5,6,Human rights and counter-terrorism
6,7,Right to land
7,8,Civil & political rights - general measures of...
8,9,Equality and non-discrimination
9,10,"Right to life, liberty and security of person"


In [220]:
# Show the tag-to-theme link table (Tag id, Theme id).
tags_to_themes

Unnamed: 0,Tag id,Theme id
0,1,40
1,1,2
2,1,54
3,1,14
4,1,26
...,...,...
58,30,43
59,31,49
60,31,47
61,31,50
