In [75]:
# !pip3 install openpyxl
import pandas as pd
import re


In [76]:
df = pd.read_excel(r'UPR_cycle2.xlsx')

In [77]:
df.index+=1
df.rename(columns={"Reccomending Body": "Recommending Body", "UPR Reccomending States": "UPR Recommending States"}, inplace=True)
df.head(2)

Unnamed: 0,Text,Countries Concerned,Recommending Body,Document Symbol,Themes,Affected Persons,Sdgs,Document Publication Date,UPR Recommending States,UPR Position,Type,OHCHR Annotation Id,UPR Session,Regions Concerned,Recommending Regions,Date of publication on UHRI
1,117.22 Promptly ratify the Optional Protocol t...,- Uganda,- UPR,A/HRC/34/10,- Labour rights and right to work\n- Ratificat...,- Indigenous peoples,,2016-12-27,- Guatemala,- Noted,- Recommendations,4100eed4-fcf7-42d6-941a-a89493e77e6c,- 26th Session - Cycle 2,- Africa,- Latin America and the Caribbean,2018-06-20 13:50:21.503
2,133.247 Ensure full and equal access to modern...,- Venezuela (Bolivarian Republic of),- UPR,A/HRC/34/6,- Sexual & reproductive health and rights,- Indigenous peoples\n- Persons living in rura...,,2016-12-27,- Denmark,- Supported,- Recommendations,f910ce5d-0d1c-45de-b64b-3050df7679dc,- 26th Session - Cycle 2,- Latin America and the Caribbean,- Western Europe & Others,2018-05-04 12:18:58.713


# Data Cleaning

##### - Check unique number of text & annotation id to establish primary key
##### - Both have 625 unique entries which is the same as the total database entries
##### - Normalise by removing one of these, two identifiers is uneccesary we are not gaining anymore information

In [78]:
text_count = df.Text.unique().size
print(text_count)

625


In [79]:
annotation_id_count = df["OHCHR Annotation Id"].unique().size
print(annotation_id_count)

625


##### Columns to be dropped
##### Annotation ID, Recommending body, Text, Affected persons, Sdgs, Document publication date, session, date
##### check all entries are recommendation by seeing if text starts with number, then drop this column. 

In [80]:
drop_columns = ["OHCHR Annotation Id","Recommending Body", "Document Publication Date", "Sdgs", "UPR Session", "Date of publication on UHRI", "Affected Persons"]

In [81]:
df.drop(drop_columns, inplace=True, axis=1)

In [82]:
df.head(1)

Unnamed: 0,Text,Countries Concerned,Document Symbol,Themes,UPR Recommending States,UPR Position,Type,Regions Concerned,Recommending Regions
1,117.22 Promptly ratify the Optional Protocol t...,- Uganda,A/HRC/34/10,- Labour rights and right to work\n- Ratificat...,- Guatemala,- Noted,- Recommendations,- Africa,- Latin America and the Caribbean


In [83]:
def has_numbers(inputString):
    return bool(re.search(r'\d', inputString))

In [84]:
df = df[df['Text'].apply(lambda x: has_numbers(x))]
df.drop(['Type'], inplace=True, axis=1)

In [85]:
df.index = df.index.set_names(['Recommendation_id'])
df.head(1)

Unnamed: 0_level_0,Text,Countries Concerned,Document Symbol,Themes,UPR Recommending States,UPR Position,Regions Concerned,Recommending Regions
Recommendation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,117.22 Promptly ratify the Optional Protocol t...,- Uganda,A/HRC/34/10,- Labour rights and right to work\n- Ratificat...,- Guatemala,- Noted,- Africa,- Latin America and the Caribbean


#### Standardize the naming convention, by removing hyphens and spaces, of all data entries.

In [86]:
def remove_leading_char(data_entry):
    """"Clean the naming convention for data entries of all columns"""
    clean_aff = re.sub(r'^[^A-Z]*', '', data_entry)
    if clean_aff:
        return clean_aff

In [87]:
def add_empty_cell_string(df):
    """Replacing all empty cells with default string 'Missing data'"""
    df.fillna('Missing data', inplace=True)
    return df 

In [88]:
df2 = add_empty_cell_string(df)
df2.head(1)

Unnamed: 0_level_0,Text,Countries Concerned,Document Symbol,Themes,UPR Recommending States,UPR Position,Regions Concerned,Recommending Regions
Recommendation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,117.22 Promptly ratify the Optional Protocol t...,- Uganda,A/HRC/34/10,- Labour rights and right to work\n- Ratificat...,- Guatemala,- Noted,- Africa,- Latin America and the Caribbean


In [89]:
df2['Themes'] = df2['Themes'].str.split('\n')
df2 = df2.explode('Themes')
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1481 entries, 1 to 625
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Text                     1481 non-null   object
 1   Countries Concerned      1481 non-null   object
 2   Document Symbol          1481 non-null   object
 3   Themes                   1481 non-null   object
 4   UPR Recommending States  1481 non-null   object
 5   UPR Position             1481 non-null   object
 6   Regions Concerned        1481 non-null   object
 7   Recommending Regions     1481 non-null   object
dtypes: object(8)
memory usage: 104.1+ KB


In [90]:
df3 = df2.applymap(lambda x: remove_leading_char(str(x)))
df3.head(1)

Unnamed: 0_level_0,Text,Countries Concerned,Document Symbol,Themes,UPR Recommending States,UPR Position,Regions Concerned,Recommending Regions
Recommendation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Promptly ratify the Optional Protocol to the C...,Uganda,A/HRC/34/10,Labour rights and right to work,Guatemala,Noted,Africa,Latin America and the Caribbean


In [91]:
df4 = df3
df4['Recommending Regions'] = df4['Recommending Regions'].str.split('\n')
df4['UPR Recommending States'] = df4['UPR Recommending States'].str.split('\n')
df4 = df4.explode(['UPR Recommending States', 'Recommending Regions'])
df4 = df4.applymap(lambda x: remove_leading_char(str(x)))

In [92]:
Recommendations_df = df[['Text', 'UPR Position']].applymap(lambda x: remove_leading_char(str(x)))
Recommendations_df= Recommendations_df.reset_index(level=0)

In [93]:
Country_to_recommendations = df4[['Countries Concerned','UPR Recommending States']]
Country_to_recommendations.index = Country_to_recommendations.index.set_names(['Recommendation_id'])
Country_to_recommendations = Country_to_recommendations.reset_index(level=0).drop_duplicates(subset='Recommendation_id', keep='first')

In [94]:
c1 = df4[['Countries Concerned','Regions Concerned']].rename(columns={"Countries Concerned":"Country name", "Regions Concerned": "Regions"})
c2 = df4[['UPR Recommending States', 'Recommending Regions']].rename(columns={"UPR Recommending States":"Country name", "Recommending Regions": "Regions"})
Countries_df = pd.concat([c1,c2], ignore_index=True).drop_duplicates(subset=['Country name'], keep='first')
Countries_df = Countries_df.reset_index(drop='True')
Countries_df.index = Countries_df.index.set_names(['Country id'])
Countries_df.index+=1
Countries_df = Countries_df.reset_index()

In [95]:
Country_to_recommendations = Country_to_recommendations.merge(Countries_df[["Country name","Country id"]], left_on='Countries Concerned', right_on='Country name')
Country_to_recommendations = Country_to_recommendations.drop(columns=['Countries Concerned','Country name']).rename(columns={"Country id": "Countries Concerned"})
Country_to_recommendations = Country_to_recommendations.merge(Countries_df[["Country name","Country id"]], left_on='UPR Recommending States', right_on='Country name')
Country_to_recommendations = Country_to_recommendations.drop(columns=['UPR Recommending States','Country name']).rename(columns={"Country id": "UPR Recommending States"})

In [96]:
Regions_df = pd.DataFrame(Countries_df['Regions'].unique(), columns=['Regions'])
Regions_df.index = Regions_df.index.set_names(['Region id'])
Regions_df.index+=1
Regions_df = Regions_df.reset_index(level=0)

In [97]:
Countries_df = Countries_df.merge(Regions_df, left_on="Regions", right_on="Regions")
Countries_df.drop(columns=["Regions"])

Unnamed: 0,Country id,Country name,Region id
0,1,Uganda,1
1,6,United Republic of Tanzania,1
2,7,Eswatini,1
3,15,Namibia,1
4,16,Sierra Leone,1
...,...,...,...
144,132,Saudi Arabia,5
145,133,Viet Nam,5
146,135,State of Palestine*,5
147,148,Oman,5


In [98]:
Recommendation_to_theme = df4[['Themes']]
Recommendation_to_theme.index = Recommendation_to_theme.index.set_names(['Recommendation id'])
Recommendation_to_theme = Recommendation_to_theme.reset_index(level=0)

In [99]:
Themes_df = pd.DataFrame(df4['Themes'].unique(), columns=['Themes'])
Themes_df.index = Themes_df.index.set_names(['Theme id'])
Themes_df = Themes_df.reset_index(level=0)
Themes_df.index+=1

In [100]:
Recommendation_to_theme = Recommendation_to_theme.merge(Themes_df, left_on='Themes', right_on='Themes')
Recommendation_to_theme = Recommendation_to_theme.drop(columns=['Themes']).rename(columns={'Theme id':'Themes'})

### Normalized Dataframes

In [101]:
Recommendations_df

Unnamed: 0,Recommendation_id,Text,UPR Position
0,1,Promptly ratify the Optional Protocol to the C...,Noted
1,2,Ensure full and equal access to modern contrac...,Supported
2,3,Protect the rights of indigenous peoples throu...,Supported
3,4,Continue enhancing the school infrastructure f...,Supported
4,5,Continue implementing mechanisms for the prior...,Supported
...,...,...,...
620,621,Adopt legislation to guarantee the fulfilment ...,Supported
621,622,Establish clear consultation procedures in ord...,Noted
622,623,Consider ratifying ILO Convention N° 169 ( Nor...,Supported
623,624,Extend an invitation to the Working Group on e...,Supported


In [102]:
Country_to_recommendations

Unnamed: 0,Recommendation_id,Countries Concerned,UPR Recommending States
0,1,1,70
1,29,11,70
2,154,28,70
3,164,30,70
4,206,33,70
...,...,...,...
620,523,65,147
621,533,67,148
622,590,73,148
623,548,70,68


In [103]:
Countries_df

Unnamed: 0,Country id,Country name,Regions,Region id
0,1,Uganda,Africa,1
1,6,United Republic of Tanzania,Africa,1
2,7,Eswatini,Africa,1
3,15,Namibia,Africa,1
4,16,Sierra Leone,Africa,1
...,...,...,...,...
144,132,Saudi Arabia,Asia-Pacific,5
145,133,Viet Nam,Asia-Pacific,5
146,135,State of Palestine*,Asia-Pacific,5
147,148,Oman,Asia-Pacific,5


In [104]:
Regions_df

Unnamed: 0,Region id,Regions
0,1,Africa
1,2,Latin America and the Caribbean
2,3,Eastern Europe
3,4,Western Europe & Others
4,5,Asia-Pacific
5,6,Missing data


In [105]:
Recommendation_to_theme

Unnamed: 0,Recommendation id,Themes
0,1,0
1,9,0
2,11,0
3,16,0
4,35,0
...,...,...
1534,525,64
1535,543,65
1536,544,65
1537,589,66


In [106]:
Themes_df

Unnamed: 0,Theme id,Themes
1,0,Labour rights and right to work
2,1,Ratification of & accession to international i...
3,2,Prohibition of torture & ill-treatment (includ...
4,3,Sexual & reproductive health and rights
5,4,Right to participate in public affairs & right...
...,...,...
64,63,Good governance & corruption
65,64,Private life & privacy
66,65,Right to be recognized as a person before the law
67,66,Trade union rights
