In [103]:
# !pip3 install openpyxl
import pandas as pd
import re


In [104]:
df = pd.read_excel(r'UPR_cycle2.xlsx')

In [105]:
df.index+=1
df.rename(columns={"Reccomending Body": "Recommending Body", "UPR Reccomending States": "UPR Recommending States"}, inplace=True)
df.head(2)

Unnamed: 0,Text,Countries Concerned,Recommending Body,Document Symbol,Themes,Affected Persons,Sdgs,Document Publication Date,UPR Recommending States,UPR Position,Type,OHCHR Annotation Id,UPR Session,Regions Concerned,Recommending Regions,Date of publication on UHRI
1,117.22 Promptly ratify the Optional Protocol t...,- Uganda,- UPR,A/HRC/34/10,- Labour rights and right to work\n- Ratificat...,- Indigenous peoples,,2016-12-27,- Guatemala,- Noted,- Recommendations,4100eed4-fcf7-42d6-941a-a89493e77e6c,- 26th Session - Cycle 2,- Africa,- Latin America and the Caribbean,2018-06-20 13:50:21.503
2,133.247 Ensure full and equal access to modern...,- Venezuela (Bolivarian Republic of),- UPR,A/HRC/34/6,- Sexual & reproductive health and rights,- Indigenous peoples\n- Persons living in rura...,,2016-12-27,- Denmark,- Supported,- Recommendations,f910ce5d-0d1c-45de-b64b-3050df7679dc,- 26th Session - Cycle 2,- Latin America and the Caribbean,- Western Europe & Others,2018-05-04 12:18:58.713


# Data Cleaning

##### - Check unique number of text & annotation id to establish primary key
##### - Both have 625 unique entries which is the same as the total database entries
##### - Normalise by removing one of these, two identifiers is uneccesary we are not gaining anymore information

In [106]:
text_count = df.Text.unique().size
print(text_count)

625


In [107]:
annotation_id_count = df["OHCHR Annotation Id"].unique().size
print(annotation_id_count)

625


##### Columns to be dropped
##### Annotation ID, Recommending body, Text, Affected persons, Sdgs, Document publication date, session, date
##### check all entries are recommendation by seeing if text starts with number, then drop this column. 

In [108]:
drop_columns = ["OHCHR Annotation Id","Recommending Body", "Document Publication Date", "Sdgs", "UPR Session", "Date of publication on UHRI", "Affected Persons"]

In [109]:
df.drop(drop_columns, inplace=True, axis=1)

In [110]:
df.head(2)

Unnamed: 0,Text,Countries Concerned,Document Symbol,Themes,UPR Recommending States,UPR Position,Type,Regions Concerned,Recommending Regions
1,117.22 Promptly ratify the Optional Protocol t...,- Uganda,A/HRC/34/10,- Labour rights and right to work\n- Ratificat...,- Guatemala,- Noted,- Recommendations,- Africa,- Latin America and the Caribbean
2,133.247 Ensure full and equal access to modern...,- Venezuela (Bolivarian Republic of),A/HRC/34/6,- Sexual & reproductive health and rights,- Denmark,- Supported,- Recommendations,- Latin America and the Caribbean,- Western Europe & Others


In [111]:
def has_numbers(inputString):
    return bool(re.search(r'\d', inputString))

In [112]:
df = df[df['Text'].apply(lambda x: has_numbers(x))]
df.drop(['Type'], inplace=True, axis=1)

In [113]:
df.index = df.index.set_names(['Recommendation_id'])
df.head(2)

Unnamed: 0_level_0,Text,Countries Concerned,Document Symbol,Themes,UPR Recommending States,UPR Position,Regions Concerned,Recommending Regions
Recommendation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,117.22 Promptly ratify the Optional Protocol t...,- Uganda,A/HRC/34/10,- Labour rights and right to work\n- Ratificat...,- Guatemala,- Noted,- Africa,- Latin America and the Caribbean
2,133.247 Ensure full and equal access to modern...,- Venezuela (Bolivarian Republic of),A/HRC/34/6,- Sexual & reproductive health and rights,- Denmark,- Supported,- Latin America and the Caribbean,- Western Europe & Others


#### Standardize the naming convention, by removing hyphens and spaces, of all data entries.

In [114]:
def remove_leading_char(data_entry):
    """"Clean the naming convention for data entries of all columns"""
    clean_aff = re.sub(r'^[^A-Z]*', '', data_entry)
    if clean_aff:
        return clean_aff

In [115]:
def add_empty_cell_string(df):
    """Replacing all empty cells with default string 'Missing data'"""
    df.fillna('Missing data', inplace=True)
    return df 

In [116]:
df2 = add_empty_cell_string(df)
df2.head()

Unnamed: 0_level_0,Text,Countries Concerned,Document Symbol,Themes,UPR Recommending States,UPR Position,Regions Concerned,Recommending Regions
Recommendation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,117.22 Promptly ratify the Optional Protocol t...,- Uganda,A/HRC/34/10,- Labour rights and right to work\n- Ratificat...,- Guatemala,- Noted,- Africa,- Latin America and the Caribbean
2,133.247 Ensure full and equal access to modern...,- Venezuela (Bolivarian Republic of),A/HRC/34/6,- Sexual & reproductive health and rights,- Denmark,- Supported,- Latin America and the Caribbean,- Western Europe & Others
3,133.267 Protect the rights of indigenous peopl...,- Venezuela (Bolivarian Republic of),A/HRC/34/6,- Right to participate in public affairs & rig...,- Peru,- Supported,- Latin America and the Caribbean,- Latin America and the Caribbean
4,133.268 Continue enhancing the school infrastr...,- Venezuela (Bolivarian Republic of),A/HRC/34/6,- Right to education,- South Sudan,- Supported,- Latin America and the Caribbean,- Africa
5,133.269 Continue implementing mechanisms for t...,- Venezuela (Bolivarian Republic of),A/HRC/34/6,- Constitutional & legislative framework,- Cuba,- Supported,- Latin America and the Caribbean,- Latin America and the Caribbean


In [117]:
df2['Themes'] = df2['Themes'].str.split('\n')
df2 = df2.explode('Themes')
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1481 entries, 1 to 625
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Text                     1481 non-null   object
 1   Countries Concerned      1481 non-null   object
 2   Document Symbol          1481 non-null   object
 3   Themes                   1481 non-null   object
 4   UPR Recommending States  1481 non-null   object
 5   UPR Position             1481 non-null   object
 6   Regions Concerned        1481 non-null   object
 7   Recommending Regions     1481 non-null   object
dtypes: object(8)
memory usage: 104.1+ KB


In [118]:
df3 = df2.applymap(lambda x: remove_leading_char(str(x)))
df3.head()

Unnamed: 0_level_0,Text,Countries Concerned,Document Symbol,Themes,UPR Recommending States,UPR Position,Regions Concerned,Recommending Regions
Recommendation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Promptly ratify the Optional Protocol to the C...,Uganda,A/HRC/34/10,Labour rights and right to work,Guatemala,Noted,Africa,Latin America and the Caribbean
1,Promptly ratify the Optional Protocol to the C...,Uganda,A/HRC/34/10,Ratification of & accession to international i...,Guatemala,Noted,Africa,Latin America and the Caribbean
1,Promptly ratify the Optional Protocol to the C...,Uganda,A/HRC/34/10,Prohibition of torture & ill-treatment (includ...,Guatemala,Noted,Africa,Latin America and the Caribbean
2,Ensure full and equal access to modern contrac...,Venezuela (Bolivarian Republic of),A/HRC/34/6,Sexual & reproductive health and rights,Denmark,Supported,Latin America and the Caribbean,Western Europe & Others
3,Protect the rights of indigenous peoples throu...,Venezuela (Bolivarian Republic of),A/HRC/34/6,Right to participate in public affairs & right...,Peru,Supported,Latin America and the Caribbean,Latin America and the Caribbean


In [119]:
df4 = df3
df4['Recommending Regions'] = df4['Recommending Regions'].str.split('\n')
df4 = df4.explode('Recommending Regions')
df4 = df4.applymap(lambda x: remove_leading_char(str(x)))
df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1539 entries, 1 to 625
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Text                     1539 non-null   object
 1   Countries Concerned      1539 non-null   object
 2   Document Symbol          1539 non-null   object
 3   Themes                   1539 non-null   object
 4   UPR Recommending States  1539 non-null   object
 5   UPR Position             1539 non-null   object
 6   Regions Concerned        1539 non-null   object
 7   Recommending Regions     1539 non-null   object
dtypes: object(8)
memory usage: 108.2+ KB


In [120]:
Recommendations_df = df[['Text', 'UPR Position']].applymap(lambda x: remove_leading_char(str(x)))
Recommendations_df= Recommendations_df.reset_index(level=0)

In [121]:
Country_to_recommendations = df4[['Countries Concerned','UPR Recommending States']]
Country_to_recommendations.index = Country_to_recommendations.index.set_names(['Recommendation_id'])
Country_to_recommendations = Country_to_recommendations.reset_index(level=0).drop_duplicates(subset='Recommendation_id', keep='first')
Country_to_recommendations

Unnamed: 0,Recommendation_id,Countries Concerned,UPR Recommending States
0,1,Uganda,Guatemala
3,2,Venezuela (Bolivarian Republic of),Denmark
4,3,Venezuela (Bolivarian Republic of),Peru
5,4,Venezuela (Bolivarian Republic of),South Sudan
6,5,Venezuela (Bolivarian Republic of),Cuba
...,...,...,...
1526,621,Ecuador,Paraguay
1530,622,Ecuador,Germany
1532,623,Indonesia,Norway
1533,624,Indonesia,Mexico


In [122]:
Countries_df = pd.DataFrame(df4['Countries Concerned'].unique(), columns=['Country_name'])
Countries_df.index = Countries_df.index.set_names(['Country_id'])
Countries_df = Countries_df.reset_index(level=0)

In [123]:
# for index, row in Country_to_recommendations.iterrows():
#     if row['Countries Concerned'] in list(Countries_df['Country_name']):
#         Country_to_recommendations.at[index,'Countries Concerned'] = Countries_df['Country_id'].where(Countries_df['Country_name'] == row['Countries Concerned'])

### fix error here - when second merge, rows are lost!?

In [124]:
Country_to_recommendations = Country_to_recommendations.merge(Countries_df, left_on='Countries Concerned', right_on='Country_name')
Country_to_recommendations = Country_to_recommendations.drop(columns=['Countries Concerned','Country_name']).rename(columns={"Country_id": "Countries Concerned"})
Country_to_recommendations = Country_to_recommendations.merge(Countries_df, left_on='UPR Recommending States', right_on='Country_name')
Country_to_recommendations = Country_to_recommendations.drop(columns=['UPR Recommending States','Country_name']).rename(columns={"Country_id": "UPR Recommending States"})
Country_to_recommendations 

Unnamed: 0,Recommendation_id,Countries Concerned,UPR Recommending States
0,1,0,69
1,29,10,69
2,154,27,69
3,164,29,69
4,206,32,69
...,...,...,...
330,484,64,79
331,513,64,79
332,487,64,65
333,524,64,65


In [125]:
Regions_df = pd.DataFrame(df4['Regions Concerned'].unique(), columns=['Regions'])
Regions_df.index = Regions_df.index.set_names(['Region_id'])
Regions_df = Regions_df.reset_index(level=0)

In [126]:
Recommendation_to_theme = df4[['Themes']]
Recommendation_to_theme.index = Recommendation_to_theme.index.set_names(['Recommendation_id'])
Recommendation_to_theme = Recommendation_to_theme.reset_index(level=0)

In [127]:
Themes_df = pd.DataFrame(df4['Themes'].unique(), columns=['Themes'])
Themes_df.index = Themes_df.index.set_names(['Theme_id'])
Themes_df = Themes_df.reset_index(level=0)

In [128]:
Recommendation_to_theme = Recommendation_to_theme.merge(Themes_df, left_on='Themes', right_on='Themes')
Recommendation_to_theme = Recommendation_to_theme.drop(columns=['Themes']).rename(columns={'Theme_id':'Themes'})

In [129]:
Recommendations_df

Unnamed: 0,Recommendation_id,Text,UPR Position
0,1,Promptly ratify the Optional Protocol to the C...,Noted
1,2,Ensure full and equal access to modern contrac...,Supported
2,3,Protect the rights of indigenous peoples throu...,Supported
3,4,Continue enhancing the school infrastructure f...,Supported
4,5,Continue implementing mechanisms for the prior...,Supported
...,...,...,...
620,621,Adopt legislation to guarantee the fulfilment ...,Supported
621,622,Establish clear consultation procedures in ord...,Noted
622,623,Consider ratifying ILO Convention N° 169 ( Nor...,Supported
623,624,Extend an invitation to the Working Group on e...,Supported


In [130]:
Country_to_recommendations

Unnamed: 0,Recommendation_id,Countries Concerned,UPR Recommending States
0,1,0,69
1,29,10,69
2,154,27,69
3,164,29,69
4,206,32,69
...,...,...,...
330,484,64,79
331,513,64,79
332,487,64,65
333,524,64,65


### Add region id to table below

In [131]:
Countries_df

Unnamed: 0,Country_id,Country_name
0,0,Uganda
1,1,Venezuela (Bolivarian Republic of)
2,2,Hungary
3,3,Ireland
4,4,Thailand
...,...,...
75,75,Brazil
76,76,Tunisia
77,77,Finland
78,78,Ecuador


In [132]:
Regions_df

Unnamed: 0,Region_id,Regions
0,0,Africa
1,1,Latin America and the Caribbean
2,2,Eastern Europe
3,3,Western Europe & Others
4,4,Asia-Pacific


In [133]:
Recommendation_to_theme

Unnamed: 0,Recommendation_id,Themes
0,1,0
1,9,0
2,11,0
3,16,0
4,35,0
...,...,...
1534,525,64
1535,543,65
1536,544,65
1537,589,66


In [134]:
Themes_df

Unnamed: 0,Theme_id,Themes
0,0,Labour rights and right to work
1,1,Ratification of & accession to international i...
2,2,Prohibition of torture & ill-treatment (includ...
3,3,Sexual & reproductive health and rights
4,4,Right to participate in public affairs & right...
...,...,...
63,63,Good governance & corruption
64,64,Private life & privacy
65,65,Right to be recognized as a person before the law
66,66,Trade union rights
