In [1]:
import numpy as np
import pandas as pd
import re

# GET RAW DATA

In [2]:
# Raw scraped listing: one text line per row (`all_result`) plus the id of
# the source file it came from (`n_df`).
RAW_RESULTS_CSV = 'results_by_links.csv'
raw_data = pd.read_csv(RAW_RESULTS_CSV)

raw_data

Unnamed: 0,all_result,n_df
0,CENTRO DE EST...,0
1,DIREC.DE SIST.y COMUNICACIONES ...,0
2,* UNIDAD P.A.D.* ...,0
3,RESULTADOS GENE...,0
4,=============================================...,0
...,...,...
139354,0010 354870 PEREZ JOAQUIN MARCOS ...,208
139355,0011 354470 AMADOR AVILA LUIS MARIO ...,208
139356,0012 350570 CASTILLO LIÑAN CLEVER FELIPE ...,208
139357,********************************************...,208


## DELETE AND REPLACE WHITE SPACES

In [3]:
# Collapse every run of whitespace into a single space, then trim both ends.
many_white_spaces = re.compile(r'\s+')
one_white_space = ' '

collapsed_text = raw_data['all_result'].str.replace(
    many_white_spaces, one_white_space, regex=True
)
raw_data['all_result'] = collapsed_text.str.strip()

raw_data

Unnamed: 0,all_result,n_df
0,CENTRO DE ESTUDIOS PREUNIVERSITARIOS DE LA UNT...,0
1,DIREC.DE SIST.y COMUNICACIONES EXAMENES SUMATI...,0
2,* UNIDAD P.A.D.* << TRUJILLO >> 12/08/2018,0
3,RESULTADOS GENERALES - ORDEN DE MERITO POR ESC...,0
4,==============================================...,0
...,...,...
139354,0010 354870 PEREZ JOAQUIN MARCOS 21.191 15.258...,208
139355,0011 354470 AMADOR AVILA LUIS MARIO 25.687 5.4...,208
139356,0012 350570 CASTILLO LIÑAN CLEVER FELIPE 19.56...,208
139357,**********************************************...,208


# ONLY RESULTS

In [4]:
# Result rows are the lines whose text starts with four digits; every other
# line is treated as a title/header line further down.
fourt_digits = re.compile(r'^\d{4}')

filter_by_character = raw_data['all_result'].str.contains(fourt_digits, regex=True)
raw_results = raw_data.loc[filter_by_character]

raw_results

Unnamed: 0,all_result,n_df
8,0001 171558 RODRIGUEZ SANCHEZ ROSICELA ELIZABE...,0
9,0002 061758 LLANOS SOLIS KIMBERLYN YEI 37.504 ...,0
10,0003 178758 ALVA PEREZ TAMARA ANTONELLA 48.918...,0
11,0004 028758 URIOL FLORES LIZ ROCIO 39.342 57.7...,0
12,0005 170858 VASQUEZ GUERRA CESAR IVAN 35.462 5...,0
...,...,...
139352,0008 353970 BRAVO LEZAMA DARIKS ALEXANDER 18.7...,208
139353,0009 353770 OTINIANO SANCHEZ HELEN NEFTALI 19....,208
139354,0010 354870 PEREZ JOAQUIN MARCOS 21.191 15.258...,208
139355,0011 354470 AMADOR AVILA LUIS MARIO 25.687 5.4...,208


# ONLY TITLES

In [5]:
# Everything that is not a result row (complement of the four-digit filter).
is_title_row = ~filter_by_character
raw_titles = raw_data.loc[is_title_row]

raw_titles

Unnamed: 0,all_result,n_df
0,CENTRO DE ESTUDIOS PREUNIVERSITARIOS DE LA UNT...,0
1,DIREC.DE SIST.y COMUNICACIONES EXAMENES SUMATI...,0
2,* UNIDAD P.A.D.* << TRUJILLO >> 12/08/2018,0
3,RESULTADOS GENERALES - ORDEN DE MERITO POR ESC...,0
4,==============================================...,0
...,...,...
139325,OR- I II PUNTAJE,208
139326,DEN CARNET APELIIDOS y NOMBRES SUMATIVO SUMATI...,208
139327,==============================================...,208
139357,**********************************************...,208


# TRANSFORM TITLES

In [6]:
# Remove the "Pag. <number>" marker from the title lines and trim the
# trailing whitespace it leaves behind.
pag_number = re.compile(r'Pag\.\s\d+')
empty_string = ''

# `raw_titles` comes from a boolean-mask selection of `raw_data`; writing into
# it with `.loc[:, col] = ...` raises SettingWithCopyWarning. `assign` builds a
# new frame instead, which also keeps this cell safe to re-run.
raw_titles = raw_titles.assign(
    all_result=(
        raw_titles['all_result']
        .str.replace(pag_number, empty_string, regex=True)
        .str.rstrip()
    )
)

raw_titles

Unnamed: 0,all_result,n_df
0,CENTRO DE ESTUDIOS PREUNIVERSITARIOS DE LA UNT...,0
1,DIREC.DE SIST.y COMUNICACIONES EXAMENES SUMATI...,0
2,* UNIDAD P.A.D.* << TRUJILLO >> 12/08/2018,0
3,RESULTADOS GENERALES - ORDEN DE MERITO POR ESC...,0
4,==============================================...,0
...,...,...
139325,OR- I II PUNTAJE,208
139326,DEN CARNET APELIIDOS y NOMBRES SUMATIVO SUMATI...,208
139327,==============================================...,208
139357,**********************************************...,208


In [7]:
titles_sep_sign = "="

def filter_group(dataframe_group, sep_sign=None):
    """Return the rows of a group that come before its first separator row.

    A separator row is a row whose ``all_result`` text starts with
    ``sep_sign`` (a line of ``=`` characters in the source listings).

    Parameters
    ----------
    dataframe_group : pandas.DataFrame
        Frame with an ``all_result`` text column.
    sep_sign : str, optional
        Prefix that identifies a separator row. Defaults to the module-level
        ``titles_sep_sign``, looked up at call time.

    Returns
    -------
    pandas.DataFrame
        The rows positioned before the first separator row. The separator
        itself is excluded. When the group has no separator row, the group is
        returned unchanged instead of raising ``IndexError``.
    """
    if sep_sign is None:
        sep_sign = titles_sep_sign

    # na=False: a missing text value is never a separator, and the mask stays
    # strictly boolean.
    is_separator = (
        dataframe_group['all_result']
        .str.startswith(sep_sign, na=False)
        .to_numpy(dtype=bool)
    )

    if not is_separator.any():
        return dataframe_group

    # Slice by position rather than by `label - 1`, so the result does not
    # depend on the index being made of sorted integers.
    first_separator_position = int(is_separator.argmax())

    # Use `first_separator_position + 1` to keep the "=" row in the result.
    return dataframe_group.iloc[:first_separator_position]

# Drop title lines repeated within the same source file, then group the
# remaining lines per source file.
deduplicated_titles = raw_titles.drop_duplicates(subset=['all_result', 'n_df'])
titles_group_by_n_df = deduplicated_titles.groupby('n_df')

In [8]:
# Keep, for every source file, only the title lines above the first "=" row.
filtered_titles = [filter_group(group) for _, group in titles_group_by_n_df]
titles_before_equal_sign = pd.concat(filtered_titles)

titles_before_equal_sign.head(20)

Unnamed: 0,all_result,n_df
0,CENTRO DE ESTUDIOS PREUNIVERSITARIOS DE LA UNT...,0
1,DIREC.DE SIST.y COMUNICACIONES EXAMENES SUMATI...,0
2,* UNIDAD P.A.D.* << TRUJILLO >> 12/08/2018,0
3,RESULTADOS GENERALES - ORDEN DE MERITO POR ESC...,0
3370,CENTRO DE ESTUDIOS PREUNIVERSITARIOS DE LA UNT...,1
3371,DIREC.DE SIST.y COMUNICACIONES EXAMENES SUMATI...,1
3372,* UNIDAD P.A.D.* << JEQUETEPEQUE >> 12/08/2018,1
3373,RESULTADOS GENERALES - ORDEN DE MERITO POR ESC...,1
3678,CENTRO DE ESTUDIOS PREUNIVERSITARIOS DE LA UNT...,2
3679,DIREC.DE SIST.y COMUNICACIONES EXAMENES SUMATI...,2


In [9]:
# Number of title lines kept for each source file.
titles_by_source = titles_before_equal_sign.groupby("n_df")
n_titles_per_df = titles_by_source.count()

n_titles_per_df

Unnamed: 0_level_0,all_result
n_df,Unnamed: 1_level_1
0,4
1,4
2,4
3,4
4,4
...,...
204,3
205,3
206,3
207,3


In [10]:
# Attach the per-file title count to every title row. Both frames carry an
# `all_result` column, so the merge yields `all_result_x` (text) and
# `all_result_y` (count).
titles_before_equal_sign = titles_before_equal_sign.merge(
    n_titles_per_df,
    left_on='n_df',
    right_on='n_df',
)

titles_before_equal_sign

Unnamed: 0,all_result_x,n_df,all_result_y
0,CENTRO DE ESTUDIOS PREUNIVERSITARIOS DE LA UNT...,0,4
1,DIREC.DE SIST.y COMUNICACIONES EXAMENES SUMATI...,0,4
2,* UNIDAD P.A.D.* << TRUJILLO >> 12/08/2018,0,4
3,RESULTADOS GENERALES - ORDEN DE MERITO POR ESC...,0,4
4,CENTRO DE ESTUDIOS PREUNIVERSITARIOS DE LA UNT...,1,4
...,...,...,...
725,21/07/2024 EXAMENES SUMATIVOS 2025- I - CEPUNT I,207,3
726,RESULTADOS GENERALES POR ESCUELA PROFESIONAL *...,207,3
727,OFICINA DE TECNOLOGIAS DE LA INFORMACION DE LA...,208,3
728,21/07/2024 EXAMENES SUMATIVOS 2025- I - CEPUNT I,208,3


In [11]:
def add_serie(df):
    """Return a copy of ``df`` with an ``aux`` column numbering its rows 0..n-1.

    The sequence length is taken from the number of rows in ``df``, so it
    always fits the frame. The input frame is left untouched: the previous
    in-place ``df['aux'] = ...`` mutated the group slice handed in by the
    caller and triggered SettingWithCopyWarning.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to one group (one ``n_df`` value in this notebook).

    Returns
    -------
    pandas.DataFrame
        New frame with the extra integer column ``aux``.
    """
    return df.assign(aux=range(len(df)))

In [12]:
# Number the title lines inside each source file (0, 1, 2, ...).
titles_with_sequence = [
    add_serie(group)
    for _, group in titles_before_equal_sign.groupby("n_df")
]
titles_before_equal_sign = pd.concat(titles_with_sequence)

titles_before_equal_sign

Unnamed: 0,all_result_x,n_df,all_result_y,aux
0,CENTRO DE ESTUDIOS PREUNIVERSITARIOS DE LA UNT...,0,4,0
1,DIREC.DE SIST.y COMUNICACIONES EXAMENES SUMATI...,0,4,1
2,* UNIDAD P.A.D.* << TRUJILLO >> 12/08/2018,0,4,2
3,RESULTADOS GENERALES - ORDEN DE MERITO POR ESC...,0,4,3
4,CENTRO DE ESTUDIOS PREUNIVERSITARIOS DE LA UNT...,1,4,0
...,...,...,...,...
725,21/07/2024 EXAMENES SUMATIVOS 2025- I - CEPUNT I,207,3,1
726,RESULTADOS GENERALES POR ESCUELA PROFESIONAL *...,207,3,2
727,OFICINA DE TECNOLOGIAS DE LA INFORMACION DE LA...,208,3,0
728,21/07/2024 EXAMENES SUMATIVOS 2025- I - CEPUNT I,208,3,1


In [13]:
# One row per source file, one column per title position (`aux` = 0..3).
title_text_by_position = (
    titles_before_equal_sign
    .set_index(['n_df', 'aux'])
    ['all_result_x']
)
unstacked_titles = title_text_by_position.unstack('aux')

unstacked_titles.columns = ['first', 'second', 'third', 'fourth']

unstacked_titles

Unnamed: 0_level_0,first,second,third,fourth
n_df,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,CENTRO DE ESTUDIOS PREUNIVERSITARIOS DE LA UNT...,DIREC.DE SIST.y COMUNICACIONES EXAMENES SUMATI...,* UNIDAD P.A.D.* << TRUJILLO >> 12/08/2018,RESULTADOS GENERALES - ORDEN DE MERITO POR ESC...
1,CENTRO DE ESTUDIOS PREUNIVERSITARIOS DE LA UNT...,DIREC.DE SIST.y COMUNICACIONES EXAMENES SUMATI...,* UNIDAD P.A.D.* << JEQUETEPEQUE >> 12/08/2018,RESULTADOS GENERALES - ORDEN DE MERITO POR ESC...
2,CENTRO DE ESTUDIOS PREUNIVERSITARIOS DE LA UNT...,DIREC.DE SIST.y COMUNICACIONES EXAMENES SUMATI...,* UNIDAD P.A.D.* << HUAMACHUCO >> 12/08/2018,RESULTADOS GENERALES - ORDEN DE MERITO POR ESC...
3,UNIVERSIDAD NACIONAL DE TRUJILLO - UNT,DIREC.DE SIST.y COMUNIC. EXAMEN DE ADMISION OR...,* UNIDAD P.A.D.* GRUPO : A 22/09/2018,RESULTADOS GENERALES POR ESCUELA PROFESIONAL
4,UNIVERSIDAD NACIONAL DE TRUJILLO - UNT,DIREC.DE SIST.y COMUNIC. EXAMEN DE ADMISION OR...,* UNIDAD P.A.D.* GRUPO : B 23/09/2018,RESULTADOS GENERALES POR ESCUELA PROFESIONAL
...,...,...,...,...
204,OFICINA DE TECNOLOGIAS DE LA INFORMACION DE LA...,17/03/2024 EXAMEN DE ADMISION ORDINARIO 2024-II,RESULTADOS GENERALES POR ESCUELA PROFESIONAL *...,
205,OFICINA DE TECNOLOGIAS DE LA INFORMACION DE LA...,17/03/2024 EXAMEN DE ADMISION ORDINARIO 2024-II,RESULTADOS GENERALES POR ESCUELA PROFESIONAL *...,
206,OFICINA DE TECNOLOGIAS DE LA INFORMACION DE LA...,21/07/2024 EXAMENES SUMATIVOS 2025- I - CEPUNT I,RESULTADOS GENERALES POR ESCUELA PROFESIONAL *...,
207,OFICINA DE TECNOLOGIAS DE LA INFORMACION DE LA...,21/07/2024 EXAMENES SUMATIVOS 2025- I - CEPUNT I,RESULTADOS GENERALES POR ESCUELA PROFESIONAL *...,


## TRANSFORM SECOND COLUMN

In [14]:
# Inspect the distinct values of the 'second' title line with their
# frequencies, ordered by text descending (first 49 entries).
unstacked_titles['second'].value_counts().sort_index(ascending=False).head(49)

second
OFI.DE TECNOLOG.DE LA INFORMAC. EXAMENES SUMATIVOS 2024- I - CEPUNT I                 3
OF.DE TECNOLOG.DE LA INFOR. EXAMEN DE ADMISION ORDINARIO 2024-I                       3
OF.DE TECNOLOG.DE LA INFOR. EXAMEN DE ADMISION ORDINARIO 2024- I - TRUJILLO           3
OF.DE TECNOLOG.DE LA INFOR. EXAMEN DE ADMISION ORDINARIO 2023-II - V A L L E          3
OF.DE TECNOLOG.DE LA INFOR. EXAMEN DE ADMISION ORDINARIO 2023-II - TRUJILLO           3
OF.DE TECNOLOG.DE LA INFOR. EXAMEN DE ADMISION ORDINARIO 2023-II - STGO.DE CHUCO      1
OF.DE TECNOLOG.DE LA INFOR. EXAMEN DE ADMISION ORDINARIO 2023-II - HUAMACHUCO         3
OF.DE TECNOLOG.DE LA INFOR. EXAMEN DE ADMISION ORDINARIO 2023-I                       3
OF.DE TECNOLOG.DE LA INFOR. EXAMEN DE ADMISION ORDINARIO 2023- I - TRUJILLO           3
OF.DE TECNOLOG.DE LA INFOR. EXAMEN DE ADMISION ORDINARIO 2022-II - V A L L E          3
OF.DE TECNOLOG.DE LA INFOR. EXAMEN DE ADMISION ORDINARIO 2022-II - TRUJILLO           3
OF.DE TECNOLOG.DE LA INFO

In [15]:
# Same listing as the previous cell, showing its last 50 entries.
unstacked_titles['second'].value_counts().sort_index(ascending=False).tail(50)

second
DIREC.DE SIST.y COMUNICACIONES EXAMENES SUMATIVOS 2019- I - CEPUNT I                  3
DIREC.DE SIST.y COMUNICAC. EXAMEN DE ADMISION ORDINARIO 2021-II * STGO.DE CHUCO       1
DIREC.DE SIST.y COMUNICAC. EXAMEN DE ADMISION ORDINARIO 2020-II * STGO.DE CHUCO       1
DIREC.DE SIST.y COMUNICAC. EXAMEN DE ADMISION ORDINARIO 2019-II * STGO.DE CHUCO       1
DIREC.DE SIST.y COMUNIC. EXAMEN DE ADMISION ORDINARIO 2021-II - TRUJILLO              3
DIREC.DE SIST.y COMUNIC. EXAMEN DE ADMISION ORDINARIO 2021-II * JEQUETEPEQUE          3
DIREC.DE SIST.y COMUNIC. EXAMEN DE ADMISION ORDINARIO 2021- I - TRUJILLO              4
DIREC.DE SIST.y COMUNIC. EXAMEN DE ADMISION ORDINARIO 2021- I * TRUJILLO              4
DIREC.DE SIST.y COMUNIC. EXAMEN DE ADMISION ORDINARIO 2020-II - TRUJILLO              2
DIREC.DE SIST.y COMUNIC. EXAMEN DE ADMISION ORDINARIO 2020-II * JEQUETEPEQUE          2
DIREC.DE SIST.y COMUNIC. EXAMEN DE ADMISION ORDINARIO 2020- I - TRUJILLO              2
DIREC.DE SIST.y COMUNIC. 

### SECOND COLUMN DATES

In [16]:
# A date looks like dd/mm/yyyy: two digits, two digits, then one or more digits.
valid_date = re.compile(r'(\d\d/\d\d/\d+)')

# `extract` returns one column per capture group; column 0 holds the date
# (NaN where the title line contains none).
second_date_matches = unstacked_titles['second'].str.extract(valid_date)
second_date = second_date_matches[0]

second_date.value_counts()

0
03/03/2024    7
17/03/2024    4
18/02/2024    3
16/03/2024    3
21/07/2024    3
Name: count, dtype: int64

## TRANSFORM THIRD COLUMN

In [17]:
# Inspect the distinct values of the 'third' title line with their
# frequencies, ordered by text ascending (first 60 entries).
unstacked_titles['third'].value_counts().sort_index().head(60)

third
* AREA P.A.D.* << HUAMACHUCO >> 20/08/2023                                                           1
* AREA P.A.D.* << TRUJILLO >> 20/08/2023                                                             1
* AREA P.A.D.* << V A L L E >> 20/08/2023                                                            1
* AREA P.A.D.* AREA * A 30/09/2023                                                                   1
* AREA P.A.D.* AREA - D 23/09/2023                                                                   1
* AREA P.A.D.* AREA : A 07/04/2022                                                                   3
* AREA P.A.D.* AREA : A 21/10/2022                                                                   1
* AREA P.A.D.* AREA : A 24/03/2023                                                                   3
* AREA P.A.D.* AREA : A 28/10/2021                                                                   1
* AREA P.A.D.* AREA : B 08/04/2022                                 

In [18]:
# Same listing as the previous cell, showing its last 60 entries.
unstacked_titles['third'].value_counts().sort_index().tail(60)

third
* UNIDAD P.A.D.* << JEQUETEPEQUE >> 12/08/2018                                             1
* UNIDAD P.A.D.* << JEQUETEPEQUE >> 18/08/2019                                             1
* UNIDAD P.A.D.* << TRUJILLO >> 09/02/2020                                                 1
* UNIDAD P.A.D.* << TRUJILLO >> 10/02/2019                                                 1
* UNIDAD P.A.D.* << TRUJILLO >> 12/08/2018                                                 1
* UNIDAD P.A.D.* << TRUJILLO >> 18/08/2019                                                 1
* UNIDAD P.A.D.* AREA : A 02/06/2021                                                       3
* UNIDAD P.A.D.* AREA : A 02/12/2020                                                       1
* UNIDAD P.A.D.* AREA : B 03/12/2020                                                       1
* UNIDAD P.A.D.* AREA : B 28/05/2021                                                       4
* UNIDAD P.A.D.* AREA : C 04/12/2020                            

### THIRD COLUMN DATES

In [19]:
# Title column to scan for a date.
nombre = 'third'

# Column 0 of the extraction holds the date captured by `valid_date`
# (NaN where the title line contains none).
third_date_matches = unstacked_titles[nombre].str.extract(valid_date)
third_date = third_date_matches[0]

third_date.value_counts()

0
23/02/2020    7
24/02/2019    7
08/10/2021    6
11/02/2023    6
26/02/2022    6
             ..
21/09/2021    2
02/12/2020    2
03/12/2020    2
23/09/2023    2
21/11/2020    1
Name: count, Length: 63, dtype: int64

## JOIN COLUMN TITLES

In [20]:
# second_date
# second_users doesn't have users

# third_date

# third_tests doesn't have tests

# fourth_areas

In [21]:
# One date per source file: take the date found in the 'second' title line
# and fall back to the one found in the 'third' title line.
#
# `combine_first` replaces the previous string concatenation, which produced a
# doubled value such as "03/03/202403/03/2024" whenever both title lines
# carried a date, and which needed an empty-string -> NaN clean-up afterwards.
date_title = second_date.combine_first(third_date).rename("date")

date_title

n_df
0      12/08/2018
1      12/08/2018
2      12/08/2018
3      22/09/2018
4      23/09/2018
          ...    
204    17/03/2024
205    17/03/2024
206    21/07/2024
207    21/07/2024
208    21/07/2024
Name: date, Length: 209, dtype: object

# TRANSFORM RESULTS

## NAMES

In [22]:
# Captures the first stretch of text that is preceded by whitespace, made of
# non-digit characters with up to two optional literal '0' characters inside,
# and followed by whitespace and an (optionally negative) digit.
# NOTE(review): the '0' allowance presumably covers names where the letter O
# was typed as a zero in the source listings -- confirm against the raw data.
valid_name = re.compile(r'\s(\D+[0]?\D+[0]?)\s-?\d')

names = (
    raw_results
    ['all_result']
    # Column 0 is the single capture group of `valid_name`.
    .str.extract(valid_name)
    [0]
    .str.strip()
    # Remove the literal substring '0 0' before any zero is rewritten below.
    .str.replace('0 0', '', regex=False)
    .str.rstrip()
    # Every remaining zero character becomes the capital letter O.
    .str.replace('0', 'O', regex=False)
    .str.lower()
)

names

8         rodriguez sanchez rosicela elizabeth
9                   llanos solis kimberlyn yei
10                 alva perez tamara antonella
11                      uriol flores liz rocio
12                   vasquez guerra cesar ivan
                          ...                 
139352           bravo lezama dariks alexander
139353          otiniano sanchez helen neftali
139354                    perez joaquin marcos
139355                 amador avila luis mario
139356            castillo liñan clever felipe
Name: 0, Length: 119027, dtype: object

## GRADES

In [23]:
# A grade is an optionally negative decimal number such as 41.577 or -4.159.
valid_grades = re.compile(r'(-?\d+\.\d+)')

# `extractall` yields one row per (original row, match number); unstacking the
# match level gives one column per grade position. Values stay as text.
grade_matches = raw_results['all_result'].str.extractall(valid_grades)
grades = grade_matches[0].unstack()

grades

match,0,1,2,3,4
8,41.577,78.261,126.368,246.206,94.842
9,37.504,67.242,109.044,213.790,94.842
10,48.918,53.192,109.036,211.146,94.842
11,39.342,57.783,102.429,199.554,94.842
12,35.462,50.730,109.548,195.740,94.842
...,...,...,...,...,...
139352,18.748,24.420,43.168,,
139353,19.980,20.180,40.160,,
139354,21.191,15.258,36.449,,
139355,25.687,5.483,31.170,,


## SCHOOL AND DETAILS

In [24]:
# Text that follows a grade: the decimals of a number, a whitespace character,
# then an upper-case letter and the rest of the line.
after_grades = re.compile(r'(\.\d+\s[A-Z].*)')
# The leading ".<decimals> " fragment that the capture above still carries.
digits = re.compile(r'\.\d+\s')
empty_string = ''

text_after_grades = raw_results['all_result'].str.extract(after_grades)[0]
school_and_details = text_after_grades.str.replace(digits, empty_string, regex=True)

school_and_details

8         SI ADMINISTRACION
9         SI ADMINISTRACION
10        SI ADMINISTRACION
11        SI ADMINISTRACION
12        SI ADMINISTRACION
                ...        
139352      ING.DE MINAS NO
139353      ING.DE MINAS NO
139354      ING.DE MINAS NO
139355      ING.DE MINAS NO
139356      ING.DE MINAS NO
Name: 0, Length: 119027, dtype: object

### DETAILS

In [25]:
# Status token inside `school_and_details`: INGRESA..., NO INGRESA...,
# "ING. 2..." / "ING.2...", AUSENTE..., ANULADO..., or a standalone SI / NO at
# the very start or very end of the text.
valid_result = re.compile(r'(INGRESA.*|NO\sINGRESA.*|ING\.\s?2.*|AUSENTE.*|ANULADO.*|^SI\s|^NO\s|\sSI$|\sNO$)')
# "INGRESA" optionally followed by word characters, whitespace or hyphens.
possitive_pattern = re.compile(r'INGRESA([\w\s-]+)?')
# "NO" followed by one or more non-digit characters.
negative_pattern = re.compile(r'NO\D+')
positive_result = 'SI'
negative_result = 'NO'
ingresa = 'INGRESA'
no_ingresa = 'NO INGRESA'

# The replacement order below matters: each step works on the output of the
# previous one.
details = (
        school_and_details
        .str.extract(valid_result)
        [0]
        .str.strip()
        # 1) Collapse "NO <anything non-digit>" to the short code "NO". This
        #    runs first so the INGRESA inside "NO INGRESA" is gone before the
        #    positive pattern is applied.
        .str.replace(negative_pattern, negative_result, regex=True)
        # 2) Collapse "INGRESA ..." to the short code "SI".
        .str.replace(possitive_pattern, positive_result, regex=True)
        # 3) Expand the short codes to their final labels (literal replaces).
        .str.replace(positive_result, ingresa, regex=False)
        .str.replace(negative_result, no_ingresa, regex=False)
        # 4) Drop dots so "ING. 2-OPC" becomes "ING 2-OPC", then relabel it.
        .str.replace('.', '', regex=False)
        .str.replace('ING 2-OPC', 'SEGUNDA OPC', regex=False)
        .str.lower()
        .str.strip()
)

details

8            ingresa
9            ingresa
10           ingresa
11           ingresa
12           ingresa
             ...    
139352    no ingresa
139353    no ingresa
139354    no ingresa
139355    no ingresa
139356    no ingresa
Name: 0, Length: 119027, dtype: object

In [26]:
# Frequency of each normalised status label.
details.value_counts()

0
no ingresa     98171
ingresa        16268
ausente         3003
segunda opc     1564
anulado           20
Name: count, dtype: int64

In [27]:
# Rows where no status token could be extracted.
details[details.isnull()]

88692    NaN
Name: 0, dtype: object

### SCHOOL

In [28]:
# Same status tokens as `valid_result`, without a capture group: here they are
# removed so that only the school name remains.
valid_result2 = re.compile(r'INGRESA.*|NO\sINGRESA.*|ING\.\s?2.*|AUSENTE.*|ANULADO.*|^SI\s|^NO\s|\sSI$|\sNO$')

school_text = school_and_details.str.replace(valid_result2, empty_string, regex=True)
school_text = school_text.str.upper().str.strip()

# Dots and colons become spaces (literal replacements, in this order).
for punctuation_mark in (".", ":"):
    school_text = school_text.str.replace(punctuation_mark, " ", regex=False)

# Trim, collapse repeated whitespace and lower-case the final label.
school = (
    school_text
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
    .str.lower()
)

school

8         administracion
9         administracion
10        administracion
11        administracion
12        administracion
               ...      
139352      ing de minas
139353      ing de minas
139354      ing de minas
139355      ing de minas
139356      ing de minas
Name: 0, Length: 119027, dtype: object

In [29]:
# Frequency of each normalised school name.
school.value_counts()

0
medicina              21189
derecho y cc pp        8464
ing industrial         8105
administracion         7715
enfermeria             6430
ing civil              5918
ing de sistemas        5253
contabil y finan       5250
arquitectura y urb     4676
trabajo social         3341
ing ambiental          3038
ing de minas           2952
farmacia y bbqq        2734
ing mecatronica        2550
economia               2476
ing agroindustrial     2163
informatica            2036
ing mecanica           1970
estomatologia          1959
educacion inicial      1940
ing quimica            1749
ccas comunicacion      1738
ed sec idiomas         1435
ccas biologicas        1311
microbiol y paras      1137
cc politica y gob       943
agronomia               925
ing metalurgica         885
derecho                 869
educacion primaria      805
turismo                 758
ed sec lengua           697
ing agronomica          656
ing agricola            649
ing de materiales       585
ed sec filosofia  

## JOIN RESULTS

In [30]:
# Final column names, in the same order as the pieces concatenated below
# (`grades` contributes the five positional grade columns).
column_results = ['names', 'one', 'two', 'three', 'four', 'five', 'career', 'status', 'n_df']

# Side-by-side concatenation; rows are aligned on the original row index.
result_parts = [names, grades, school, details, raw_results['n_df']]
results = pd.concat(result_parts, axis=1)
results.columns = column_results

results

Unnamed: 0,names,one,two,three,four,five,career,status,n_df
8,rodriguez sanchez rosicela elizabeth,41.577,78.261,126.368,246.206,94.842,administracion,ingresa,0
9,llanos solis kimberlyn yei,37.504,67.242,109.044,213.790,94.842,administracion,ingresa,0
10,alva perez tamara antonella,48.918,53.192,109.036,211.146,94.842,administracion,ingresa,0
11,uriol flores liz rocio,39.342,57.783,102.429,199.554,94.842,administracion,ingresa,0
12,vasquez guerra cesar ivan,35.462,50.730,109.548,195.740,94.842,administracion,ingresa,0
...,...,...,...,...,...,...,...,...,...
139352,bravo lezama dariks alexander,18.748,24.420,43.168,,,ing de minas,no ingresa,208
139353,otiniano sanchez helen neftali,19.980,20.180,40.160,,,ing de minas,no ingresa,208
139354,perez joaquin marcos,21.191,15.258,36.449,,,ing de minas,no ingresa,208
139355,amador avila luis mario,25.687,5.483,31.170,,,ing de minas,no ingresa,208


# JOIN TITLES WITH RESULTS

In [31]:
# Attach the date of each source file: `date_title` is indexed by `n_df`.
results_with_dates = results.merge(
    date_title,
    left_on='n_df',
    right_index=True,
)

results_with_dates

Unnamed: 0,names,one,two,three,four,five,career,status,n_df,date
8,rodriguez sanchez rosicela elizabeth,41.577,78.261,126.368,246.206,94.842,administracion,ingresa,0,12/08/2018
9,llanos solis kimberlyn yei,37.504,67.242,109.044,213.790,94.842,administracion,ingresa,0,12/08/2018
10,alva perez tamara antonella,48.918,53.192,109.036,211.146,94.842,administracion,ingresa,0,12/08/2018
11,uriol flores liz rocio,39.342,57.783,102.429,199.554,94.842,administracion,ingresa,0,12/08/2018
12,vasquez guerra cesar ivan,35.462,50.730,109.548,195.740,94.842,administracion,ingresa,0,12/08/2018
...,...,...,...,...,...,...,...,...,...,...
139352,bravo lezama dariks alexander,18.748,24.420,43.168,,,ing de minas,no ingresa,208,21/07/2024
139353,otiniano sanchez helen neftali,19.980,20.180,40.160,,,ing de minas,no ingresa,208,21/07/2024
139354,perez joaquin marcos,21.191,15.258,36.449,,,ing de minas,no ingresa,208,21/07/2024
139355,amador avila luis mario,25.687,5.483,31.170,,,ing de minas,no ingresa,208,21/07/2024


# READ DESCRIPTION DATA

In [32]:
# Metadata describing each source file (test type, period, place, link, ...).
DESCRIPTION_CSV = "cleaned_wp_content.csv"
description = pd.read_csv(DESCRIPTION_CSV)
description

Unnamed: 0,test_type,year_period,period,mod,place,link
0,cepunt,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
1,cepunt,2019,i,,valle jequetepeque,http://admisionunt.info/docs/padrones/20191/20...
2,cepunt,2019,i,,huamachuco,http://admisionunt.info/docs/padrones/20191/20...
3,ordinario,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
4,ordinario,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
...,...,...,...,...,...,...
204,ordinario,2024,ii,,valle jequetepeque,https://unitru.edu.pe/webfiles///Convocatoria/...
205,ordinario,2024,ii,,santiago de chuco,https://unitru.edu.pe/webfiles///Convocatoria/...
206,cepunt,2025,i,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
207,cepunt,2025,i,,valle jequetepeque,https://unitru.edu.pe/webfiles///Convocatoria/...


In [33]:
# Frequency of each admission modality in the description table.
description['mod'].value_counts()

mod
excelencia                   23
5to secundaria               17
personas con discapacidad    11
victimas de la violencia      7
deportistas calificados       4
Name: count, dtype: int64

# JOIN RESULTS WITH DESCRIPTION

In [34]:
# Attach the description of each source file. The `n_df` value of a result is
# matched against the row index of `description`.
new_combined_data = results_with_dates.merge(
    description,
    left_on='n_df',
    right_index=True,
)

new_combined_data

Unnamed: 0,names,one,two,three,four,five,career,status,n_df,date,test_type,year_period,period,mod,place,link
8,rodriguez sanchez rosicela elizabeth,41.577,78.261,126.368,246.206,94.842,administracion,ingresa,0,12/08/2018,cepunt,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
9,llanos solis kimberlyn yei,37.504,67.242,109.044,213.790,94.842,administracion,ingresa,0,12/08/2018,cepunt,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
10,alva perez tamara antonella,48.918,53.192,109.036,211.146,94.842,administracion,ingresa,0,12/08/2018,cepunt,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
11,uriol flores liz rocio,39.342,57.783,102.429,199.554,94.842,administracion,ingresa,0,12/08/2018,cepunt,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
12,vasquez guerra cesar ivan,35.462,50.730,109.548,195.740,94.842,administracion,ingresa,0,12/08/2018,cepunt,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139352,bravo lezama dariks alexander,18.748,24.420,43.168,,,ing de minas,no ingresa,208,21/07/2024,cepunt,2025,i,,huamachuco,https://unitru.edu.pe/webfiles///Convocatoria/...
139353,otiniano sanchez helen neftali,19.980,20.180,40.160,,,ing de minas,no ingresa,208,21/07/2024,cepunt,2025,i,,huamachuco,https://unitru.edu.pe/webfiles///Convocatoria/...
139354,perez joaquin marcos,21.191,15.258,36.449,,,ing de minas,no ingresa,208,21/07/2024,cepunt,2025,i,,huamachuco,https://unitru.edu.pe/webfiles///Convocatoria/...
139355,amador avila luis mario,25.687,5.483,31.170,,,ing de minas,no ingresa,208,21/07/2024,cepunt,2025,i,,huamachuco,https://unitru.edu.pe/webfiles///Convocatoria/...


# DELETE SOME GRADES

## MAIN GRADES ARE FOUND IN R4 COLUMN

In [35]:
# Rows that belong to a CEPUNT test.
filter_by_cepunt = new_combined_data['test_type'].eq('cepunt')

In [36]:
# Rows whose year period is one of the two listed here.
grade4_year_periods = [2019, 2020]
filter_by_periodo = new_combined_data['year_period'].isin(grade4_year_periods)

In [37]:
# CEPUNT results of the 2019 and 2020 periods.
main_grade4 = new_combined_data.loc[filter_by_cepunt & filter_by_periodo]
main_grade4

Unnamed: 0,names,one,two,three,four,five,career,status,n_df,date,test_type,year_period,period,mod,place,link
8,rodriguez sanchez rosicela elizabeth,41.577,78.261,126.368,246.206,94.842,administracion,ingresa,0,12/08/2018,cepunt,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
9,llanos solis kimberlyn yei,37.504,67.242,109.044,213.790,94.842,administracion,ingresa,0,12/08/2018,cepunt,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
10,alva perez tamara antonella,48.918,53.192,109.036,211.146,94.842,administracion,ingresa,0,12/08/2018,cepunt,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
11,uriol flores liz rocio,39.342,57.783,102.429,199.554,94.842,administracion,ingresa,0,12/08/2018,cepunt,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
12,vasquez guerra cesar ivan,35.462,50.730,109.548,195.740,94.842,administracion,ingresa,0,12/08/2018,cepunt,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37055,layza sanchez elvis alex,5.900,7.619,7.111,20.630,44.190,ing de minas,no ingresa,37,09/02/2020,cepunt,2020,ii,,huamachuco,http://admisionunt.info/docs/padrones/20202/20...
37056,infantes reyes jonatan,2.238,7.939,7.085,17.262,44.190,ing de minas,no ingresa,37,09/02/2020,cepunt,2020,ii,,huamachuco,http://admisionunt.info/docs/padrones/20202/20...
37057,sanchez fernandez samuel jhonatan,1.827,12.527,0.000,14.354,44.190,ing de minas,no ingresa,37,09/02/2020,cepunt,2020,ii,,huamachuco,http://admisionunt.info/docs/padrones/20202/20...
37058,caipo polo hilario,5.900,0.000,0.000,5.900,44.190,ing de minas,no ingresa,37,09/02/2020,cepunt,2020,ii,,huamachuco,http://admisionunt.info/docs/padrones/20202/20...


In [38]:
# execute only one time
main_grade4 = main_grade4.drop(['one', 'two', 'three', 'five'], axis=1)

main_grade4 = main_grade4.rename(columns={'four': 'grade'})

main_grade4

Unnamed: 0,names,grade,career,status,n_df,date,test_type,year_period,period,mod,place,link
8,rodriguez sanchez rosicela elizabeth,246.206,administracion,ingresa,0,12/08/2018,cepunt,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
9,llanos solis kimberlyn yei,213.790,administracion,ingresa,0,12/08/2018,cepunt,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
10,alva perez tamara antonella,211.146,administracion,ingresa,0,12/08/2018,cepunt,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
11,uriol flores liz rocio,199.554,administracion,ingresa,0,12/08/2018,cepunt,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
12,vasquez guerra cesar ivan,195.740,administracion,ingresa,0,12/08/2018,cepunt,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
...,...,...,...,...,...,...,...,...,...,...,...,...
37055,layza sanchez elvis alex,20.630,ing de minas,no ingresa,37,09/02/2020,cepunt,2020,ii,,huamachuco,http://admisionunt.info/docs/padrones/20202/20...
37056,infantes reyes jonatan,17.262,ing de minas,no ingresa,37,09/02/2020,cepunt,2020,ii,,huamachuco,http://admisionunt.info/docs/padrones/20202/20...
37057,sanchez fernandez samuel jhonatan,14.354,ing de minas,no ingresa,37,09/02/2020,cepunt,2020,ii,,huamachuco,http://admisionunt.info/docs/padrones/20202/20...
37058,caipo polo hilario,5.900,ing de minas,no ingresa,37,09/02/2020,cepunt,2020,ii,,huamachuco,http://admisionunt.info/docs/padrones/20202/20...


## MAIN GRADES ARE FOUND IN R1 COLUMN

In [39]:
# Ordinary / extraordinary admission tests of the 2024-ii period.
admission_test_types = ["extraordinario", "ordinario"]
filter_by_tipo = new_combined_data['test_type'].isin(admission_test_types)
filter_by_periodo2 = new_combined_data['year_period'].eq(2024)
filter_by_number2 = new_combined_data['period'].eq("ii")


In [40]:
# Rows matching the three 2024-ii admission filters at once.
is_admission_2024_ii = filter_by_tipo & filter_by_periodo2 & filter_by_number2
main_grade1 = new_combined_data.loc[is_admission_2024_ii]

main_grade1

Unnamed: 0,names,one,two,three,four,five,career,status,n_df,date,test_type,year_period,period,mod,place,link
126006,castro garcia paulo gamaliel,147.835,,,,,administracion,ingresa,192,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
126007,ponce salvador naomi belen,126.434,,,,,administracion,ingresa,192,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
126008,pintado peña luciana nicolle,124.382,,,,,administracion,ingresa,192,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
126009,lezama diaz jennifer anghelina,124.377,,,,,administracion,ingresa,192,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
126010,abila villanueva adriana amparito,123.361,,,,,administracion,ingresa,192,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135172,castillo rojas hugo gabriel,4.044,,,,,ing ambiental,no ingresa,205,17/03/2024,ordinario,2024,ii,,santiago de chuco,https://unitru.edu.pe/webfiles///Convocatoria/...
135173,ruiz jacobo merly milagros,0.986,,,,,ing ambiental,no ingresa,205,17/03/2024,ordinario,2024,ii,,santiago de chuco,https://unitru.edu.pe/webfiles///Convocatoria/...
135174,ulloa flores manuel jhan fran,-0.100,,,,,ing ambiental,no ingresa,205,17/03/2024,ordinario,2024,ii,,santiago de chuco,https://unitru.edu.pe/webfiles///Convocatoria/...
135175,contreras gutierrez brayan mercedes,-4.159,,,,,ing ambiental,no ingresa,205,17/03/2024,ordinario,2024,ii,,santiago de chuco,https://unitru.edu.pe/webfiles///Convocatoria/...


In [41]:
# execute only one time
main_grade1 = main_grade1.drop(['two', 'three', 'four', 'five'], axis=1)
main_grade1 = main_grade1.rename(columns={'one': 'grade'})

main_grade1

Unnamed: 0,names,grade,career,status,n_df,date,test_type,year_period,period,mod,place,link
126006,castro garcia paulo gamaliel,147.835,administracion,ingresa,192,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
126007,ponce salvador naomi belen,126.434,administracion,ingresa,192,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
126008,pintado peña luciana nicolle,124.382,administracion,ingresa,192,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
126009,lezama diaz jennifer anghelina,124.377,administracion,ingresa,192,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
126010,abila villanueva adriana amparito,123.361,administracion,ingresa,192,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
...,...,...,...,...,...,...,...,...,...,...,...,...
135172,castillo rojas hugo gabriel,4.044,ing ambiental,no ingresa,205,17/03/2024,ordinario,2024,ii,,santiago de chuco,https://unitru.edu.pe/webfiles///Convocatoria/...
135173,ruiz jacobo merly milagros,0.986,ing ambiental,no ingresa,205,17/03/2024,ordinario,2024,ii,,santiago de chuco,https://unitru.edu.pe/webfiles///Convocatoria/...
135174,ulloa flores manuel jhan fran,-0.100,ing ambiental,no ingresa,205,17/03/2024,ordinario,2024,ii,,santiago de chuco,https://unitru.edu.pe/webfiles///Convocatoria/...
135175,contreras gutierrez brayan mercedes,-4.159,ing ambiental,no ingresa,205,17/03/2024,ordinario,2024,ii,,santiago de chuco,https://unitru.edu.pe/webfiles///Convocatoria/...


## MAIN GRADES ARE FOUND IN R3 COLUMN

In [42]:
# Rows matching either combination of the masks built earlier are excluded;
# main_grade3 keeps every remaining row of new_combined_data.
filter_general = (
    (filter_by_cepunt & filter_by_periodo)
    | (filter_by_tipo & filter_by_periodo2 & filter_by_number2)
)

main_grade3 = new_combined_data[~filter_general]

In [43]:
# Keep 'three' as the grade column for this subset and discard the other raw
# score columns. errors='ignore' skips columns that are already gone, so the
# cell can be re-run safely (rename also ignores a missing 'three' by default).
main_grade3 = (
    main_grade3
    .drop(columns=['one', 'two', 'four', 'five'], errors='ignore')
    .rename(columns={'three': 'grade'})
)
main_grade3

Unnamed: 0,names,grade,career,status,n_df,date,test_type,year_period,period,mod,place,link
3840,marcelo soto dilmer oliver,190.591,ccas biologicas,ingresa,3,22/09/2018,ordinario,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
3841,arevalo ramirez dulcemaria janina,136.534,ccas biologicas,ingresa,3,22/09/2018,ordinario,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
3842,rodriguez seminario carmen emilia,115.111,ccas biologicas,ingresa,3,22/09/2018,ordinario,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
3843,rodriguez pesantes daiana lisett,115.080,ccas biologicas,ingresa,3,22/09/2018,ordinario,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
3844,dioses ibañez guillermo franco,107.995,ccas biologicas,ingresa,3,22/09/2018,ordinario,2019,i,,trujillo,http://admisionunt.info/docs/padrones/20191/20...
...,...,...,...,...,...,...,...,...,...,...,...,...
139352,bravo lezama dariks alexander,43.168,ing de minas,no ingresa,208,21/07/2024,cepunt,2025,i,,huamachuco,https://unitru.edu.pe/webfiles///Convocatoria/...
139353,otiniano sanchez helen neftali,40.160,ing de minas,no ingresa,208,21/07/2024,cepunt,2025,i,,huamachuco,https://unitru.edu.pe/webfiles///Convocatoria/...
139354,perez joaquin marcos,36.449,ing de minas,no ingresa,208,21/07/2024,cepunt,2025,i,,huamachuco,https://unitru.edu.pe/webfiles///Convocatoria/...
139355,amador avila luis mario,31.170,ing de minas,no ingresa,208,21/07/2024,cepunt,2025,i,,huamachuco,https://unitru.edu.pe/webfiles///Convocatoria/...


## CONCAT GRADES

In [44]:
# Stack the per-subset frames (each now exposing a single 'grade' column)
# row-wise into one frame; original row labels are kept at this point.
cleaned_results = pd.concat([main_grade1, main_grade3, main_grade4], axis=0)
cleaned_results

Unnamed: 0,names,grade,career,status,n_df,date,test_type,year_period,period,mod,place,link
126006,castro garcia paulo gamaliel,147.835,administracion,ingresa,192,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
126007,ponce salvador naomi belen,126.434,administracion,ingresa,192,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
126008,pintado peña luciana nicolle,124.382,administracion,ingresa,192,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
126009,lezama diaz jennifer anghelina,124.377,administracion,ingresa,192,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
126010,abila villanueva adriana amparito,123.361,administracion,ingresa,192,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
...,...,...,...,...,...,...,...,...,...,...,...,...
37055,layza sanchez elvis alex,20.630,ing de minas,no ingresa,37,09/02/2020,cepunt,2020,ii,,huamachuco,http://admisionunt.info/docs/padrones/20202/20...
37056,infantes reyes jonatan,17.262,ing de minas,no ingresa,37,09/02/2020,cepunt,2020,ii,,huamachuco,http://admisionunt.info/docs/padrones/20202/20...
37057,sanchez fernandez samuel jhonatan,14.354,ing de minas,no ingresa,37,09/02/2020,cepunt,2020,ii,,huamachuco,http://admisionunt.info/docs/padrones/20202/20...
37058,caipo polo hilario,5.900,ing de minas,no ingresa,37,09/02/2020,cepunt,2020,ii,,huamachuco,http://admisionunt.info/docs/padrones/20202/20...


In [45]:
# Replace the concatenated row labels with a fresh 0..n-1 index and discard
# the helper column 'n_df'. drop=True avoids creating a throwaway 'index'
# column, and errors='ignore' keeps the cell safe to re-run.
cleaned_results = (
    cleaned_results
    .reset_index(drop=True)
    .drop(columns=['n_df'], errors='ignore')
)
cleaned_results

Unnamed: 0,names,grade,career,status,date,test_type,year_period,period,mod,place,link
0,castro garcia paulo gamaliel,147.835,administracion,ingresa,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
1,ponce salvador naomi belen,126.434,administracion,ingresa,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
2,pintado peña luciana nicolle,124.382,administracion,ingresa,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
3,lezama diaz jennifer anghelina,124.377,administracion,ingresa,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
4,abila villanueva adriana amparito,123.361,administracion,ingresa,03/03/2024,extraordinario,2024,ii,,trujillo,https://unitru.edu.pe/webfiles///Convocatoria/...
...,...,...,...,...,...,...,...,...,...,...,...
119022,layza sanchez elvis alex,20.630,ing de minas,no ingresa,09/02/2020,cepunt,2020,ii,,huamachuco,http://admisionunt.info/docs/padrones/20202/20...
119023,infantes reyes jonatan,17.262,ing de minas,no ingresa,09/02/2020,cepunt,2020,ii,,huamachuco,http://admisionunt.info/docs/padrones/20202/20...
119024,sanchez fernandez samuel jhonatan,14.354,ing de minas,no ingresa,09/02/2020,cepunt,2020,ii,,huamachuco,http://admisionunt.info/docs/padrones/20202/20...
119025,caipo polo hilario,5.900,ing de minas,no ingresa,09/02/2020,cepunt,2020,ii,,huamachuco,http://admisionunt.info/docs/padrones/20202/20...


In [46]:
# Masks for rows whose 'place' / 'mod' is still missing, split by test type.
# Every mask is computed before any default is written.
filter_extraor_lugar_nan = (
    cleaned_results['test_type'].eq('extraordinario')
    & cleaned_results['place'].isna()
)

filter_extraor_mod_nan = (
    cleaned_results['test_type'].eq('extraordinario')
    & cleaned_results['mod'].isna()
)

filter_ordi_mod_nan = (
    cleaned_results['test_type'].eq('ordinario')
    & cleaned_results['mod'].isna()
)

filter_cepu_mod_nan = (
    cleaned_results['test_type'].eq('cepunt')
    & cleaned_results['mod'].isna()
)

# Write the default value for each group of missing entries.
cleaned_results.loc[filter_extraor_lugar_nan, 'place'] = "trujillo"
cleaned_results.loc[filter_extraor_mod_nan, 'mod'] = "excelencia"
cleaned_results.loc[filter_ordi_mod_nan, 'mod'] = "estandar"
cleaned_results.loc[filter_cepu_mod_nan, 'mod'] = "estandar"

In [47]:
# Sanity check after filling defaults: dropna=False lists any 'mod' values
# that are still missing instead of silently leaving them out of the counts.
cleaned_results['mod'].value_counts(dropna=False)

mod
estandar                     104747
5to secundaria                 8709
excelencia                     5438
personas con discapacidad       104
victimas de la violencia         24
deportistas calificados           5
Name: count, dtype: int64

In [48]:
# Number of rows left without a grade after combining the subsets.
cleaned_results['grade'].isnull().sum()

0