In this notebook we:
* import the data set
* treat it with different libraries
* prepare it to be analyzed
* export it as a clean CSV data file

We will need the following libraries for this data cleaning task:

In [200]:
import numpy as np
import pandas as pd
import re
import collections
from collections import Counter
from datetime import datetime
pd.set_option("display.max_columns", 120)

### Importing the Dataset & transforming it into pandas Dataframe

In [593]:
db = pd.read_csv('../data/divorces_2000-2015_original.csv')

In [594]:
db.head()

Unnamed: 0,Fecha de Registro,Tipo,Nacionalidad,F_Naci_Do,LocNaci_Do,MpioNaci_Do,EFedNaci_Do,PaisNaci_Do,Edad,MpioHab_Do,EFedHab_Do,PaisHab_Do,ElIngreso,Ocupaci√≥n_Do,LocHabi_Do,Nacionalidad2,F_Naci_Da,Fecha de Registro 2,LocNaci_Da,MpioNaci_Da,EFedNaci_Da,PaisNaci_Da,Edad2,LocHabi_Da,MpioHab_Da,EFedHab_Da,PaisHab_Da,Ocupaci√≥n_Da,EllaIngreso,Fecha,Localidad,Municipio,Entidad Federativa,ElEscolaridad,ElTrabajo,EllaEscolaridad,EllaTrabajo,Matri_A√±os,Matri_Meses,Num_Hijos,Custodia
0,9/6/06,Necesario,MEXICANA,18/12/75,XALAPA - ENRIQUEZ,XALAPA,VERACRUZ,MEXICO,30.0,XALAPA,VERACRUZ,MEXICO,2000.0,PINTOR,XALAPA-ENRIQUEZ,MEXICANA,8/1/83,,PUEBLA,PUEBLA,PUEBLA,MEXICO,22.0,XALAPA-ENRIQUEZ,XALAPA,VERACRUZ,MEXICO,EMPLEADA,1800.0,26/6/00,XALAPA,XALAPA,VERACRUZ,SECUNDARIA,OBRERO,SECUNDARIA,EMPLEADO,5.0,,1.0,
1,1/2/00,Voluntario,MEXICANA,,,,,,47.0,,,,,,,MEXICANA,,,,,,,41.0,,,,,,,17/2/77,XALAPA,XALAPA,VERACRUZ,PREPARATORIA,ESTABLECIMIENTO,PREPARATORIA,EMPLEADO,,,,
2,1/2/05,Necesario,MEXICANA,22/2/55,XALAPA - ENRIQUEZ,XALAPA,VERACRUZ,MEXICO,49.0,,,,,MEDICO,,MEXICANA,21/3/47,,XALAPA-ENRIQUEZ,XALAPA,VERACRUZ,MEXICO,57.0,XALAPA-ENRIQUEZ,XALAPA,VERACRUZ,MEXICO,JUBILADA,,18/12/75,XALAPA,XALAPA,VERACRUZ,PREPARATORIA,OBRERO,,TRABAJADOR POR CUENTA PROPIA EN VIA PUBLICA,,,,
3,1/2/06,Necesario,MEXICANA,20/1/64,XALAPA - ENRIQUEZ,XALAPA,VERACRUZ,MEXICO,42.0,XALAPA,VERACRUZ,MEXICO,6000.0,EMPLEADO,XALAPA-ENRIQUEZ,MEXICANA,,,XALAPA-ENRIQUEZ,XALAPA,VERACRUZ,MEXICO,,XALAPA-ENRIQUEZ,XALAPA,VERACRUZ,MEXICO,COMERCIANTE,5000.0,3/12/87,XALAPA,XALAPA,VERACRUZ,PROFESIONAL,EMPLEADO,PREPARATORIA,EMPLEADO,18.0,,2.0,MADRE
4,1/2/06,Necesario,MEXICANA,30/10/75,XALAPA - ENRIQUEZ,XALAPA,VERACRUZ,MEXICO,30.0,COATEPEC,VERACRUZ,MEXICO,18000.0,MEDICO,COATEPEC,MEXICANA,13/10/78,,XALAPA-ENRIQUEZ,XALAPA,VERACRUZ,MEXICO,27.0,COATEPEC,COATEPEC,VERACRUZ,MEXICO,AMA DE CASA,,14/11/98,XALAPA,XALAPA,VERACRUZ,PROFESIONAL,EMPLEADO,PREPARATORIA,NO TRABAJA,7.0,,2.0,MADRE


### Exploring the data

In [595]:
len(db)

4923

In [596]:
Counter(db.Tipo) 
# stands for the type of divorce
# Only considered 'Necesario'='Necessary' when a member of the couple does not accept the divorce
# 'Necessary' divorces will be considered more 'failed marriages' than 'Voluntary divorces'

Counter({'Necesario': 2528, 'Voluntario': 2395})

In [597]:
Counter(db.Nacionalidad)

Counter({'MEXICANA': 4879,
         'COSTARRICENSE': 1,
         'ARGENTINA': 4,
         'COLOMBIANA': 1,
         'CUBANA': 8,
         'FRANCESA': 2,
         'AUSTRIACA': 1,
         'ESTADOUNIDENSE': 7,
         'ESPA√ëOLA': 6,
         'ITALIANA': 1,
         'CANADIENSE': 2,
         'ALEMANA': 2,
         'VENEZOLANA': 2,
         'CHILENA': 1,
         nan: 1,
         'JAPONESA': 1,
         'CHINA': 1,
         'POLACA': 1,
         'AUSTRALIANA': 1,
         'NICARAGUENSE': 1})

In [598]:
db.isna().sum()

Fecha de Registro         0
Tipo                      0
Nacionalidad              1
F_Naci_Do               381
LocNaci_Do              126
MpioNaci_Do             129
EFedNaci_Do             128
PaisNaci_Do             127
Edad                    107
MpioHab_Do              324
EFedHab_Do              323
PaisHab_Do              324
ElIngreso              1419
Ocupaci√≥n_Do           529
LocHabi_Do              321
Nacionalidad2             3
F_Naci_Da               452
Fecha de Registro 2    2679
LocNaci_Da              140
MpioNaci_Da             139
EFedNaci_Da             140
PaisNaci_Da             139
Edad2                   151
LocHabi_Da              307
MpioHab_Da              307
EFedHab_Da              305
PaisHab_Da              305
Ocupaci√≥n_Da           578
EllaIngreso            2119
Fecha                     0
Localidad                 0
Municipio                 0
Entidad Federativa        0
ElEscolaridad           304
ElTrabajo               356
EllaEscolaridad     

### Droping unnecessary columns & renaming columns

In [601]:
%%capture
# drop columns 
db.drop(columns=['Nacionalidad','LocNaci_Do', 'MpioNaci_Do', 'EFedNaci_Do', 'PaisNaci_Do', 'MpioHab_Do', 'EFedHab_Do', 'PaisHab_Do', 'ElIngreso', 'LocHabi_Do', 'Nacionalidad2', 'Fecha de Registro 2', 'LocNaci_Da', 'MpioNaci_Da', 'EFedNaci_Da', 'PaisNaci_Da', 'LocHabi_Da', 'MpioHab_Da', 'EFedHab_Da', 'PaisHab_Da', 'EllaIngreso', 'Localidad', 'Municipio', 'Entidad Federativa', 'ElEscolaridad', 'ElEscolaridad', 'ElTrabajo', 'EllaEscolaridad', 'EllaTrabajo', 'Matri_Meses'], axis=1, inplace=True)

In [602]:
db.head()

Unnamed: 0,Fecha de Registro,Tipo,F_Naci_Do,Edad,Ocupaci√≥n_Do,F_Naci_Da,Edad2,Ocupaci√≥n_Da,Fecha,Matri_A√±os,Num_Hijos,Custodia
0,9/6/06,Necesario,18/12/75,30.0,PINTOR,8/1/83,22.0,EMPLEADA,26/6/00,5.0,1.0,
1,1/2/00,Voluntario,,47.0,,,41.0,,17/2/77,,,
2,1/2/05,Necesario,22/2/55,49.0,MEDICO,21/3/47,57.0,JUBILADA,18/12/75,,,
3,1/2/06,Necesario,20/1/64,42.0,EMPLEADO,,,COMERCIANTE,3/12/87,18.0,2.0,MADRE
4,1/2/06,Necesario,30/10/75,30.0,MEDICO,13/10/78,27.0,AMA DE CASA,14/11/98,7.0,2.0,MADRE


In [603]:
db.rename(columns={'Ocupaci√≥n_Do': 'Job_H', 'Ocupaci√≥n_Da': 'Job_M', 'F_Naci_Do':'DOB_H', 'F_Naci_Da':'DOB_M', 'Edad':'Edad_H', 'Edad2':'Edad_M', 'Fecha':'Fecha_Boda', 'Matri_A√±os':'Dur_Matrimonio'}, inplace=True)
# H: Hombre
# M: Mujer

In [604]:
db.head()

Unnamed: 0,Fecha de Registro,Tipo,DOB_H,Edad_H,Job_H,DOB_M,Edad_M,Job_M,Fecha_Boda,Dur_Matrimonio,Num_Hijos,Custodia
0,9/6/06,Necesario,18/12/75,30.0,PINTOR,8/1/83,22.0,EMPLEADA,26/6/00,5.0,1.0,
1,1/2/00,Voluntario,,47.0,,,41.0,,17/2/77,,,
2,1/2/05,Necesario,22/2/55,49.0,MEDICO,21/3/47,57.0,JUBILADA,18/12/75,,,
3,1/2/06,Necesario,20/1/64,42.0,EMPLEADO,,,COMERCIANTE,3/12/87,18.0,2.0,MADRE
4,1/2/06,Necesario,30/10/75,30.0,MEDICO,13/10/78,27.0,AMA DE CASA,14/11/98,7.0,2.0,MADRE


In [605]:
# we get rid of NaN values from date columns

In [606]:
db.isna().sum()

Fecha de Registro       0
Tipo                    0
DOB_H                 381
Edad_H                107
Job_H                 529
DOB_M                 452
Edad_M                151
Job_M                 578
Fecha_Boda              0
Dur_Matrimonio        235
Num_Hijos            1912
Custodia             2851
dtype: int64

In [607]:
db = db.dropna(subset=['DOB_H'])
db = db.dropna(subset=['DOB_M'])

### Assigning each person their relative zodiac sign


In [608]:
type(db['DOB_H'])

pandas.core.series.Series

In [609]:
# we need pandas timeseries

In [610]:
db[['DOB_H', 'DOB_M']] = db[['DOB_H', 'DOB_M']].apply(pd.to_datetime)

In [611]:
db.head()

Unnamed: 0,Fecha de Registro,Tipo,DOB_H,Edad_H,Job_H,DOB_M,Edad_M,Job_M,Fecha_Boda,Dur_Matrimonio,Num_Hijos,Custodia
0,9/6/06,Necesario,1975-12-18,30.0,PINTOR,1983-08-01,22.0,EMPLEADA,26/6/00,5.0,1.0,
2,1/2/05,Necesario,2055-02-22,49.0,MEDICO,2047-03-21,57.0,JUBILADA,18/12/75,,,
4,1/2/06,Necesario,1975-10-30,30.0,MEDICO,1978-10-13,27.0,AMA DE CASA,14/11/98,7.0,2.0,MADRE
5,1/2/06,Necesario,1973-03-28,32.0,EMPLEADO,1976-06-14,29.0,,20/1/95,11.0,2.0,MADRE
6,1/2/07,Necesario,2070-12-13,36.0,EMPLEADO,1971-04-11,35.0,LABORES DOMESTICAS,16/8/91,15.0,2.0,MADRE


In [612]:
# We need to divide the datetimes by day and month columns in order to assign zodiac signs

In [613]:
# Hombres

# Day_OB_H --- Day of Birth Hombre
# Month_OB_H --- Month of Birth Hombre


db['Day_OB_H'], db['Month_OB_H'] = db['DOB_H'].apply(lambda x: int(x.day)), db['DOB_H'].apply(lambda x: int(x.month))

In [614]:
# Mujeres

# Day_OB_M --- Day of Birth Mujeres
# Month_OB_M --- Month of Birth Mujeres


db['Day_OB_M'], db['Month_OB_M'] = db['DOB_M'].apply(lambda x: int(x.day)), db['DOB_M'].apply(lambda x: int(x.month))

In [615]:
db.head(10)

Unnamed: 0,Fecha de Registro,Tipo,DOB_H,Edad_H,Job_H,DOB_M,Edad_M,Job_M,Fecha_Boda,Dur_Matrimonio,Num_Hijos,Custodia,Day_OB_H,Month_OB_H,Day_OB_M,Month_OB_M
0,9/6/06,Necesario,1975-12-18,30.0,PINTOR,1983-08-01,22.0,EMPLEADA,26/6/00,5.0,1.0,,18,12,1,8
2,1/2/05,Necesario,2055-02-22,49.0,MEDICO,2047-03-21,57.0,JUBILADA,18/12/75,,,,22,2,21,3
4,1/2/06,Necesario,1975-10-30,30.0,MEDICO,1978-10-13,27.0,AMA DE CASA,14/11/98,7.0,2.0,MADRE,30,10,13,10
5,1/2/06,Necesario,1973-03-28,32.0,EMPLEADO,1976-06-14,29.0,,20/1/95,11.0,2.0,MADRE,28,3,14,6
6,1/2/07,Necesario,2070-12-13,36.0,EMPLEADO,1971-04-11,35.0,LABORES DOMESTICAS,16/8/91,15.0,2.0,MADRE,13,12,11,4
7,1/2/07,Necesario,1975-02-17,31.0,LICENCIADO,1974-08-27,32.0,LICENCIADA,17/9/99,6.0,1.0,MADRE,17,2,27,8
8,1/2/08,Voluntario,1976-02-12,31.0,COMERCIANTE,1980-03-01,28.0,EMPLEADA,3/6/06,1.0,,,12,2,1,3
9,1/2/08,Voluntario,1976-11-17,31.0,EMPLEADO,1977-03-13,30.0,EMPLEADA,9/2/01,7.0,,,17,11,13,3
10,1/2/11,Necesario,2069-04-06,41.0,COMERCIANTE,2070-02-16,40.0,EMPLEADA,2/2/00,2.0,2.0,MADRE,6,4,16,2
11,1/2/11,Voluntario,1979-11-13,31.0,,1981-05-13,29.0,,13/5/06,2.0,,,13,11,13,5


In [616]:
def assignation(day, month):
    if month == 12: 
        return 'Sagitario' if (day < 22) else 'Capricornio'

    elif month == 1: 
        return 'Capricornio' if (day < 20) else 'Acuario'

    elif month == 2: 
        return 'Acuario' if (day < 19) else 'Piscis'

    elif month == 3: 
        return 'Piscis' if (day < 21) else 'Aries'

    elif month == 4: 
        return 'Aries' if (day < 20) else 'Tauro'

    elif month == 5: 
        return 'Tauro' if (day < 21) else 'Geminis'

    elif month == 6: 
        return 'Geminis' if (day < 21) else 'Cancer'

    elif month == 7: 
        return 'Cancer' if (day < 23) else 'Leo'

    elif month == 8: 
        return 'Leo' if (day < 23) else 'Virgo'

    elif month == 9: 
        return 'Virgo' if (day < 23) else 'Libra'

    elif month == 10: 
        return 'Libra' if (day < 23) else 'Scorpio'

    elif month == 11: 
        return 'Scorpio' if (day < 22) else 'Sagitario'

In [617]:
# aplicamos funcion a nuestro dataframe para obtener:

db['zodiac_H'] = assignation(db['Day_OB_H'].any(), db['Month_OB_H'].any()) # signos hombres
db['zodiac_M'] = assignation(db['Day_OB_M'].any(), db['Month_OB_M'].any()) # signos mujeres

In [618]:
db['zodiac_H'] = db.apply(lambda x: assignation(x['Day_OB_H'], x['Month_OB_H']), axis=1)
db['zodiac_M'] = db.apply(lambda x: assignation(x['Day_OB_M'], x['Month_OB_M']), axis=1)

In [619]:
db.head()

Unnamed: 0,Fecha de Registro,Tipo,DOB_H,Edad_H,Job_H,DOB_M,Edad_M,Job_M,Fecha_Boda,Dur_Matrimonio,Num_Hijos,Custodia,Day_OB_H,Month_OB_H,Day_OB_M,Month_OB_M,zodiac_H,zodiac_M
0,9/6/06,Necesario,1975-12-18,30.0,PINTOR,1983-08-01,22.0,EMPLEADA,26/6/00,5.0,1.0,,18,12,1,8,Sagitario,Leo
2,1/2/05,Necesario,2055-02-22,49.0,MEDICO,2047-03-21,57.0,JUBILADA,18/12/75,,,,22,2,21,3,Piscis,Aries
4,1/2/06,Necesario,1975-10-30,30.0,MEDICO,1978-10-13,27.0,AMA DE CASA,14/11/98,7.0,2.0,MADRE,30,10,13,10,Scorpio,Libra
5,1/2/06,Necesario,1973-03-28,32.0,EMPLEADO,1976-06-14,29.0,,20/1/95,11.0,2.0,MADRE,28,3,14,6,Aries,Geminis
6,1/2/07,Necesario,2070-12-13,36.0,EMPLEADO,1971-04-11,35.0,LABORES DOMESTICAS,16/8/91,15.0,2.0,MADRE,13,12,11,4,Sagitario,Aries


In [620]:
# ya no necesitamos las columnas de dia y mes de nacimiento para cada Hombre y Mujer

In [621]:
db.drop(columns=['Day_OB_H','Month_OB_H', 'Day_OB_M', 'Month_OB_M'], axis=1, inplace=True)

In [622]:
# hacemos la columna del match 

In [623]:
#def bond(column1, column2):
    #return '('+ column1.any() + ', '+ column2.any() +')'

In [624]:
#db['Horoscope-Match'] = bond(db['zodiac_H'], db['zodiac_M'])

In [625]:
#db['Horoscope-Match'] = db.apply(lambda x: bond(db['zodiac_H'], db['zodiac_M']), axis=1)

In [626]:
db["Horoscope-Match"] = db["zodiac_H"] + ', ' + db["zodiac_M"]

In [627]:
db.head()

Unnamed: 0,Fecha de Registro,Tipo,DOB_H,Edad_H,Job_H,DOB_M,Edad_M,Job_M,Fecha_Boda,Dur_Matrimonio,Num_Hijos,Custodia,zodiac_H,zodiac_M,Horoscope-Match
0,9/6/06,Necesario,1975-12-18,30.0,PINTOR,1983-08-01,22.0,EMPLEADA,26/6/00,5.0,1.0,,Sagitario,Leo,"Sagitario, Leo"
2,1/2/05,Necesario,2055-02-22,49.0,MEDICO,2047-03-21,57.0,JUBILADA,18/12/75,,,,Piscis,Aries,"Piscis, Aries"
4,1/2/06,Necesario,1975-10-30,30.0,MEDICO,1978-10-13,27.0,AMA DE CASA,14/11/98,7.0,2.0,MADRE,Scorpio,Libra,"Scorpio, Libra"
5,1/2/06,Necesario,1973-03-28,32.0,EMPLEADO,1976-06-14,29.0,,20/1/95,11.0,2.0,MADRE,Aries,Geminis,"Aries, Geminis"
6,1/2/07,Necesario,2070-12-13,36.0,EMPLEADO,1971-04-11,35.0,LABORES DOMESTICAS,16/8/91,15.0,2.0,MADRE,Sagitario,Aries,"Sagitario, Aries"


### Web Scraping for Zodiac Compatibility Percentage


In [405]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# from pprint import pprint
# from lxml import html
# from lxml.html import fromstring
# import urllib.request
from urllib.request import urlopen
# import random
import re
# import scrapy# I-Data Cleaning

In [406]:
url = 'http://www.mylovecal.com/'
html = requests.get(url)
soup = BeautifulSoup(html.content, "html.parser")

In [407]:
all_matches = soup.find_all('a')[21:165]

In [408]:
raw_matches = [i.text for i in all_matches]

In [409]:
print(raw_matches)

['50%', '50%', '85%', '65%', '95%', '55%', '55%', '75%', '100%', '40%', '90%', '65%', '50%', '100%', '40%', '85%', '70%', '90%', '65%', '75%', '40%', '100%', '40%', '85%', '85%', '40%', '60%', '45%', '90%', '60%', '95%', '50%', '75%', '60%', '100%', '45%', '65%', '85%', '45%', '80%', '85%', '75%', '65%', '90%', '45%', '60%', '40%', '100%', '95%', '70%', '90%', '85%', '50%', '50%', '100%', '40%', '90%', '50%', '55%', '60%', '55%', '90%', '60%', '75%', '50%', '95%', '40%', '70%', '40%', '100%', '55%', '65%', '55%', '65%', '95%', '65%', '100%', '40%', '80%', '50%', '95%', '55%', '90%', '50%', '75%', '75%', '50%', '90%', '40%', '70%', '50%', '70%', '40%', '95%', '45%', '100%', '100%', '40%', '75%', '45%', '90%', '40%', '95%', '40%', '85%', '60%', '95%', '40%', '40%', '100%', '60%', '60%', '50%', '100%', '55%', '95%', '60%', '75%', '60%', '100%', '90%', '40%', '100%', '40%', '55%', '55%', '90%', '45%', '95%', '60%', '80%', '50%', '65%', '85%', '45%', '100%', '60%', '65%', '50%', '100%', '40

In [410]:
# we want the string percentages to become decimal floats

In [411]:
limpiamos = lambda x: float(x.strip('%'))/100
matches_decimal = list(map(limpiamos, raw_matches))
print(matches_decimal)

[0.5, 0.5, 0.85, 0.65, 0.95, 0.55, 0.55, 0.75, 1.0, 0.4, 0.9, 0.65, 0.5, 1.0, 0.4, 0.85, 0.7, 0.9, 0.65, 0.75, 0.4, 1.0, 0.4, 0.85, 0.85, 0.4, 0.6, 0.45, 0.9, 0.6, 0.95, 0.5, 0.75, 0.6, 1.0, 0.45, 0.65, 0.85, 0.45, 0.8, 0.85, 0.75, 0.65, 0.9, 0.45, 0.6, 0.4, 1.0, 0.95, 0.7, 0.9, 0.85, 0.5, 0.5, 1.0, 0.4, 0.9, 0.5, 0.55, 0.6, 0.55, 0.9, 0.6, 0.75, 0.5, 0.95, 0.4, 0.7, 0.4, 1.0, 0.55, 0.65, 0.55, 0.65, 0.95, 0.65, 1.0, 0.4, 0.8, 0.5, 0.95, 0.55, 0.9, 0.5, 0.75, 0.75, 0.5, 0.9, 0.4, 0.7, 0.5, 0.7, 0.4, 0.95, 0.45, 1.0, 1.0, 0.4, 0.75, 0.45, 0.9, 0.4, 0.95, 0.4, 0.85, 0.6, 0.95, 0.4, 0.4, 1.0, 0.6, 0.6, 0.5, 1.0, 0.55, 0.95, 0.6, 0.75, 0.6, 1.0, 0.9, 0.4, 1.0, 0.4, 0.55, 0.55, 0.9, 0.45, 0.95, 0.6, 0.8, 0.5, 0.65, 0.85, 0.45, 1.0, 0.6, 0.65, 0.5, 1.0, 0.4, 1.0, 0.5, 0.75]


In [412]:
# now we need a list of all permutations of signs to match the correct percentage

In [413]:
import itertools

In [571]:
signos = ['Aries', 'Tauro', 'Geminis', 'Cancer', 'Leo', 'Virgo', 'Libra', 'Scorpio', 'Sagitario', 'Capricornio', 'Acuario', 'Piscis']

NameError: name 'Aries' is not defined

In [580]:
combinaciones_signos = [x for x in itertools.product(signos, repeat=2)]
print(combinaciones_signos)

[('Aries', 'Aries'), ('Aries', 'Tauro'), ('Aries', 'Geminis'), ('Aries', 'Cancer'), ('Aries', 'Leo'), ('Aries', 'Virgo'), ('Aries', 'Libra'), ('Aries', 'Scorpio'), ('Aries', 'Sagitario'), ('Aries', 'Capricornio'), ('Aries', 'Acuario'), ('Aries', 'Piscis'), ('Tauro', 'Aries'), ('Tauro', 'Tauro'), ('Tauro', 'Geminis'), ('Tauro', 'Cancer'), ('Tauro', 'Leo'), ('Tauro', 'Virgo'), ('Tauro', 'Libra'), ('Tauro', 'Scorpio'), ('Tauro', 'Sagitario'), ('Tauro', 'Capricornio'), ('Tauro', 'Acuario'), ('Tauro', 'Piscis'), ('Geminis', 'Aries'), ('Geminis', 'Tauro'), ('Geminis', 'Geminis'), ('Geminis', 'Cancer'), ('Geminis', 'Leo'), ('Geminis', 'Virgo'), ('Geminis', 'Libra'), ('Geminis', 'Scorpio'), ('Geminis', 'Sagitario'), ('Geminis', 'Capricornio'), ('Geminis', 'Acuario'), ('Geminis', 'Piscis'), ('Cancer', 'Aries'), ('Cancer', 'Tauro'), ('Cancer', 'Geminis'), ('Cancer', 'Cancer'), ('Cancer', 'Leo'), ('Cancer', 'Virgo'), ('Cancer', 'Libra'), ('Cancer', 'Scorpio'), ('Cancer', 'Sagitario'), ('Cancer', 

In [581]:
combinaciones_signos[0]


('Aries', 'Aries')

In [None]:
# se trata de una lista de tuplas, eso no nos interesa

In [582]:
combinaciones_signos = [ str(i[0]) + ', ' + str(i[1]) for i in combinaciones_signos ]

In [588]:
combinaciones_signos[:5]

['Aries, Aries',
 'Aries, Tauro',
 'Aries, Geminis',
 'Aries, Cancer',
 'Aries, Leo']

In [584]:
len(combinaciones_signos) == len(matches_decimal)

True

### Compatibility Percentages for each zodiac combination into DataFrame


In [632]:
df = pd.DataFrame(list(zip(combinaciones_signos, matches_decimal)),
               columns =['Horoscope-Match', 'Compatibility-Rate'])
df

Unnamed: 0,Horoscope-Match,Compatibility-Rate
0,"Aries, Aries",0.50
1,"Aries, Tauro",0.50
2,"Aries, Geminis",0.85
3,"Aries, Cancer",0.65
4,"Aries, Leo",0.95
...,...,...
139,"Piscis, Scorpio",1.00
140,"Piscis, Sagitario",0.40
141,"Piscis, Capricornio",1.00
142,"Piscis, Acuario",0.50


### Merging Dataframes


In [418]:
# both data frames have a column in common: 'Horoscope-Match'

# our goal is to pass the compatibility rate to each combination of divorced couples
# in the db original DataFrame

In [537]:
type(df['Horoscope-Match']) == type(db['Horoscope-Match'])

True

In [630]:
df_ready = db.merge(df, how='left', on='Horoscope-Match')
df_ready.head()

Unnamed: 0,Fecha de Registro,Tipo,DOB_H,Edad_H,Job_H,DOB_M,Edad_M,Job_M,Fecha_Boda,Dur_Matrimonio,Num_Hijos,Custodia,zodiac_H,zodiac_M,Horoscope-Match,Compatibility-Rate
0,9/6/06,Necesario,1975-12-18,30.0,PINTOR,1983-08-01,22.0,EMPLEADA,26/6/00,5.0,1.0,,Sagitario,Leo,"Sagitario, Leo",0.9
1,1/2/05,Necesario,2055-02-22,49.0,MEDICO,2047-03-21,57.0,JUBILADA,18/12/75,,,,Piscis,Aries,"Piscis, Aries",0.65
2,1/2/06,Necesario,1975-10-30,30.0,MEDICO,1978-10-13,27.0,AMA DE CASA,14/11/98,7.0,2.0,MADRE,Scorpio,Libra,"Scorpio, Libra",0.5
3,1/2/06,Necesario,1973-03-28,32.0,EMPLEADO,1976-06-14,29.0,,20/1/95,11.0,2.0,MADRE,Aries,Geminis,"Aries, Geminis",0.85
4,1/2/07,Necesario,2070-12-13,36.0,EMPLEADO,1971-04-11,35.0,LABORES DOMESTICAS,16/8/91,15.0,2.0,MADRE,Sagitario,Aries,"Sagitario, Aries",1.0


### Saving the merged dataframe as cvs for the ultimate analysis


In [631]:
df_ready.to_csv('../data/ready_for_analysis.csv', index = False)