In [1]:
import pandas as pd
import dask.dataframe as dd
from dask import compute
pd.options.display.max_columns = None
import numpy as np
import dask
nan = np.nan
from dask.distributed import Client
client = dask.distributed.Client()

In [2]:
path = 't_mdco_tfra400_Base3M_1.0.csv'

In [3]:
df = dd.read_csv(path,
                 dtype={"producto_1_number":str,
                        "bin_2_number":str, 
                        "NSS":str,
                        "pasaporte":str,
                        "clave_id":str,
                        "clave_secundaria_text": str
                       })

In [4]:
# Direccion
df['direccion'] = df.loc[:,'correo_id'].apply(lambda x: x[::-1],meta=('correo_id', 'object'))

In [5]:
# Obtener nombre completo
def build_name(row):
    name = row['desc_text']
    lastShort = row['desc_id'][2:]
    index = name.find(lastShort.lower())
    return f'{name[:index].capitalize()} {name[index:].capitalize()}'
df['full_name'] = df[['desc_text', 'desc_id']].apply(build_name, axis=1, meta=('correo_id', 'object'))

In [6]:
# Obtener email completo
def build_email(row):
    user = row['nombre_text']
    domain = str(row['apellid@_text']).replace('!','.com')
    return f'{user}@{domain}'
df['email'] = df[['nombre_text', 'apellid@_text']].apply(build_email, axis=1)#, meta=('correo_id', 'object'))

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'object'))



In [7]:
# Obtener los 17 numeros del pasaporte
def build_passport(value):
    if str(value) == 'nan' or value == nan:
        return nan
    else:
        return str(int(str(value), 16)).zfill(17)
df['pasaporte_'] = df['pasaporte'].apply(
    build_passport,
    meta=('pasaporte_', 'object')
)

In [8]:
# Obtener los 17 numeros del pasaporte
def build_nss(value):
    if str(value) == 'nan' or value == nan:
        return nan
    else:
        return str(int(str(value), 8)).zfill(11)
df['NSS_'] = df['NSS'].apply(
    build_nss,
    meta=('NSS_', 'object')
)

In [9]:
# Obtener numeros de telefono
mapper = {
    'A' : '4',
    'B' : '8',
    'Z' : '2',
    'I' : '1',
    'S' : '5',
    'E' : '3',
    'l' : '1',
    'O' : '0'
}
def unleet(string):
    res = ''
    if str(string) == 'nan' or string == nan:
        return nan
    else:
        for c in string: res += mapper.get(c, c)
        return res
df['phone1'] = df['clave_secundaria_text'].apply(unleet, meta=('telefono_1', 'object'))
df['phone2'] = df['clave_primaria_text'].apply(unleet, meta=('telefono_2', 'object'))


In [10]:
# Obtener fecha de algo
def get_date(row):
    clave = row['clave_id'][-2:]
    registro = row['registro.xls'][7:-4]
    return f'{registro[:4]}-{registro[4:]}-{clave}'
df['date0'] = df[['clave_id', 'registro.xls']].apply(get_date, axis=1, meta=('date0', 'object'))

In [11]:
def validate(value):
    return str(value) != 'nan' and value != nan

In [12]:
# Tarjeta
def build_tarjeta(row):
    a = row['producto_1_number']
    b = row['bin_2_number']
    c = row['INFE_id']
    for x in (a,b,c):
        if not validate(x):
            return nan
    
    return f'{int(a, 2)}{str(int(b, 2)).zfill(3)}{str(c[3:]).zfill(10)}'
    
df['tarjeta'] = df[['producto_1_number', 'bin_2_number','INFE_id']].apply(
    build_tarjeta,
    axis=1, 
    meta=('tarjeta', 'object')
)

In [13]:
# CURP
def get_curp(row):
    pass

In [14]:
res_df = df[['direccion', 'full_name', 'email', 'pasaporte_', 'NSS_', 'phone1', 'phone2', 'date0','tarjeta']]

In [15]:
res_df = res_df.compute()
res_df

Unnamed: 0,direccion,full_name,email,pasaporte_,NSS_,phone1,phone2,date0,tarjeta
0,19GrimPoint,Iolande Snaddon,isnaddon0@prlog.org,96686099162290543,04601204149,+66 724 768 7856,53(585)474-6125,1964-04-02,5342610641086212
1,81BaysideLane,Jemie Peachman,jpeachman1@people.com.cn,,71680198205,+351 324 617 4436,230(763)964-0746,1982-07-13,5343404568194236
2,8152MillerPark,Justus Murton,jmurton2@pen.io,35250537203699050,56513681524,+62 251 724 4330,86(593)843-0864,1958-09-05,5416452996077629
3,171SachsStreet,Forester Nanni,fnanni3@ocn.ne.jp,,31258182058,+55 469 292 7587,63(529)865-2320,1953-08-05,9631055328474945
4,458HazelcrestHill,Nathalia Chiles,nchiles4@dell.com,06542376803149778,31080683464,+964 980 897 1011,86(612)872-3533,1980-07-31,5313099286783184
...,...,...,...,...,...,...,...,...,...
150781,5BrentwoodDrive,Zara Dresser,zdresser255n@bigcartel.com,72916483490239047,02013657870,+230 536 393 7818,86(339)948-7180,1943-05-31,9320302452944829
150782,510VictoriaStreet,Skipper Mander,smander255o@geocities.jp,20435652049221291,79899154557,+20 610 110 4446,351(564)236-3879,1983-12-11,9321083038445815
150783,7GracelandJunction,Debora Mounce,dmounce255p@time.com,,76274686747,+54 375 388 6061,93(575)255-3836,1988-09-03,8893777225031133
150784,618ThiererCenter,Alanah Holde,aholde255q@bandcamp.com,43855354376114202,26357308824,+351 954 450 7594,55(694)848-4220,1995-01-01,8893770962206429
