In [1]:
import pandas as pd

In [2]:
# Load the clients table and use Klient_ID as the row index.
# Passing `encoding` makes pandas decode text columns to str rather than
# leaving them as raw bytes.
clients = pd.read_sas(
    'data/anagrf_srs_an.sas7bdat',
    index='Klient_ID',
    encoding='unicode_escape',
)

In [3]:
# Load the client-to-contract table, also indexed by Klient_ID.
# Passing `encoding` makes pandas decode text columns to str rather than
# leaving them as raw bytes.
clients_contracts = pd.read_sas(
    'data/a_to_c_srs_an.sas7bdat',
    index='Klient_ID',
    encoding='unicode_escape',
)

In [4]:
# Load the contracts table and use Contract_ID as the row index.
# Passing `encoding` makes pandas decode text columns to str rather than
# leaving them as raw bytes.
contracts = pd.read_sas(
    'data/cntrct_srs_an.sas7bdat',
    index='Contract_ID',
    encoding='unicode_escape',
)

## Combining data sources
- `merge`: SQL-style join of two tables/dataframes on given columns (or on their indices).
- `join`: performs a join on indices (a left join by default).
- `concat`: stacks dataframes together without checking for duplicates.

#### Joining on indices

In [40]:
# Outer join on the index of both frames. `validate='one_to_many'` makes
# pandas raise if an index label is repeated on the left (clients) side.
df = clients.merge(
    clients_contracts,
    how='outer',
    left_index=True,
    right_index=True,
    validate='one_to_many',
)

In [15]:
# (rows, columns) of the merged frame `df`.
df.shape

(471692, 5)

In [18]:
# Index-on-index join through DataFrame.join (it always joins on the index
# of the other frame). join() defaults to how='left', so how='outer' is
# spelled out to make this equivalent to the outer merge above.
df1 = clients.join(clients_contracts, how='outer')

#### Join on non-index column

In [21]:
# Left join: match the Contract_ID column of clients_contracts against the
# index of contracts, keeping every row of clients_contracts.
data = clients_contracts.merge(
    contracts,
    how='left',
    left_on='Contract_ID',
    right_index=True,
)

In [22]:
# Left join on the index of both frames: every client row is kept and is
# repeated once for each matching row in `data`.
all_data = clients.merge(
    data,
    how='left',
    left_index=True,
    right_index=True,
)

In [27]:
# Inspect the row index of `clients` (the labels used for the index joins).
clients.index

Index(['K_000001', 'K_000002', 'K_000003', 'K_000004', 'K_000005', 'K_000006',
       'K_000007', 'K_000008', 'K_000009', 'K_000010',
       ...
       'K_099991', 'K_099992', 'K_099993', 'K_099994', 'K_099995', 'K_099996',
       'K_099997', 'K_099998', 'K_099999', 'K_100000'],
      dtype='object', name='Klient_ID', length=100000)

In [35]:
# NOTE: drop_duplicates() compares column values only and ignores the index,
# so rows with different index labels but identical column values are
# collapsed into the first such row.
clients_dedup = clients.drop_duplicates(keep='first')

In [36]:
# True only when drop_duplicates() removed no rows (it never changes the columns).
clients.shape == clients_dedup.shape

False

In [37]:
# (rows, columns) before de-duplication.
clients.shape

(100000, 3)

In [38]:
# (rows, columns) after drop_duplicates().
clients_dedup.shape

(37360, 3)

In [42]:
# Index labels that repeat an earlier label (the first occurrence is not
# flagged). An empty result means the index is unique.
clients.index[clients.index.duplicated(keep='first')]

Index([], dtype='object', name='Klient_ID')

In [43]:
# Index labels that repeat an earlier label (the first occurrence is not
# flagged). A non-empty result means some labels occur on several rows.
clients_contracts.index[clients_contracts.index.duplicated(keep='first')]

Index(['K_071210', 'K_036798', 'K_006520', 'K_018397', 'K_016680', 'K_047932',
       'K_026633', 'K_039827', 'K_052798', 'K_073678',
       ...
       'K_094170', 'K_064526', 'K_053855', 'K_094053', 'K_043115', 'K_042919',
       'K_084589', 'K_075796', 'K_006494', 'K_018647'],
      dtype='object', name='Klient_ID', length=371692)

In [49]:
# Keeps the first row for each distinct Contract_ID across the whole frame.
# The index is not part of the comparison, so this is NOT "per client":
# a Contract_ID that appears under several index labels survives only once.
all_data.drop_duplicates(subset='Contract_ID', keep='first')

Unnamed: 0_level_0,SEX,TIPO_FINZ_GARN,DAT_NSC,COD_LGM,Contract_ID,COD_FASE_OPE,COD_TIPO_OPE,COD_STT_OPE,DAT_INS,DAT_INZ_OPE,DAT_END_OPE_P,DAT_END_OPE,Instit_ID
Klient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
K_000001,M,1,1980-12-21,A,S_077650,UP,OU,,17223.0,17244.0,19409.0,17470.0,H_002
K_000001,M,1,1980-12-21,A,S_123622,EX,HU,,18532.0,18557.0,29570.0,,H_018
K_000001,M,1,1980-12-21,A,S_424532,EX,KS,,18651.0,18653.0,,,H_008
K_000002,Z,1,1986-12-29,A,S_437852,EX,OU,,17223.0,17223.0,19771.0,,H_005
K_000003,Z,2,1968-01-27,A,S_253038,EX,FL,,17224.0,17226.0,19052.0,,L_023
K_000004,Z,1,1982-10-22,A,S_139244,EX,SU,,18574.0,18574.0,22234.0,,H_003
K_000004,Z,1,1982-10-22,A,S_437873,UK,SU,,17227.0,17231.0,17611.0,17611.0,H_003
K_000004,Z,1,1982-10-22,A,S_443301,UK,KK,,18184.0,18184.0,18369.0,18371.0,H_003
K_000005,M,1,1984-10-19,A,S_172650,UK,SU,,17200.0,17200.0,17931.0,17956.0,L_026
K_000005,M,1,1984-10-19,A,S_369179,UK,HU,,17297.0,17280.0,17587.0,17591.0,H_002


In [51]:
# Sort rows by the SEX column in descending order.
# The index can be sorted the same way with .sort_index().
all_data.sort_values('SEX', ascending=False)

Unnamed: 0_level_0,SEX,TIPO_FINZ_GARN,DAT_NSC,COD_LGM,Contract_ID,COD_FASE_OPE,COD_TIPO_OPE,COD_STT_OPE,DAT_INS,DAT_INZ_OPE,DAT_END_OPE_P,DAT_END_OPE,Instit_ID
Klient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
K_100000,Z,1,1979-12-21,A,S_374522,EX,SU,,18282.0,18023.0,20600.0,,H_003
K_040950,Z,1,1953-08-27,A,S_113787,UK,OU,D,16876.0,16876.0,17377.0,18049.0,H_005
K_040950,Z,1,1953-08-27,A,S_186053,UK,OU,D,17104.0,17104.0,17285.0,17837.0,H_005
K_040950,Z,1,1953-08-27,A,S_194543,EX,OU,F,18547.0,18507.0,18507.0,,H_005
K_040950,Z,1,1953-08-27,A,S_343115,UK,KS,,16891.0,16879.0,,18143.0,H_005
K_040950,Z,1,1953-08-27,A,S_432680,UK,OU,D,16648.0,16636.0,17285.0,17867.0,H_005
K_040950,Z,1,1953-08-27,A,S_453831,UK,KU,F,16436.0,16419.0,,18506.0,H_005
K_040952,Z,1,1963-12-20,S,S_112254,UP,OU,,16425.0,16427.0,18262.0,18231.0,H_005
K_040953,Z,1,1952-09-18,A,S_073110,EX,OU,,18103.0,18072.0,20808.0,,H_005
K_040953,Z,1,1952-09-18,A,S_074426,EX,KS,,18390.0,18371.0,,,H_005
