In [1]:
# Dependencies
import pandas as pd
import numpy as np

In [57]:
# Importing data
df = pd.read_table("Lucas_unique_directors_naturalperson_gender.csv", header= None, sep= None, engine="python")
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,isin,v1,companynamelatinalphabet,bvdidnumber,name,cname,dmfullname,dmuciuniquecontactidentifie,dmjobtitleinenglish,dmjobtitle,...,gender,dmcorrespondingbvdidwhenapp,dmasanycategory,dmcurrentorprevious,dmbirthplace,dmhasasignatoryright,dmhasapowerofattorney,dmnoofcosinwhichacurrent,compensationsalaryeur,naturalperson
1,NL0012015705,2225,TAKEAWAY.COM N.V.,NL08142836,JUST EAT TAKEAWA,JUST EAT TAKEAWAY COM NV,GRIBHOLD B.V.,C000459947,Director,Bestuurder,...,,NL06089183,No,Previous,,No,No,0,,0
2,NL0000383800,1837,SMIT INTERNATIONALE NV,NL24004888,SMIT INTL.CERTS. DEAD - 05/05/10,SMIT INTL.,Boskalis Holding B.V.,C001024491,Directeur,Directeur,...,,NL23056607,Yes,Current,,No,No,3,,0
3,NL0000289783,12368,ROBECO GLOBAL STARS EQUITIES FUND N.V.,NL24041906,ROBECO SUST.GLB. STARS EQTIES.FD.,ROBECO GLB.STARS EQ.FD.,Robeco Fund Management B.V.,C001055291,Director,Bestuurder,...,,,,,,,,,,
4,NL0000350361,3289,EXACT HOLDING NV,NL27234422,EXACT HOLDING DEAD - 31/03/15,EXACT HOLDING NV,Exact Management B.V.,C001133291,Directeur,Directeur,...,,NL27228442,No,Current,,No,No,,,0


In [59]:
# Reshaping data frame
data_prev = df.rename(columns=df.iloc[0])
data_inter = data_prev.drop(labels=0,axis="index")
data = data_inter.drop("naturalperson", axis=1)
data.head() # Desired data frame shape

Unnamed: 0,isin,v1,companynamelatinalphabet,bvdidnumber,name,cname,dmfullname,dmuciuniquecontactidentifie,dmjobtitleinenglish,dmjobtitle,...,dmcorp,gender,dmcorrespondingbvdidwhenapp,dmasanycategory,dmcurrentorprevious,dmbirthplace,dmhasasignatoryright,dmhasapowerofattorney,dmnoofcosinwhichacurrent,compensationsalaryeur
1,NL0012015705,2225,TAKEAWAY.COM N.V.,NL08142836,JUST EAT TAKEAWA,JUST EAT TAKEAWAY COM NV,GRIBHOLD B.V.,C000459947,Director,Bestuurder,...,Previous,,NL06089183,No,Previous,,No,No,0.0,
2,NL0000383800,1837,SMIT INTERNATIONALE NV,NL24004888,SMIT INTL.CERTS. DEAD - 05/05/10,SMIT INTL.,Boskalis Holding B.V.,C001024491,Directeur,Directeur,...,Current,,NL23056607,Yes,Current,,No,No,3.0,
3,NL0000289783,12368,ROBECO GLOBAL STARS EQUITIES FUND N.V.,NL24041906,ROBECO SUST.GLB. STARS EQTIES.FD.,ROBECO GLB.STARS EQ.FD.,Robeco Fund Management B.V.,C001055291,Director,Bestuurder,...,Previous,,,,,,,,,
4,NL0000350361,3289,EXACT HOLDING NV,NL27234422,EXACT HOLDING DEAD - 31/03/15,EXACT HOLDING NV,Exact Management B.V.,C001133291,Directeur,Directeur,...,Current,,NL27228442,No,Current,,No,No,,
5,NL0009508720,13696,LBI INTERNATIONAL N.V.,NL30277334,LBI INTERNATIONAL DEAD - 07/03/13,LBI INTERNATIONAL NV,Fint Management B.V.,C001200105,Director,Bestuurder,...,Previous,,,,,,,,,


### Task 1: Creating dummy variable 'natural person'

__Logic:__ Using custom binary technique based on string attributes. 
   1. I am checking if column dmfullname has special addressing techniques (e.g. Sir, Dr., Madam ...);
   2. I am checking if column dmfullname has any non-natural person attribute (e.g. numbers, punctuation);
   3. I am checking if column dmfullname has any non-natural person addressing techniques for main regions mapped in the data(e.g. company entity registrations like GmbH or Ltd);

In [4]:
# First filter: Title identifier

# Part 1: function special_person_address_identifier identifies which observations 
# have one of the titles options in dmfullname and creates intermediary columns named
# after the title list option with 1 for positive, e.g. there is a title in dmfullnane,
# or 0 if there is no.

def special_person_address_identifier(address):
    for i in address:
        data[i] = np.where(data["dmfullname"].str.contains(i), 1, 0)
    
titles = ["Sir", "Madam", "Ms", "Mr", "Mrs","Miss", "Dr", "Professor"]
special_person_address_identifier(titles)

# Part 2: I have merge all intermediary title columns, e.g. Sir, into one column called 'TitleCheck'.
# I have then used the same process as Part 1 to investigate which observations had a positive value of 1 and which not.
# This would indicate then which have a positive 'FilterTitle' status. The ones that have are likely natural person while
# the ones with a negative status, a.k.a 0 value, not.

data["TitleCheck"] = data[titles].apply(lambda row: "+" .join(row.values.astype(str)), axis=1)

data["FilterTitle"] = np.where(data["TitleCheck"].str.contains("1"), 1, 0)
data = data.drop(["Sir", "Madam", "Ms", "Mr", "Mrs","Miss", "Dr", "Professor","TitleCheck"], axis=1)
data["FilterTitle"].value_counts()

1    115982
0      7530
Name: FilterTitle, dtype: int64

In [5]:
# Second filter: Non-natural person identifier (Numbers, Signals and other)

def non_natural_identifier(symbols):
    for i in symbols:
        data[i] = np.where(data["dmfullname"].str.contains(i), 1, 0)

symbols_list = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
non_natural_identifier(symbols_list)

data["SymbolsCheck"] = data[symbols_list].apply(lambda row: "+" .join(row.values.astype(str)), axis=1)

data["FilterSymbols"] = np.where(data["SymbolsCheck"].str.contains("1"), 1, 0)
data = data.drop(["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "SymbolsCheck"], axis=1)
data["FilterSymbols"].value_counts() # Result shows that 120 observations are certainly not natural.

0    123392
1       120
Name: FilterSymbols, dtype: int64

In [6]:
# Third filter: Location / Region based company entity identifier
data["dmcountry"].value_counts()

def company_legal_entity_identifier(entities):
    for i in entities:
        data[i] = np.where(data["dmfullname"].str.contains(i), 1, 0)

# Main regions considered: Italy, UK, Germany, France, Austria, USA, Netherlands, Sweden, 
#                          Hungary, Luxembourg, Norway, Finland, Ireland and Greece.
regions_entity = ["S.p.a.", "S.r.l.", "Ltd", "PLC", "B.V.", "Limited", "GmbH", 
                       "AG", "UG", "e.V.", "SAS", "SARL", "SA", "OG", "KG", "LLC", 
                       "U.A.", "C.V.", "AB", "HB", "KB", "Zrt.", "Kft.", "S.à r.l.", 
                       "S.A.", "SCSp", "LLCs", "Oy", "LTD","L.T.D.","P.C."]

company_legal_entity_identifier(regions_entity)

data["EntitiesCheck"] = data[regions_entity].apply(lambda row: "+" .join(row.values.astype(str)), axis=1)

data["FilterEntities"] = np.where(data["EntitiesCheck"].str.contains("1"), 1, 0)
data = data.drop(["S.p.a.", "S.r.l.", "Ltd", "PLC", "B.V.", "Limited", "GmbH", 
                       "AG", "UG", "e.V.", "SAS", "SARL", "SA", "OG", "KG", "LLC", 
                       "U.A.", "C.V.", "AB", "HB", "KB", "Zrt.", "Kft.", "S.à r.l.", 
                       "S.A.", "SCSp", "LLCs", "Oy", "LTD","L.T.D.","P.C.", "EntitiesCheck"], axis=1)

data["FilterEntities"].value_counts() # Result shows that 120 observations are certainly not natural.

0    121992
1      1520
Name: FilterEntities, dtype: int64

##### Intermediary Results:

__Dummy Variables:__
  1. __FilterTitle__ indicates, at current moment that, 115982 observations have one of the titles investigated, e.g. Sir, while 7530 doesnt. 
  2. __FilterSymbols__ indicates that 120 observations are certainly not natural persons because they have numeric values in dmfullname.
  3. __FilterEntities__ indicates that 1520 observations have company legal entity names in dmfullname and are therefore certaily not natural persons.

In [7]:
list(data.columns)

['isin',
 'v1',
 'companynamelatinalphabet',
 'bvdidnumber',
 'name',
 'cname',
 'dmfullname',
 'dmuciuniquecontactidentifie',
 'dmjobtitleinenglish',
 'dmjobtitle',
 'dmappointmentdate',
 'dmresignationdate',
 'dmtitle',
 'dmsalutation',
 'dmfirstname',
 'dmmiddlename',
 'dmlastname',
 'dmsuffix',
 'dmgender',
 'dmbirthdate',
 'dmage',
 'dmagebracket',
 'dmcountryiesofnationality',
 'dmaddress',
 'dmcountry',
 'dmemailaddress',
 'dmbiography',
 'dmtypeofrole',
 'dmboardcommitteeordepartmen',
 'dmlevelofresponsibility',
 'dmalsoashareholder',
 'dmconfirmationdates',
 'dmdateslastreceivedfromip',
 'dmnotvalidafterdate',
 'dminformationsources',
 'dminformationproviders',
 'dmcollege',
 'dmdegreecode',
 'dmmajor',
 'dmgraduationdate',
 'dmcorp',
 'gender',
 'dmcorrespondingbvdidwhenapp',
 'dmasanycategory',
 'dmcurrentorprevious',
 'dmbirthplace',
 'dmhasasignatoryright',
 'dmhasapowerofattorney',
 'dmnoofcosinwhichacurrent',
 'compensationsalaryeur',
 'FilterTitle',
 'FilterSymbols',
 '

In [8]:
# Looking for potential columns that have a good amount of observations filled.
a = list(data["dmage"].value_counts())
sum(a)

83491

In [9]:
aa = list(data["dmbirthdate"].value_counts())
sum(aa)

82100

In [10]:
b = list(data["dmgender"].value_counts())
sum(b)

119046

##### New filter ideas

Columns dmage, dmbirthdate and dmgender are promising columns to reduce the number of observations I have to check manually once all these columns have more than 80000 filled. Based on this I will continue the filtering process of dummz variable natural_person. 

In [11]:
# Filter Age, Birthdate and Gender

# Coming up with a way to represent empty cells under age , birthday and gender. In this case by filling empty cells with the value -1.
data["FilterAge"] = data["dmage"].fillna(value="-1")  
data["FilterBirthDate"] = data["dmbirthdate"].fillna(value="-1")
data["FilterGender"] = data["dmgender"].fillna(value="-1")

In [12]:
data["FilterGender"].value_counts()

M     98577
F     20469
-1     4466
Name: FilterGender, dtype: int64

In [13]:
# Intermediary csv file with only gender (-1) observations. 
# EVERYTHING HAPPENING FROM HERE IN DATA: MANUAL, IS A TEST.
manual = data[data["FilterGender"] == "-1"]
manual.to_csv("manual_gender_check.csv") # Here, I would have to check 4446 observations.

### Star rating logic

Right now we have 6 filters.  Out of them, 2 are very secure, namely "FilterSymbols" & "FilterEntities", because if you have positive values there you are certainly a non-natural person.

We remain with 4 filters, each has a dummy variable, consequently providing to us 2 variables - therefore, there are 8 variables in questions. There are possibly 8! permutations of these variables - very painful to condition.

At the moment, I can either check 4466 observations manualy or try to scope down further.

The scoping will follow a score mechanism based on only 3 filters: Age, BirthDate and Gender because Filtertitle can be the most misleading and laborious due to these facts: a lot observations to cover in the revision as well as to the possibility of existing companies such as "Dr. Schwarz" or who knows what else.

__Score Structure__

Score can range from 0 to 3 stars. 2 and 3 stars observations won't be revised manually. Observations gain stars based on the existence of a cell value for that attribute or not.

Remember that if an observation had no value in its cell in the columns of FilterAge, FIlterBirthDate and FilterGender they were filled up with value -1. Therefore, the following conditions are being used:



- Age != -1 , BirthDate != -1 , Gender != -1 -> __3 stars rating__
- Age == -1 , BirthDate != -1 , Gender != -1 -> __2 stars rating__
- Age != -1 , BirthDate == -1 , Gender != -1 -> __2 stars rating__
- Age != -1 , BirthDate != -1 , Gender == -1 -> __2 stars rating__

##### Cases that were revised manually
- Age == -1 , BirthDate == -1 , Gender != -1 -> __1 stars rating__
- Age == -1 , BirthDate != -1 , Gender == -1 -> __1 stars rating__
- Age != -1 , BirthDate == -1 , Gender == -1 -> __1 stars rating__
- Age == -1 , BirthDate == -1 , Gender != -1 -> __1 stars rating__
- Age == -1 , BirthDate == -1 , Gender == -1 -> __0 stars rating__

In [14]:
# List of conditions
conditions = [
    (manual['FilterAge'] != "-1") & (manual['FilterBirthDate'] != "-1") & (manual['FilterGender'] != "-1"),
    (manual['FilterAge'] == "-1") & (manual['FilterBirthDate'] != "-1") & (manual['FilterGender'] != "-1"),
    (manual['FilterAge'] != "-1") & (manual['FilterBirthDate'] == "-1") & (manual['FilterGender'] != "-1"),
    (manual['FilterAge'] != "-1") & (manual['FilterBirthDate'] != "-1") & (manual['FilterGender'] == "-1"),
    (manual['FilterAge'] == "-1") & (manual['FilterBirthDate'] == "-1") & (manual['FilterGender'] != "-1"),
    (manual['FilterAge'] == "-1") & (manual['FilterBirthDate'] != "-1") & (manual['FilterGender'] == "-1"),
    (manual['FilterAge'] != "-1") & (manual['FilterBirthDate'] == "-1") & (manual['FilterGender'] == "-1"),
    (manual['FilterAge'] == "-1") & (manual['FilterBirthDate'] == "-1") & (manual['FilterGender'] != "-1"),
    (manual['FilterAge'] == "-1") & (manual['FilterBirthDate'] == "-1") & (manual['FilterGender'] == "-1"),
]

# create a list of the values we want to assign for each condition
values = ['3', '2', '2', '2', '1', '1', '1', '1', '0']

# create a new column and use np.select to assign values to it using our lists as arguments
manual['score'] = np.select(conditions, values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manual['score'] = np.select(conditions, values)


In [15]:
manual["score"].value_counts()

0    4153
2     301
1      12
Name: score, dtype: int64

__Result Scoring:__ Based on the first score mechanism I can see that out of 4466 observations, 301 are scored with 2 stars and therefore secure.

### Extending Score mechanism

__Logic:__
- Score != 2 & FilterEntity == 1 -> 'Non'
- Score != 2 & FilterSymbols == 1 -> 'Non'
- Score != 2 & FilterSymbols != 1 -> 'Non'
- Score != 2 & FilterSymbols != 1 -> 'Non'

In [16]:
ext_cond = [(manual["FilterEntities"] == 1) & (manual["score"] != 2),
            (manual["FilterSymbols"] == 1) & (manual["score"] != 2),
            
]

ext_values = [
    "non", "non"
]

manual["ExtendedScore"] = np.select(ext_cond, ext_values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manual["ExtendedScore"] = np.select(ext_cond, ext_values)


In [17]:
manual["ExtendedScore"].value_counts()

0      3683
non     783
Name: ExtendedScore, dtype: int64

In [18]:
### Final extension to score mechanism
final_conditions = [(manual["ExtendedScore"] != "non") & (manual["FilterTitle"] == 1),
                    
]

final_values = [
    "natural",
]

manual["ExtendedFinalScore"] = np.select(final_conditions, final_values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manual["ExtendedFinalScore"] = np.select(final_conditions, final_values)


In [19]:
manual["ExtendedFinalScore"].value_counts()

0          4380
natural      86
Name: ExtendedFinalScore, dtype: int64

## Final thoughts on the preprocessing before I engage in manual work.

### Remember that I am operating within the 4466 observations that are not certain.

__Overview results:__

- Encoding dmfullname based on natural person titles: 115982 observations have titles / 7530 observations have no titles.
- Enconding dmfullname based on numeric symbols: 123392 observations have no numbers / 120 observations have numbers in dmfullname.
- Encoding dmfullname based on entitity legal titles: 121992 observations have no entity titles / 1520 observations have entity titles in dmfullname.

- Encoding dmAge, dmBirthDate & dmGender by checking if a value exists or not: 
    - dmGender -> Males: 98577 and Females: 20469 which together (a.k.a Natural) = 119046; -1 (a.k.a Non-natural and possible natural) = 4466
    - dmAge -> Age given: 83491 , Age not given: Total Observations - 83491
    - dmBirthDay -> BirthDay given: 82100, BirthDay not given: Total Observations - 82100

__Upon Ecoding Strategies, summary of Scoring mechanism:__

- 3 scores were created
    - Column 'score' is a variable dependent on a given or not value encoding based on dmAge, dmGender and dmBirthDate; -> 301 observations less to manually check;
    - Column 'ExtendedScore' is a variable dependent on a the scoring mechanism of column 'Score' and filters 'FilterSymbols' and 'FilterEntities'. -> 783 observations less to manually check;
    - Column 'ExtendedFinalScore' is a variable dependent on scoring mechanism 'ExtendedScore' and column FilterTitle. -> 86 observations less to manually check.
    
### Final steps:

In [20]:
score_conditions = [
    (manual['ExtendedFinalScore'] == 'natural'),
    (manual['ExtendedFinalScore'] != 'natural') & (manual['ExtendedScore'] == 'non'),
    (manual['ExtendedFinalScore'] != 'natural') & (manual['ExtendedScore'] != 'non')  
]

score_value = ["natural",
               "non-natural",
               "revision"          
]

manual["NaturalPerson"] = np.select(score_conditions, score_value)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manual["NaturalPerson"] = np.select(score_conditions, score_value)


In [21]:
manual["NaturalPerson"].value_counts()

revision       3597
non-natural     783
natural          86
Name: NaturalPerson, dtype: int64

#### I need to revise 3597 instead of 4466 to define dummy variables NaturalPerson and GenderManual officialy.

In [22]:
manual.to_csv("revision.csv") # Here, I would have to check 3597 observations.

#### Final steps for task:

1. Embeded checked manual data with secure data.
2. Finish 'NaturalPerson' encoding
3. Create GenderManual dummy

In [23]:
checked_data = pd.read_table("revision_manual.csv", engine="c", sep=",", index_col=0)

In [24]:
final_df = pd.concat([data, checked_data], axis=1, sort=False)
final_df

Unnamed: 0,isin,v1,companynamelatinalphabet,bvdidnumber,name,cname,dmfullname,dmuciuniquecontactidentifie,dmjobtitleinenglish,dmjobtitle,...,FilterTitle,FilterSymbols,FilterEntities,FilterAge,FilterBirthDate,FilterGender,score,ExtendedScore,ExtendedFinalScore,NaturalPerson
1,NL0012015705,2225,TAKEAWAY.COM N.V.,NL08142836,JUST EAT TAKEAWA,JUST EAT TAKEAWAY COM NV,GRIBHOLD B.V.,C000459947,Director,Bestuurder,...,0.0,0.0,1.0,-1.0,-1,-1.0,0.0,non,0,non-natural
2,NL0000383800,1837,SMIT INTERNATIONALE NV,NL24004888,SMIT INTL.CERTS. DEAD - 05/05/10,SMIT INTL.,Boskalis Holding B.V.,C001024491,Directeur,Directeur,...,0.0,0.0,1.0,-1.0,-1,-1.0,0.0,non,0,non-natural
3,NL0000289783,12368,ROBECO GLOBAL STARS EQUITIES FUND N.V.,NL24041906,ROBECO SUST.GLB. STARS EQTIES.FD.,ROBECO GLB.STARS EQ.FD.,Robeco Fund Management B.V.,C001055291,Director,Bestuurder,...,0.0,0.0,1.0,-1.0,-1,-1.0,0.0,non,0,non-natural
4,NL0000350361,3289,EXACT HOLDING NV,NL27234422,EXACT HOLDING DEAD - 31/03/15,EXACT HOLDING NV,Exact Management B.V.,C001133291,Directeur,Directeur,...,0.0,0.0,1.0,-1.0,-1,-1.0,0.0,non,0,non-natural
5,NL0009508720,13696,LBI INTERNATIONAL N.V.,NL30277334,LBI INTERNATIONAL DEAD - 07/03/13,LBI INTERNATIONAL NV,Fint Management B.V.,C001200105,Director,Bestuurder,...,0.0,0.0,1.0,-1.0,-1,-1.0,0.0,non,0,non-natural
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123508,CH0496451508,359,SOFTWAREONE HOLDING AG,CHCHE384378612,SOFTWAREONE HOLDING,SOFTWAREONE HOLDING AG,Daniel Von Stockar,P649256001,Chairman Supervisory Board,Chairman Supervisory Board,...,0.0,0.0,0.0,-1.0,-1,-1.0,0.0,0,0,natural
123509,CY0009731015,8461,XXI CENTURY INVESTMENTS PUBLIC LIMITED,CYC132218,XXI CENTURY INVS.PUBLIC DEAD - 03/12/13,XXI CEN.INVS.PUBLIC LTD.,Ms Stella Aristotelous,P649258636,Director,Director,...,,,,,,,,,,
123510,GRS228003000,4116,IMERYS INDUSTRIAL MINERALS GREECE S.A.,GR094000952,S&B INDUSTRIAL MRLS. DEAD - 12/07/13,S&B INDUSTRIAL MRLS.SA,Anastasia Amvrosiadou,P649259308,Chief Executive Officer,Chief Executive Officer,...,0.0,0.0,0.0,-1.0,-1,-1.0,0.0,0,0,natural
123511,CY0007400613,8843,CHRIS JOANNOU PUBLIC LTD,CYC12013,CHRIS IOANNOU,CHRIS IOANNOU PCL.,Mr Georgios Evripidou,P649264673,Director,Director,...,,,,,,,,,,


### Concatenating logic: 

I have merged both datas horizontally based on their index number which matched. This implies the following: data frame "manual_checked" had columns and values which were not part of data frame "data". The last operation merged the data and gave to the observations which were not consider in data frane "manual_checked" a NaN value. All the observations with NaN value are actual naturalPersons because they presented a gender attribute. These are the 119046 observations mentioned before.

In [25]:
final_df["NaturalPerson"] = final_df["NaturalPerson"].fillna(0)
final_df["NaturalPerson"] = final_df["NaturalPerson"].replace(to_replace=0, value = "natural")

In [26]:
final_df["NaturalPerson"] = np.where(final_df["NaturalPerson"].str.contains("non-natural"), 0, 1)

In [53]:
#final_df["NaturalPerson"].value_counts()
list(final_df.columns)

['isin',
 'v1',
 'companynamelatinalphabet',
 'bvdidnumber',
 'name',
 'cname',
 'dmfullname',
 'dmuciuniquecontactidentifie',
 'dmjobtitleinenglish',
 'dmjobtitle',
 'dmappointmentdate',
 'dmresignationdate',
 'dmtitle',
 'dmsalutation',
 'dmfirstname',
 'dmmiddlename',
 'dmlastname',
 'dmsuffix',
 'dmgender',
 'dmbirthdate',
 'dmage',
 'dmagebracket',
 'dmcountryiesofnationality',
 'dmaddress',
 'dmcountry',
 'dmemailaddress',
 'dmbiography',
 'dmtypeofrole',
 'dmboardcommitteeordepartmen',
 'dmlevelofresponsibility',
 'dmalsoashareholder',
 'dmconfirmationdates',
 'dmdateslastreceivedfromip',
 'dmnotvalidafterdate',
 'dminformationsources',
 'dminformationproviders',
 'dmcollege',
 'dmdegreecode',
 'dmmajor',
 'dmgraduationdate',
 'dmcorp',
 'gender',
 'dmcorrespondingbvdidwhenapp',
 'dmasanycategory',
 'dmcurrentorprevious',
 'dmbirthplace',
 'dmhasasignatoryright',
 'dmhasapowerofattorney',
 'dmnoofcosinwhichacurrent',
 'compensationsalaryeur',
 'FilterTitle',
 'FilterSymbols',
 '

#### Task 1: dummy variable "NaturalPerson" is done. 

__Result:__ 121336 observations are natural persons while 2176 aren't.

### Task 2: dummy variable: gender_manual

__Logic:__ 
- 119046 observations have gender given. 
- 4466 are insecure observations, from which after manual checking them I can say, 2176 are non-natural persons.
- 2290 should be checked / reviewed manually -> Here, I could check the data manually but there names I am not familiar with. For this reason, I have choosen to apply an algorithm to the column "dmfirstname".

In [47]:
## TO DO: Separate the 2290 observation that are natural persons and apply algo in them to define their gender.
gender = checked_data[checked_data["NaturalPerson"] == "natural"]
gender.to_csv("gender_insecure.csv")
dfGender = pd.read_table("gender_insecure.csv", engine="c", sep=",", index_col=0)

In [51]:
list(dfGender.columns)

['isin         ',
 'v1    ',
 'companynamelatinalphabet                                                                                                      ',
 'bvdidnumber    ',
 'name                                            ',
 'cname                    ',
 'dmfullname                                                                   ',
 'dmuciuniquecontactidentifie ',
 'dmjobtitleinenglish                                                                                                                                                              ',
 'dmjobtitle                                                                                                                                                                       ',
 'dmappointmentdate ',
 'dmresignationdate ',
 'dmtitle              ',
 'dmsalutation ',
 'dmfirstname                      ',
 'dmmiddlename              ',
 'dmlastname                                                                   ',
 'dmsuffix ',
 'd

In [52]:
dfGender["dmfirstname                      "].isnull().sum()

0

In [28]:
### Train Part

In [None]:
### Test Part

In [None]:
### TO DO: CLEAN .CSV AND UPLOAD IT TO LRZ

In [29]:
### TO DO: MERGE AND ENCODE BACK INSECURE OBSERVATIONS
