In [1]:
# Dependencies
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer

In [141]:
# Importing data
df = pd.read_table("Lucas_unique_directors_naturalperson_gender.csv", header= None, sep= None, engine="python")

# Reshaping data frame
data_prev = df.rename(columns=df.iloc[0])
data_inter = data_prev.drop(labels=0,axis="index")
data = data_inter.drop("naturalperson", axis=1)
data # Desired data frame shape

Unnamed: 0,isin,v1,companynamelatinalphabet,bvdidnumber,name,cname,dmfullname,dmuciuniquecontactidentifie,dmjobtitleinenglish,dmjobtitle,...,dmcorp,gender,dmcorrespondingbvdidwhenapp,dmasanycategory,dmcurrentorprevious,dmbirthplace,dmhasasignatoryright,dmhasapowerofattorney,dmnoofcosinwhichacurrent,compensationsalaryeur
1,NL0012015705,2225,TAKEAWAY.COM N.V.,NL08142836,JUST EAT TAKEAWA,JUST EAT TAKEAWAY COM NV,GRIBHOLD B.V.,C000459947,Director,Bestuurder,...,Previous,,NL06089183,No,Previous,,No,No,0,
2,NL0000383800,1837,SMIT INTERNATIONALE NV,NL24004888,SMIT INTL.CERTS. DEAD - 05/05/10,SMIT INTL.,Boskalis Holding B.V.,C001024491,Directeur,Directeur,...,Current,,NL23056607,Yes,Current,,No,No,3,
3,NL0000289783,12368,ROBECO GLOBAL STARS EQUITIES FUND N.V.,NL24041906,ROBECO SUST.GLB. STARS EQTIES.FD.,ROBECO GLB.STARS EQ.FD.,Robeco Fund Management B.V.,C001055291,Director,Bestuurder,...,Previous,,,,,,,,,
4,NL0000350361,3289,EXACT HOLDING NV,NL27234422,EXACT HOLDING DEAD - 31/03/15,EXACT HOLDING NV,Exact Management B.V.,C001133291,Directeur,Directeur,...,Current,,NL27228442,No,Current,,No,No,,
5,NL0009508720,13696,LBI INTERNATIONAL N.V.,NL30277334,LBI INTERNATIONAL DEAD - 07/03/13,LBI INTERNATIONAL NV,Fint Management B.V.,C001200105,Director,Bestuurder,...,Previous,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123508,CH0496451508,359,SOFTWAREONE HOLDING AG,CHCHE384378612,SOFTWAREONE HOLDING,SOFTWAREONE HOLDING AG,Daniel Von Stockar,P649256001,Chairman Supervisory Board,Chairman Supervisory Board,...,Current,,,No,Current,,No,No,34,
123509,CY0009731015,8461,XXI CENTURY INVESTMENTS PUBLIC LIMITED,CYC132218,XXI CENTURY INVS.PUBLIC DEAD - 03/12/13,XXI CEN.INVS.PUBLIC LTD.,Ms Stella Aristotelous,P649258636,Director,Director,...,Current,F,,,,,,,,
123510,GRS228003000,4116,IMERYS INDUSTRIAL MINERALS GREECE S.A.,GR094000952,S&B INDUSTRIAL MRLS. DEAD - 12/07/13,S&B INDUSTRIAL MRLS.SA,Anastasia Amvrosiadou,P649259308,Chief Executive Officer,Chief Executive Officer,...,Current,,,No,Current,,No,No,1,
123511,CY0007400613,8843,CHRIS JOANNOU PUBLIC LTD,CYC12013,CHRIS IOANNOU,CHRIS IOANNOU PCL.,Mr Georgios Evripidou,P649264673,Director,Director,...,Current,M,,No,Current,,No,No,1,


### Task 1: Creating dummy variable 'natural person'

__Logic:__ Using custom binary technique based on string attributes. 
   1. I am checking if column dmfullname has special addressing techniques (e.g. Sir, Dr., Madam ...);
   2. I am checking if column dmfullname has any non-natural person attribute (e.g. numbers, punctuation);
   3. I am checking if column dmfullname has any non-natural person addressing techniques for main regions mapped in the data(e.g. company entity registrations like GmbH or Ltd);

In [3]:
# First filter: Title identifier

# Part 1: function special_person_address_identifier identifies which observations 
# have one of the titles options in dmfullname and creates intermediary columns named
# after the title list option with 1 for positive, e.g. there is a title in dmfullnane,
# or 0 if there is no.

def special_person_address_identifier(address):
    for i in address:
        data[i] = np.where(data["dmfullname"].str.contains(i), 1, 0)
    
titles = ["Sir", "Madam", "Ms", "Mr", "Mrs","Miss", "Dr", "Professor"]
special_person_address_identifier(titles)

# Part 2: I have merge all intermediary title columns, e.g. Sir, into one column called 'TitleCheck'.
# I have then used the same process as Part 1 to investigate which observations had a positive value of 1 and which not.
# This would indicate then which have a positive 'FilterTitle' status. The ones that have are likely natural person while
# the ones with a negative status, a.k.a 0 value, not.

data["TitleCheck"] = data[titles].apply(lambda row: "+" .join(row.values.astype(str)), axis=1)

data["FilterTitle"] = np.where(data["TitleCheck"].str.contains("1"), 1, 0)
data = data.drop(["Sir", "Madam", "Ms", "Mr", "Mrs","Miss", "Dr", "Professor","TitleCheck"], axis=1)
data["FilterTitle"].value_counts()

1    115982
0      7530
Name: FilterTitle, dtype: int64

In [4]:
# Second filter: Non-natural person identifier (Numbers, Signals and other)

def non_natural_identifier(symbols):
    for i in symbols:
        data[i] = np.where(data["dmfullname"].str.contains(i), 1, 0)

symbols_list = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
non_natural_identifier(symbols_list)

data["SymbolsCheck"] = data[symbols_list].apply(lambda row: "+" .join(row.values.astype(str)), axis=1)

data["FilterSymbols"] = np.where(data["SymbolsCheck"].str.contains("1"), 1, 0)
data = data.drop(["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "SymbolsCheck"], axis=1)
data["FilterSymbols"].value_counts() # Result shows that 120 observations are certainly not natural.

0    123392
1       120
Name: FilterSymbols, dtype: int64

In [5]:
# Third filter: Location / Region based company entity identifier
data["dmcountry"].value_counts()

def company_legal_entity_identifier(entities):
    for i in entities:
        data[i] = np.where(data["dmfullname"].str.contains(i), 1, 0)

# Main regions considered: Italy, UK, Germany, France, Austria, USA, Netherlands, Sweden, 
#                          Hungary, Luxembourg, Norway, Finland, Ireland and Greece.
regions_entity = ["S.p.a.", "S.r.l.", "Ltd", "PLC", "B.V.", "Limited", "GmbH", 
                       "AG", "UG", "e.V.", "SAS", "SARL", "SA", "OG", "KG", "LLC", 
                       "U.A.", "C.V.", "AB", "HB", "KB", "Zrt.", "Kft.", "S.à r.l.", 
                       "S.A.", "SCSp", "LLCs", "Oy", "LTD","L.T.D.","P.C."]

company_legal_entity_identifier(regions_entity)

data["EntitiesCheck"] = data[regions_entity].apply(lambda row: "+" .join(row.values.astype(str)), axis=1)

data["FilterEntities"] = np.where(data["EntitiesCheck"].str.contains("1"), 1, 0)
data = data.drop(["S.p.a.", "S.r.l.", "Ltd", "PLC", "B.V.", "Limited", "GmbH", 
                       "AG", "UG", "e.V.", "SAS", "SARL", "SA", "OG", "KG", "LLC", 
                       "U.A.", "C.V.", "AB", "HB", "KB", "Zrt.", "Kft.", "S.à r.l.", 
                       "S.A.", "SCSp", "LLCs", "Oy", "LTD","L.T.D.","P.C.", "EntitiesCheck"], axis=1)

data["FilterEntities"].value_counts() # Result shows that 120 observations are certainly not natural.

0    121992
1      1520
Name: FilterEntities, dtype: int64

##### Intermediary Results:

__Dummy Variables:__
  1. __FilterTitle__ indicates, at current moment that, 115982 observations have one of the titles investigated, e.g. Sir, while 7530 doesnt. 
  2. __FilterSymbols__ indicates that 120 observations are certainly not natural persons because they have numeric values in dmfullname.
  3. __FilterEntities__ indicates that 1520 observations have company legal entity names in dmfullname and are therefore certaily not natural persons.

In [6]:
# Looking for potential columns that have a good amount of observations filled.
a = list(data["dmage"].value_counts())
sum(a)

83491

In [7]:
aa = list(data["dmbirthdate"].value_counts())
sum(aa)

82100

In [8]:
b = list(data["dmgender"].value_counts())
sum(b)

119046

##### New filter ideas

Columns dmage, dmbirthdate and dmgender are promising columns to reduce the number of observations I have to check manually once all these columns have more than 80000 filled. Based on this I will continue the filtering process of dummz variable natural_person. 

In [9]:
# Filter Age, Birthdate and Gender

# Coming up with a way to represent empty cells under age , birthday and gender. In this case by filling empty cells with the value -1.
data["FilterAge"] = data["dmage"].fillna(value="-1")  
data["FilterBirthDate"] = data["dmbirthdate"].fillna(value="-1")
data["FilterGender"] = data["dmgender"].fillna(value="-1")

In [10]:
data["FilterGender"].value_counts()

M     98577
F     20469
-1     4466
Name: FilterGender, dtype: int64

In [11]:
# Intermediary csv file with only gender (-1) observations. 
# EVERYTHING HAPPENING FROM HERE IN DATA: MANUAL, IS A TEST.
manual = data[data["FilterGender"] == "-1"]
manual.to_csv("manual_gender_check.csv") # Here, I would have to check 4446 observations.

### Star rating logic

Right now we have 6 filters.  Out of them, 2 are very secure, namely "FilterSymbols" & "FilterEntities", because if you have positive values there you are certainly a non-natural person.

We remain with 4 filters, each has a dummy variable, consequently providing to us 2 variables - therefore, there are 8 variables in questions. There are possibly 8! permutations of these variables - very painful to condition.

At the moment, I can either check 4466 observations manualy or try to scope down further.

The scoping will follow a score mechanism based on only 3 filters: Age, BirthDate and Gender because Filtertitle can be the most misleading and laborious due to these facts: a lot observations to cover in the revision as well as to the possibility of existing companies such as "Dr. Schwarz" or who knows what else.

__Score Structure__

Score can range from 0 to 3 stars. 2 and 3 stars observations won't be revised manually. Observations gain stars based on the existence of a cell value for that attribute or not.

Remember that if an observation had no value in its cell in the columns of FilterAge, FIlterBirthDate and FilterGender they were filled up with value -1. Therefore, the following conditions are being used:



- Age != -1 , BirthDate != -1 , Gender != -1 -> __3 stars rating__
- Age == -1 , BirthDate != -1 , Gender != -1 -> __2 stars rating__
- Age != -1 , BirthDate == -1 , Gender != -1 -> __2 stars rating__
- Age != -1 , BirthDate != -1 , Gender == -1 -> __2 stars rating__

##### Cases that were revised manually
- Age == -1 , BirthDate == -1 , Gender != -1 -> __1 stars rating__
- Age == -1 , BirthDate != -1 , Gender == -1 -> __1 stars rating__
- Age != -1 , BirthDate == -1 , Gender == -1 -> __1 stars rating__
- Age == -1 , BirthDate == -1 , Gender != -1 -> __1 stars rating__
- Age == -1 , BirthDate == -1 , Gender == -1 -> __0 stars rating__

In [12]:
# List of conditions
conditions = [
    (manual['FilterAge'] != "-1") & (manual['FilterBirthDate'] != "-1") & (manual['FilterGender'] != "-1"),
    (manual['FilterAge'] == "-1") & (manual['FilterBirthDate'] != "-1") & (manual['FilterGender'] != "-1"),
    (manual['FilterAge'] != "-1") & (manual['FilterBirthDate'] == "-1") & (manual['FilterGender'] != "-1"),
    (manual['FilterAge'] != "-1") & (manual['FilterBirthDate'] != "-1") & (manual['FilterGender'] == "-1"),
    (manual['FilterAge'] == "-1") & (manual['FilterBirthDate'] == "-1") & (manual['FilterGender'] != "-1"),
    (manual['FilterAge'] == "-1") & (manual['FilterBirthDate'] != "-1") & (manual['FilterGender'] == "-1"),
    (manual['FilterAge'] != "-1") & (manual['FilterBirthDate'] == "-1") & (manual['FilterGender'] == "-1"),
    (manual['FilterAge'] == "-1") & (manual['FilterBirthDate'] == "-1") & (manual['FilterGender'] != "-1"),
    (manual['FilterAge'] == "-1") & (manual['FilterBirthDate'] == "-1") & (manual['FilterGender'] == "-1"),
]

# create a list of the values we want to assign for each condition
values = ['3', '2', '2', '2', '1', '1', '1', '1', '0']

# create a new column and use np.select to assign values to it using our lists as arguments
manual['score'] = np.select(conditions, values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manual['score'] = np.select(conditions, values)


In [13]:
manual["score"].value_counts()

0    4153
2     301
1      12
Name: score, dtype: int64

__Result Scoring:__ Based on the first score mechanism I can see that out of 4466 observations, 301 are scored with 2 stars and therefore secure.

### Extending Score mechanism

__Logic:__
- Score != 2 & FilterEntity == 1 -> 'Non'
- Score != 2 & FilterSymbols == 1 -> 'Non'
- Score != 2 & FilterSymbols != 1 -> 'Non'
- Score != 2 & FilterSymbols != 1 -> 'Non'

In [14]:
ext_cond = [(manual["FilterEntities"] == 1) & (manual["score"] != 2),
            (manual["FilterSymbols"] == 1) & (manual["score"] != 2),
            
]

ext_values = [
    "non", "non"
]

manual["ExtendedScore"] = np.select(ext_cond, ext_values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manual["ExtendedScore"] = np.select(ext_cond, ext_values)


In [15]:
manual["ExtendedScore"].value_counts()

0      3683
non     783
Name: ExtendedScore, dtype: int64

In [16]:
### Final extension to score mechanism
final_conditions = [(manual["ExtendedScore"] != "non") & (manual["FilterTitle"] == 1),
                    
]

final_values = [
    "natural",
]

manual["ExtendedFinalScore"] = np.select(final_conditions, final_values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manual["ExtendedFinalScore"] = np.select(final_conditions, final_values)


In [17]:
manual["ExtendedFinalScore"].value_counts()

0          4380
natural      86
Name: ExtendedFinalScore, dtype: int64

## Final thoughts on the preprocessing before I engage in manual work.

### Remember that I am operating within the 4466 observations that are not certain.

__Overview results:__

- Encoding dmfullname based on natural person titles: 115982 observations have titles / 7530 observations have no titles.
- Enconding dmfullname based on numeric symbols: 123392 observations have no numbers / 120 observations have numbers in dmfullname.
- Encoding dmfullname based on entitity legal titles: 121992 observations have no entity titles / 1520 observations have entity titles in dmfullname.

- Encoding dmAge, dmBirthDate & dmGender by checking if a value exists or not: 
    - dmGender -> Males: 98577 and Females: 20469 which together (a.k.a Natural) = 119046; -1 (a.k.a Non-natural and possible natural) = 4466
    - dmAge -> Age given: 83491 , Age not given: Total Observations - 83491
    - dmBirthDay -> BirthDay given: 82100, BirthDay not given: Total Observations - 82100

__Upon Ecoding Strategies, summary of Scoring mechanism:__

- 3 scores were created
    - Column 'score' is a variable dependent on a given or not value encoding based on dmAge, dmGender and dmBirthDate; -> 301 observations less to manually check;
    - Column 'ExtendedScore' is a variable dependent on a the scoring mechanism of column 'Score' and filters 'FilterSymbols' and 'FilterEntities'. -> 783 observations less to manually check;
    - Column 'ExtendedFinalScore' is a variable dependent on scoring mechanism 'ExtendedScore' and column FilterTitle. -> 86 observations less to manually check.
    
### Final steps:

In [18]:
score_conditions = [
    (manual['ExtendedFinalScore'] == 'natural'),
    (manual['ExtendedFinalScore'] != 'natural') & (manual['ExtendedScore'] == 'non'),
    (manual['ExtendedFinalScore'] != 'natural') & (manual['ExtendedScore'] != 'non')  
]

score_value = ["natural",
               "non-natural",
               "revision"          
]

manual["NaturalPerson"] = np.select(score_conditions, score_value)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manual["NaturalPerson"] = np.select(score_conditions, score_value)


In [19]:
manual["NaturalPerson"].value_counts()
manual.to_csv("revision.csv") # Here, I would have to check 3597 observations.

#### I need to revise 3597 instead of 4466 to define dummy variables NaturalPerson and GenderManual officialy.

#### Final steps for task:

1. Embeded checked manual data with secure data.
2. Finish 'NaturalPerson' encoding
3. Create GenderManual dummy

### Merging logic: 

I have merged both datas horizontally based on their index number which matched. This implies the following: data frame "manual_checked" had columns and values which were not part of data frame "data". The last operation merged the data and gave to the observations which were not consider in data frane "manual_checked" a NaN value. All the observations with NaN value are actual naturalPersons because they presented a gender attribute. These are the 119046 observations mentioned before.

In [27]:
checked_data = pd.read_table("revision_manual.csv", sep=",", index_col=0)
checked_data["NaturalPerson"].value_counts()

natural        2290
non-natural    2176
Name: NaturalPerson, dtype: int64

In [68]:
data = data.astype(str)
checked_data = checked_data.astype(str)

final = data.combine_first(checked_data) # CHECKING
final["NaturalPerson"].value_counts() # Merge numbers match numbers from "checked_data"
#final

natural        2290
non-natural    2176
Name: NaturalPerson, dtype: int64

In [70]:
final["NaturalPerson"] = final["NaturalPerson"].fillna("natural") # Filling, NaN observations with Natural because all of them had Gender.
final["NaturalPerson"].value_counts() # Done. 

natural        121336
non-natural      2176
Name: NaturalPerson, dtype: int64

#### Task 1: dummy variable "NaturalPerson" is done. 

__Result:__ 121336 observations are natural persons while 2176 aren't.

### Task 2: dummy variable: gender_manual

__Logic:__ 
- 119046 observations have gender given. 
- 4466 are insecure observations, from which after manual checking them I can say, 2176 are non-natural persons.
- 2290 should be checked / reviewed manually -> Here, I could check the data manually but there names I am not familiar with. For this reason, I have choosen to apply an algorithm to the column "dmfirstname".

In [71]:
## TO DO: Separate the 2290 observation that are natural persons and apply algo in them to define their gender.
genderize = checked_data[checked_data["NaturalPerson"] == "natural"]
genderize # 2290 observations that are subjects of gender classification are ready!

Unnamed: 0,isin,v1,companynamelatinalphabet,bvdidnumber,name,cname,dmfullname,dmuciuniquecontactidentifie,dmjobtitleinenglish,dmjobtitle,...,FilterTitle,FilterSymbols,FilterEntities,FilterAge,FilterBirthDate,FilterGender,score,ExtendedScore,ExtendedFinalScore,NaturalPerson
190,FR0000060618,60,RALLYE,FR054500574,RALLYE,RALLYE,Fonciere Euris,C003873150,Director (Board of Directors),Director (Board of Directors),...,0,0,0,-1,-1,-1,0,0,0,natural
297,GB00B39QB067,4037,THE RETHINK GROUP LIMITED,GB05078352,RETHINK GROUP (THE) DEAD - 09/12/14,RETHINK GROUP PLC.,ST JAMES DIRECTORS LIMITED,C007514081,Director (occupation: Limited Company),Director (occupation: Limited Company),...,0,0,0,-1,-1,-1,0,0,0,natural
560,BE0974280126,9221,BONE THERAPEUTICS SA,BE0882015654,BONE THERAPEUTIC,BONE THERAPEUTIC,Enrico Bastianelli,C058959216,Director,Administrateur,...,0,0,0,-1,-1,-1,0,0,0,natural
655,BE0003766806,2712,ION BEAM APPLICATIONS SA,BE0428750985,ION BEAM APPLICATIONS,ION BEAM APPLICATIONS SA,CONSULTANCE MARCEL MILLER,C112065648,Director,Administrateur,...,0,0,0,-1,-1,-1,0,0,0,natural
785,BE0003678894,3322,BEFIMMO SA,BE0455835167,BEFIMMO,BEFIMMO SCA,KADEES,C178679878,Director,Administrateur,...,0,0,0,-1,-1,-1,0,0,0,natural
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123505,MT0000150103,8963,LOQUS HOLDINGS PLC,MTC27140,LOQUS HOLDINGS,LOQUS HOLDINGS PLC.,Joannie Grima,P649250964,Director,Director,...,0,0,0,-1,-1,-1,0,0,0,natural
123506,GRS354003006,7456,KTIMA KOSTAS LAZARIDIS S.A,GR051520819000,KTIMA KOSTAS LAZARIDIS,KTIMA KOSTAS LAZARIDIS,Julia Lazaridis,P649252635,Chairman,Chairman,...,0,0,0,-1,-1,-1,0,0,0,natural
123507,GRS354003006,7456,KTIMA KOSTAS LAZARIDIS S.A,GR051520819000,KTIMA KOSTAS LAZARIDIS,KTIMA KOSTAS LAZARIDIS,Triantaphyllos Tsamantanis,P649252636,Non-Executive-Independent,Non-Executive-Independent,...,0,0,0,-1,-1,-1,0,0,0,natural
123508,CH0496451508,359,SOFTWAREONE HOLDING AG,CHCHE384378612,SOFTWAREONE HOLDING,SOFTWAREONE HOLDING AG,Daniel Von Stockar,P649256001,Chairman Supervisory Board,Chairman Supervisory Board,...,0,0,0,-1,-1,-1,0,0,0,natural


In [72]:
genderize["dmfirstname"].value_counts().sum() # Out of those 2290, 2144 have a dmfirstname given. Which implies that 146 observations would still have to be checked manually.
genderize["dmfirstname"] = genderize["dmfirstname"].fillna(value="revision") # Marking all manual checks with 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genderize["dmfirstname"] = genderize["dmfirstname"].fillna(value="revision") # Marking all manual checks with 0


In [73]:
genderize["dmfirstname"].value_counts() # Checking progress

nan            146
David           16
Michael         16
John            13
Janis           12
              ... 
Danuta           1
Trond            1
Arja             1
Deborah          1
Karen-Marie      1
Name: dmfirstname, Length: 1499, dtype: int64

In [74]:
gender_revision_manual = genderize[genderize["dmfirstname"] == "revision"]
gender_revision_manual.to_csv("gender_revision_manual.csv") # Going manual

# Merging rquirements
gender_checked = pd.read_table("gender_revision_manual 2.csv", sep=",", index_col=0)
gender_checked = gender_checked.astype(str)

In [136]:
# Merge
intermediary_obj = gender_checked.combine_first(genderize) # Updated and manually check no dmfirstname observations that are natural persons
returning_intermediary_to_final = intermediary_obj.combine_first(final) # Adding back some mistakes regarding "naturalpersons". Previously 2176 non-natural, now 2195.

df_apply_classifier = returning_intermediary_to_final[returning_intermediary_to_final["NaturalPerson"] != "non-natural"]
df_apply_classifier["dmfirstname"].value_counts().sum() # Verification, that all my NaturalPerson have actually a dmfirstname to apply the gender classifier.

df_apply_classifier_predict = df_apply_classifier[df_apply_classifier["dmgender"] == "nan"]
df_apply_classifier_names = df_apply_classifier_predict["dmfirstname"]
df_apply_classifier_names.value_counts().sum() # Checking the amount observations I have to predict and numbers matches.

2271

In [137]:
# Credits for the inspiration: https://github.com/Jcharis/Python-Machine-Learning/tree/master/Gender%20Classification%20With%20%20Machine%20Learning

# Setup of classifier
names_data = pd.read_csv('names_dataset.csv') # Total of: 181800 female names & 103275 male names
names_data.sex.replace({'F':0,'M':1},inplace=True) # Female is 0, Male is 1

Xfeatures =names_data['name']

# Feature Extraction 
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [138]:
### Train Part
from sklearn.model_selection import train_test_split

# Labels
y = names_data.sex
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.6398163206734908

In [154]:
### Test / Prediction Part: Female is 0, Male is 1

def genderpredictor(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")
        
# Gender list that I want to predict in my model
namelist = list(df_apply_classifier_names)

for i in namelist:
    predicted[i] = genderpredictor(i)

Female
Male
Female
Female
Female
Female
Male
Female
Female
Male
Male
Male
Female
Male
Male
Female
Female
Female
Female
Female
Female
Male
Female
Male
Male
Female
Female
Female
Male
Female
Male
Female
Female
Female
Male
Female
Male
Female
Male
Female
Female
Female
Male
Female
Male
Female
Female
Male
Female
Male
Female
Female
Female
Female
Female
Male
Male
Male
Female
Male
Female
Male
Male
Male
Female
Female
Female
Female
Female
Female
Male
Male
Female
Male
Female
Male
Female
Female
Female
Female
Female
Female
Male
Female
Female
Female
Male
Female
Male
Female
Female
Female
Female
Male
Female
Female
Female
Male
Male
Female
Female
Male
Female
Female
Female
Female
Female
Female
Male
Female
Female
Female
Female
Male
Female
Female
Female
Male
Female
Male
Male
Female
Female
Female
Female
Female
Female
Female
Female
Female
Female
Male
Female
Male
Male
Female
Male
Female
Female
Male
Female
Female
Female
Female
Female
Female
Male
Female
Female
Female
Female
Female
Female
Female
Female
Female
Fema

Male
Male
Female
Female
Female
Female
Female
Female
Male
Female
Female
Male
Female
Female
Female
Female
Male
Female
Male
Female
Female
Female
Female
Male
Male
Male
Female
Female
Female
Male
Female
Female
Male
Female
Male
Female
Female
Female
Female
Female
Male
Male
Male
Male
Male
Male
Male
Female
Female
Male
Female
Male
Female
Male
Female
Female
Female
Female
Female
Female
Female
Male
Male
Female
Male
Female
Female
Female
Male
Female
Female
Female
Female
Female
Female
Female
Male
Female
Female
Female
Female
Female
Female
Female
Female
Male
Male
Female
Female
Female
Male
Male
Male
Female
Male
Male
Female
Male
Female
Female
Female
Female
Male
Female
Female
Female
Male
Female
Female
Female
Female
Female
Female
Female
Female
Female
Male
Female
Female
Female
Female
Male
Male
Female
Female
Female
Male
Female
Male
Female
Female
Male
Female
Female
Female
Female
Female
Female
Female
Female
Male
Female
Female
Female
Female
Female
Female
Female
Male
Female
Female
Female
Male
Female
Female
Male
Ma

In [160]:
namelist

['Fonciere',
 'Enrico',
 'Gianbeppi',
 'Marc',
 'Thomas',
 'Amaury',
 'Benoit',
 'Willem',
 'Johan',
 'Gerard',
 'Patrick',
 'Yves',
 'Julian',
 'Leon',
 'Gregory',
 'Barbara',
 'Lidia',
 'Floris',
 'Israel',
 'Israel',
 'Robert',
 'Nicolas',
 'Vander',
 'Lothar',
 'Stefan',
 'Michel',
 'Jo',
 'Benedikte',
 'Jan',
 'Christina',
 'Russell',
 'Jean',
 'Dirk',
 'Thierry',
 'Etienne',
 'Sigrid',
 'Gaetan',
 'Stephane',
 'Guido',
 'Chris',
 'Christine',
 'Arthur',
 'Pierre',
 'Dirk',
 'Francois',
 'Frank',
 'Cedric',
 'Eric',
 'Frank',
 'Pierre',
 'Thierry',
 'Roland',
 'Paul',
 'Francis',
 'Evelyn',
 'Patrick',
 'Baron',
 'Julien',
 'Jacquot',
 'Luc',
 'Francis',
 'Matthieu',
 'Alain',
 'Alain',
 'Christian',
 'Didrik',
 'Kris',
 'Henri',
 'Michael',
 'Gustaaf',
 'Emmanuel',
 'Mark',
 'Patricia',
 'Rudy',
 'Maryse',
 'Etienne',
 'Dominique',
 'Louis',
 'Butland',
 'David',
 'Marion',
 'Inge',
 'Guido',
 'Andre',
 'Vincent',
 'Rosemary',
 'Sebastien',
 'Prudence',
 'Yannick',
 'Cyrille',
 '

In [95]:
### TO DO: MERGE AND ENCODE BACK INSECURE OBSERVATIONS


In [None]:
### TO DO: CLEAN .CSV AND UPLOAD IT TO LRZ

# Big error: I just neet to predict 2271 not all dmfirstnames. Separate data.