# Feature Creation / Feature Engineering


Let's add some new features to our data set:
 - ~~Region~~
 - University ranking
 - ~~Classify whether the master's is in science, arts, business, etc.~~
 - ~~See how many of them have Data Science in the description or degree~~

In [144]:
# Standard library first, third-party packages afterwards.
import re
import warnings

import pandas as pd

# Silence every warning for the remainder of the notebook.
warnings.filterwarnings('ignore')

### Region - State

In [145]:
# World cities table downloaded from simplemaps:
# https://simplemaps.com/data/de-cities
cities = pd.read_csv(filepath_or_buffer='worldcities.csv')


In [146]:
# Load the cleaned DAAD programmes table.
# NOTE(review): the merged frame displayed further down has an 'Unnamed: 0'
# column, so this CSV was probably written together with its index — consider
# index_col=0 here once downstream use of that column is confirmed.
programmes = pd.read_csv(filepath_or_buffer='./Database/DAAD_data_base_cleaned.csv')

In [147]:
# Keep only the German rows in an independent frame. reset_index() (without
# drop=True) renumbers the rows 0..n-1 and keeps the previous row labels in
# a new 'index' column.
is_german = cities['country'] == 'Germany'
cities_ger = cities.loc[is_german].copy().reset_index()

In [148]:
# Correct some misspelled city names.
# A single .loc[row, column] assignment is used instead of chained indexing
# (cities_ger.city[i] = ...): a chained assignment may write to a temporary
# object and has no effect at all when pandas Copy-on-Write is active.
# NOTE(review): the keys are row labels of cities_ger after reset_index(),
# so they are tied to the exact contents of worldcities.csv — TODO confirm
# they still point at the intended cities whenever that file is refreshed.
city_name_fixes = {
    30: 'Halle (Saale)',
    339: 'Wedel (Holstein)',
    403: 'Friedberg (Hessen)',
    185: 'Neustadt an der Weinstraße',
}
for row_label, fixed_name in city_name_fixes.items():
    cities_ger.loc[row_label, 'city'] = fixed_name

In [149]:
# Add cities that are missing from the world-cities table.
# Only 'city', 'admin_name' and 'population' are supplied; every other column
# of cities_ger is filled with NaN for these rows.
new_cities = pd.DataFrame({
    'city': [
        'Hermannsburg',
        'Eggenstein-Leopoldshafen',
        'Weiden in der Oberpfalz',
        'Köthen / Anhalt',
        'Bergholz-Rehbrücke',
    ],
    'admin_name': [
        'Lower Saxony',
        'Baden-Württemberg',
        'Bavaria',
        'Saxony-Anhalt',
        'Brandenburg',
    ],
    'population': [8061, 17000, 42743, 25641, 6352],
})
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the two frames.
cities_ger = pd.concat([cities_ger, new_cities], ignore_index=True)

In [150]:
# Normalise the city names in programmes['City'] (presumably so that they
# line up with cities_ger.city in the merge that follows).
# Each pair is a plain substring replacement and the pairs are applied in the
# order listed, so later pairs see the output of earlier ones: 'München' is
# turned into 'Munich' first, which is why the two Garching entries look for
# '... Munich' and map it back to 'Garching bei München'.
city_renames = (
    ('München', 'Munich'),
    ('Köln', 'Cologne'),
    ('Nürnberg', 'Nuremberg'),
    ('Frankfurt am Main', 'Frankfurt'),
    ('Garching b. Munich', 'Garching bei München'),
    ('Landau in der Pfalz', 'Landau'),
    ('Esslingen am Neckar', 'Esslingen'),
    ('Biberach an der Riß', 'Biberach'),
    ('Ludwigshafen am Rhein', 'Ludwigshafen'),
    ('Mülheim an der Ruhr', 'Mülheim'),
    ('Weidenbach', 'Ansbach'),
    ('Leer (Ostfriesland)', 'Leer'),
    ('Hoppstädten-Weiersbach', 'Birkenfeld'),
    ('Bernburg / Saale', 'Bernburg'),
    ('Garching bei Munich', 'Garching bei München'),
    ('Kempten (Allgäu)', 'Kempten'),
)


def _rename_city(name):
    """Apply every (old, new) replacement from city_renames to name, in order."""
    for old, new in city_renames:
        name = name.replace(old, new)
    return name


programmes['City'] = programmes['City'].apply(_rename_city)

In [151]:
# Left-join the state name (admin_name) and population onto every programme,
# matching programmes['City'] against cities_ger['city'].
# NOTE(review): if cities_ger holds several rows with the same city name, a
# left join repeats the matching programme rows — worth checking the row count.
city_info = cities_ger[['city', 'admin_name', 'population']]
programmes = programmes.merge(city_info, how='left', left_on='City', right_on='city')

In [152]:
# The lowercase 'city' column came from cities_ger and only served as the
# join key; remove it again.
programmes = programmes.drop(columns=['city'])

Unnamed: 0,Course ID,University,Programme,Degree,Teaching language,Languages,Programme duration,Beginning,Application deadline,Tuition fees per semester in EUR,...,Deadline_in_October,Deadline_in_November,Deadline_in_December,Master,PhD,Bachelor,Contribution per semester,Total contribution,admin_name,population
0,4000,Friedrich Schiller University Jena,Master of Science in Molecular Medicine MSc Mo...,Master of Science in Molecular Medicine,English,Courses are held in English.,4 semesters,Winter semester,"31 May for the following winter semester, appl...",0,...,0,0,0,1,0,0,230,920,Thuringia,111407.0
1,4001,University of Augsburg,Master of Science in Software Engineering Mast...,Master of Science in Software Engineering,English,The courses are in English unless all students...,4 semesters,Winter semester,"1 May, for all applicants",0,...,0,0,0,1,0,0,115,460,Bavaria,295135.0
2,4002,Philipps-Universität Marburg,German as a Foreign Language (MA) German as a ...,Master of Arts,German,German in courses and for the Master's thesis,4 semesters,Winter semester,15 July for the following winter semester,0,...,0,0,0,1,0,0,335,1340,Hesse,76851.0
3,4003,University of Stuttgart,Graduate School Simulation Technology (GS SimT...,PhD,German English,"English about 60%, German about 40%",6 semesters,Only for doctoral programmes: any time,No application deadline,-1,...,0,0,0,0,1,0,180,1074,Baden-Württemberg,634830.0
4,4004,University of Cologne,Bonn-Cologne Graduate School of Physics and As...,Doctoral Degree in Physics,English,The entire programme is taught in English.,6 semesters,Only for doctoral programmes: any time,No deadlines,0,...,0,0,0,0,1,0,290,1740,North Rhine-Westphalia,1085664.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1700,3993,University of Applied Sciences Ravensburg-Wein...,Electrical Engineering and Information Technol...,Bachelor of Engineering (BEng),German English,"English (semesters 1 to 4), German (semesters ...",7 semesters,Summer semester,15 November for the following summer semester,1500,...,0,1,0,0,0,1,170,11690,Baden-Württemberg,24943.0
1701,3995,Otto von Guericke University Magdeburg,"International Management, Marketing, Entrepren...","Master of Science in International Management,...",English,Courses and examinations are held exclusively ...,4 semesters,Winter and summer semester,International applicants: 15 June for the foll...,0,...,0,0,1,1,0,0,128,512,Saxony-Anhalt,238697.0
1702,3996,Dortmund University of Applied Sciences and Arts,European Master's in Project Management (EuroM...,Master of Arts (MA),English,Courses are held in English (100%). Students c...,4 semesters,Winter semester,Application deadline: 15 July for the followin...,0,...,0,0,0,1,0,0,310,1240,North Rhine-Westphalia,587010.0
1703,3997,FH Aachen University of Applied Sciences,Bachelor of Science in Applied Chemistry Bache...,Bachelor of Science in Applied Chemistry,German,The course of study is conducted in German. A ...,6 semesters,Winter semester,1 July for non-EU applicants 15 July for EU ap...,0,...,0,0,0,0,0,1,320,1920,North Rhine-Westphalia,32632.0


### Type of study

- Arts (MA)
- Science (MSc)
- Business (MBA)
- Engineering
- Laws

In [120]:
# First look: rows 1600-1649 (by position) of the 'Programme' column.
programmes['Programme'].iloc[1600:1650]

1600                  Power Engineering Power Engineering
1601    Master of Peace and Conflict Studies (MA) (Int...
1602    International Master's Programme in Computatio...
1603    Bonn-Cologne Graduate School of Physics and As...
1604    BA European Cultures and Society (EUCS) BA Eur...
1605                          Biotechnology Biotechnology
1606    International Max Planck Research School on Tr...
1607    Master of Arts (MA) in Interdisciplinary Latin...
1608    Graduate School of North American Studies (GSN...
1609    East European Studies (MA) East European Studi...
1610    International Max Planck Research School (IMPR...
1611    Business Administration with Informatics Busin...
1612    International Master of Landscape Architecture...
1613    Master of Science in Water Science Master of S...
1614    Bachelor of Science in Industrial Engineering ...
1615    International Master in Agricultural Managemen...
1616    Biochemistry and Molecular Biology (BMB) Bioch...
1617    Gradua

In [153]:
# Masters
# One 0/1 indicator column per master's degree type, derived from the text in
# programmes['Degree'] (1 when the pattern is found anywhere in the string).
# Pattern notes:
#  - ' +(?:of +)?' makes the word "of" optional as a unit. The earlier
#    '(of|) ' form required two spaces when "of" was absent, so a degree
#    written "Master Science" was never recognised.
#  - the abbreviation MA is matched as a whole word (\bMA\b) so that it does
#    not fire inside longer tokens such as "MAS".
#  - the remaining abbreviations (MSc, MBA, MEng) are matched as before.
master_patterns = {
    'Master of Science': r'[Mm]aster +(?:of +)?[Ss]cience|[Mm][Ss][Cc]',
    'Master of Arts': r'[Mm]aster +(?:of +)?[Aa]rts?|\bMA\b',
    'MBA': r'[Mm]aster +(?:of +)?[Bb]usiness|MBA',
    'Master of Engineering': r'[Mm]aster +(?:of +)?[Ee]ngineering|M[Ee](?:ng|NG)',
    'Master of Laws': r'[Mm]aster +(?:of +)?[Ll]aws?',
}
for column, pattern in master_patterns.items():
    matcher = re.compile(pattern)
    # apply() runs immediately, so the lambda always sees this iteration's matcher.
    programmes[column] = programmes['Degree'].apply(
        lambda degree: 0 if matcher.search(degree) is None else 1
    )

In [154]:
# General topic
# One 0/1 indicator column per broad subject area, derived from the text in
# programmes['Degree'] (1 when the pattern is found anywhere in the string).
# The Arts pattern uses word boundaries: the earlier '(A|a)rt|MA|BA' form also
# matched "BA" inside "MBA" and "art" inside words such as "Earth" or
# "Department", which flagged business and earth-science degrees as Arts.
# The other patterns match exactly what they matched before.
topic_patterns = {
    'Science': r'[Ss]cience|MSc',
    'Arts': r'\b[Aa]rts?\b|\b(?:MA|BA)\b',
    'Business': r'[Bb]usiness|MBA|Administration',
    'Engineering': r'[Ee]ngin|MEng|BEng',
    'Laws': r'[Ll]aws?',
}
for column, pattern in topic_patterns.items():
    matcher = re.compile(pattern)
    # apply() runs immediately, so the lambda always sees this iteration's matcher.
    programmes[column] = programmes['Degree'].apply(
        lambda degree: 0 if matcher.search(degree) is None else 1
    )

# Data-related programmes are detected from the programme title, not the degree.
data_matcher = re.compile(r'[Dd]ata|Analytics?|Intelligence|Machine(?: Learning)?')
programmes['Data'] = programmes['Programme'].apply(
    lambda title: 0 if data_matcher.search(title) is None else 1
)

In [155]:
# Persist the feature-enriched table; the row index is not written.
programmes.to_csv(path_or_buf='./Database/DAAD_data_base_featured.csv', index=False)