In [1]:
import numpy as np
import pandas as pd

In [2]:
from src.utils.clean import Clean

In [3]:
# Instantiate the project's cleaning helper once; later cells call its
# headers() and categories() methods on the scholarship DataFrame.
clean = Clean()

In [4]:
# Load the raw scholarship listings; the path is relative to the notebook's
# working directory.
data = pd.read_csv(filepath_or_buffer='../data/raw/data.csv')

In [5]:
# 'Unnamed: 0' is the leftover row index that was written into the CSV.
# errors='ignore' keeps this cell idempotent: re-running it (or loading a CSV
# that was saved without the index) no longer raises KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Preview the first five rows of the raw data.
data.head(5)

Unnamed: 0,title,degrees,funds,date,location
0,Schaefer Halleen’s Social Equity Scholarship,"Master, Bachelor, Phd",$1000,30 June 2022,united-states
1,President’s Scholarship for Ukraine,"Master, Bachelor, Phd, Course",$2500,31 July 2022,united-states
2,International Students Diversity Contest 2022-...,"Master, Bachelor, Phd, Course","Up to $2,000",22 December 2022,united-states
3,Master in Business Administration 80% OFF your...,Master,80% Tuition Fees,15 September 2022,united-states
4,Improve Men’s Health Scholarship,"Master, Bachelor, Phd",$2000,31 May 2022,united-states


# Fixing names

In [7]:
# Run the project's header-cleaning helper and rebind `data` to what it returns.
# NOTE(review): presumably normalises the column names; the names shown in the
# previews before and after are identical, so confirm in src/utils/clean.py.
data = clean.headers(data)

In [8]:
# Run the project's category-cleaning helper and rebind `data` to what it returns.
# NOTE(review): in the preview that follows, string values appear title-cased
# with hyphens replaced by spaces ('united-states' -> 'United States');
# presumably that happens here -- confirm in src/utils/clean.py. The
# title-casing also capitalises the letter after an apostrophe ("Men’S").
data = clean.categories(data)

In [9]:
# Preview the first five rows after the cleaning helpers have run.
data.head(5)

Unnamed: 0,title,degrees,funds,date,location
0,Schaefer Halleen’S Social Equity Scholarship,"Master, Bachelor, Phd",$1000,30 June 2022,United States
1,President’S Scholarship For Ukraine,"Master, Bachelor, Phd, Course",$2500,31 July 2022,United States
2,International Students Diversity Contest 2022 ...,"Master, Bachelor, Phd, Course","Up To $2,000",22 December 2022,United States
3,Master In Business Administration 80% Off Your...,Master,80% Tuition Fees,15 September 2022,United States
4,Improve Men’S Health Scholarship,"Master, Bachelor, Phd",$2000,31 May 2022,United States


# Fixing Degrees Column

In [10]:
# List every distinct value of the degrees column (NaN included).
data['degrees'].unique()

array(['Master, Bachelor, Phd', 'Master, Bachelor, Phd, Course', 'Master',
       'Course', 'Master, Course', 'Bachelor', 'Master, Bachelor, Course',
       'Phd', 'Bachelor, Master', 'Phd, Master', 'Master, Bachelor',
       'Course, Master', 'Master, Phd', 'Bachelor, Master, Phd, Course',
       'Bachelor, Master, Phd', 'Fully Funded', 'Not Funded', nan,
       'Phd, Master, Bachelor, Course', 'Phd, Master, Bachelor',
       'Bachelor, Course', 'Bachelor, Phd, Master'], dtype=object)

In [11]:
# Count the rows whose degrees value is missing.
data['degrees'].isna().sum()

49

In [12]:
# Label rows with no degree information explicitly.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `data.degrees`: that form mutates an intermediate
# Series, triggers a chained-assignment FutureWarning in pandas 2.x and leaves
# the DataFrame unchanged under Copy-on-Write.
data['degrees'] = data['degrees'].fillna('Not Specified')

In [13]:
# 'Not Funded' / 'Fully Funded' are not degree levels, so treat them as
# unspecified. The replacement is restricted to the degrees column: a
# DataFrame-wide replace would also rewrite exact matches in every other
# column (for example a 'Fully Funded' value in funds).
data['degrees'] = data['degrees'].replace(
    ['Not Funded', 'Fully Funded'], 'Not Specified'
)

In [14]:
# Build one 0/1 indicator column per degree level: 1 when the level's name
# occurs (case-insensitively) in the row's degrees string, else 0.
categories = ['master', 'bachelor', 'phd', 'course']
degrees_text = data['degrees']
for category in categories:
    mentions_category = degrees_text.str.contains(category, case=False, na=False)
    data[category] = mentions_category.astype(int)

In [15]:
# degrees is now represented by the indicator columns; funds and date are
# discarded as well.
data = data.drop(columns=['degrees', 'funds', 'date'])

In [16]:
# Preview the first five rows with the new indicator columns.
data.head(5)

Unnamed: 0,title,location,master,bachelor,phd,course
0,Schaefer Halleen’S Social Equity Scholarship,United States,1,1,1,0
1,President’S Scholarship For Ukraine,United States,1,1,1,1
2,International Students Diversity Contest 2022 ...,United States,1,1,1,1
3,Master In Business Administration 80% Off Your...,United States,1,0,0,0
4,Improve Men’S Health Scholarship,United States,1,1,1,0


In [17]:
# Remove rows that are exact duplicates across all remaining columns,
# keeping the first occurrence of each.
data = data.drop_duplicates()

In [18]:
# Persist the cleaned frame for further analysis. index=False keeps the row
# index out of the file, so it does not come back as an 'Unnamed: 0' column
# when the CSV is read again.
data.to_csv('../data/interim/data.csv', index=False)

# Conclusion
- Dropped the funds and date columns because there were too many unique values
- Split the degrees column into four indicator columns (master, bachelor, phd, course)
- Dropped the duplicates
- Saved the data for further analysis