# ETL 
------
- Cleaning the Data Science Salaries CSV: dropping all part-time (PT), freelance (FL), and contract (CT) employee records, because together they make up only about 3% of the data (19 of 607 rows)
- Convert CSV to JSON File


In [1]:
import pandas as pd
import numpy as np
import csv
import json

In [2]:
# Load the raw salaries CSV and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="ds_salaries.csv")
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


In [5]:
# Summarise the employment_type column (count, distinct values, most frequent value).
data["employment_type"].describe()

count     607
unique      4
top        FT
freq      588
Name: employment_type, dtype: object

In [6]:
# List the distinct employment_type codes present in the data.
data["employment_type"].unique()

array(['FT', 'CT', 'PT', 'FL'], dtype=object)

In [7]:
# Keep only the rows whose employment_type is "FT"; .copy() makes the result
# an independent frame rather than a slice of `data`.
is_full_time = data["employment_type"].eq("FT")
fullTime = data.loc[is_full_time].copy()
fullTime

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L
...,...,...,...,...,...,...,...,...,...,...,...,...
602,602,2022,SE,FT,Data Engineer,154000,USD,154000,US,100,US,M
603,603,2022,SE,FT,Data Engineer,126000,USD,126000,US,100,US,M
604,604,2022,SE,FT,Data Analyst,129000,USD,129000,US,0,US,M
605,605,2022,SE,FT,Data Analyst,150000,USD,150000,US,100,US,M


In [8]:
# Rename the unnamed column ("Unnamed: 0") to "id"; the JSON export below
# uses "id" as the primary key for each record.
# Reassign instead of using inplace=True so the frame is not mutated in place.
fullTime = fullTime.rename(columns={"Unnamed: 0": "id"})
fullTime

Unnamed: 0,id,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L
...,...,...,...,...,...,...,...,...,...,...,...,...
602,602,2022,SE,FT,Data Engineer,154000,USD,154000,US,100,US,M
603,603,2022,SE,FT,Data Engineer,126000,USD,126000,US,100,US,M
604,604,2022,SE,FT,Data Analyst,129000,USD,129000,US,0,US,M
605,605,2022,SE,FT,Data Analyst,150000,USD,150000,US,100,US,M


In [9]:
# Export the full-time rows to CSV, leaving out the DataFrame index.
fullTime.to_csv(path_or_buf="FullTimeEmployeeData.csv", index=False, encoding="utf-8")

In [11]:
# Convert Full time employee CSV to JSON File

def jsonConvert(csvfilepath, jsonfilepath, key_field='id'):
    """Convert a CSV file into a JSON object keyed by one of its columns.

    Each CSV row becomes a dict mapping column name to cell text and is
    stored under that row's ``key_field`` value. ``csv.DictReader`` performs
    no type conversion, so every value is written to the JSON file as a
    string. If several rows share the same key, the last one wins.

    Parameters
    ----------
    csvfilepath : str
        Path of the UTF-8 encoded CSV file to read. Its first line is used
        as the header.
    jsonfilepath : str
        Path of the JSON file to write (overwritten if it already exists).
    key_field : str, optional
        Name of the column whose value identifies each row. Defaults to
        ``'id'``.

    Raises
    ------
    KeyError
        If a data row has no ``key_field`` column.
    """
    # newline='' hands line-ending handling to the csv module, so line breaks
    # inside quoted fields are read back unchanged instead of being
    # translated by the text layer.
    with open(csvfilepath, newline='', encoding='utf-8') as csvfile:
        ftData = {row[key_field]: row for row in csv.DictReader(csvfile)}

    # Stream the JSON straight to the file rather than building one big string.
    with open(jsonfilepath, 'w', encoding='utf-8') as jsonfile:
        json.dump(ftData, jsonfile, indent=4)

# Source CSV (the full-time export) and the JSON file to generate from it.
csvfilepath = 'FullTimeEmployeeData.csv'
jsonfilepath = 'FullTimeEmployeeData.json'

jsonConvert(csvfilepath=csvfilepath, jsonfilepath=jsonfilepath)
