In [1]:
import pandas as pd

# Load the OECD gender wage gap data and shorten two column headers.
# rename() returns a new frame, so chaining it avoids mutating in place;
# errors="raise" makes a missing or changed source header fail here with a
# KeyError instead of being silently skipped.
wage_gap = pd.read_csv('data/gender-wage-gap-oecd.csv').rename(
    columns={'Gender wage gap (OECD 2017)': 'Gap', 'Entity': 'Location'},
    errors="raise",
)

# Display first five rows
wage_gap.head()



Unnamed: 0,Location,Code,Year,Gap
0,Australia,AUS,1975,21.6
1,Australia,AUS,1976,20.8
2,Australia,AUS,1977,18.4
3,Australia,AUS,1978,19.8
4,Australia,AUS,1979,20.0


In [2]:
# Row count of the frame as loaded, before any de-duplication
row_count = wage_gap.shape[0]

# Build a de-duplicated copy (wage_gap itself is left untouched) and count it
u_wage_gap = wage_gap.drop_duplicates()
u_row_count = u_wage_gap.shape[0]

# If both numbers match, no duplicate rows were present
print(f"Number of rows: {row_count}\nNumber of unique rows: {u_row_count}")

Number of rows: 636
Number of unique rows: 636


In [3]:
# Let pandas pick pd.NA-aware dtypes for every column; the result is a new
# frame, so u_wage_gap keeps its original dtypes for the comparison below.
ud_wage_gap = u_wage_gap.convert_dtypes()

# Show the dtypes before and after the conversion, separated by a blank line
print(
    f"Old Data Types: \n{u_wage_gap.dtypes}\n",
    f"New Data Types: \n{ud_wage_gap.dtypes}",
    sep="\n",
)

Old Data Types: 
Location     object
Code         object
Year          int64
Gap         float64
dtype: object

New Data Types: 
Location    string[python]
Code        string[python]
Year                 Int64
Gap                Float64
dtype: object


In [4]:
# Boolean mask of missing cells, computed once and reused below
null_mask = ud_wage_gap.isna()

# Per-column count of missing values
print(null_mask.sum())

# Rows containing at least one missing value, written out for manual review
null_rows = ud_wage_gap[null_mask.any(axis=1)]
null_rows.to_csv("data/nullrows.csv")




Location    0
Code        0
Year        0
Gap         0
dtype: int64


In [5]:
# Save the cleaned wage-gap frame to CSV.
# NOTE(review): this cell previously wrote to
# "data_exports/participation_clean.csv". That name presumably belongs to the
# participation dataset (the database has a separate 'participation' table),
# so writing wage-gap rows there would clobber it — confirm nothing reads the
# wage-gap data from the old path.
# index=False keeps the positional index out of the file, matching the
# to_sql(..., index=False) call used when loading the same frame into SQLite.
ud_wage_gap.to_csv("data_exports/wage_gap_clean.csv", index=False)

print("File has been saved successfully.")

File has been saved successfully.


In [6]:
import sqlite3
import csv

# Load the cleaned frame into SQLite and list the tables in the database.
# A sqlite3 connection used as a context manager only commits or rolls back
# the transaction; it does not close the connection. The connection is
# therefore closed explicitly in `finally` so the database file handle is
# released even if the write or the query fails.
# NOTE(review): `con` is closed when this cell finishes; any later cell that
# needs the database should open its own connection.
con = sqlite3.connect("women_in_the_workforce.db", timeout=30)
try:
    with con:
        # (re)creates the 'wagegap' table; 'replace' keeps the cell re-runnable
        ud_wage_gap.to_sql('wagegap', con, if_exists='replace', index=False)

    # Confirm the table exists by listing every table in the database
    cur = con.cursor()
    cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
    print(cur.fetchall())
finally:
    con.close()

[('continents',), ('participation',), ('wagegap',)]
