In [5]:
import pandas as pd

# Load the OWID continent lookup and give its columns the short names used
# in the rest of the notebook. rename() returns a new frame, so the load and
# the rename are expressed as a single chain rather than an in-place edit.
continents = pd.read_csv(
    'data/continents-according-to-our-world-in-data.csv'
).rename(
    columns={
        'World regions according to OWID': 'Continent',
        'Entity': 'Location',
    }
)

# Preview the first five rows
continents.head(5)

Unnamed: 0,Location,Code,Year,Continent
0,Afghanistan,AFG,2023,Asia
1,Aland Islands,ALA,2023,Europe
2,Albania,ALB,2023,Europe
3,Algeria,DZA,2023,Africa
4,American Samoa,ASM,2023,Oceania


In [6]:
# Row count of the frame as loaded
row_count = len(continents)

# Drop exact duplicate rows and count what is left
u_continents = continents.drop_duplicates()
u_row_count = len(u_continents)

# If the two counts match, the source contained no duplicate rows.
print(f"Number of rows: {row_count}\nNumber of unique rows: {u_row_count}")

Number of rows: 271
Number of unique rows: 271


In [7]:
# Let pandas pick the best nullable dtype for every column
ud_continents = u_continents.convert_dtypes()

# Show the dtypes before and after the conversion for comparison
old_dtypes = u_continents.dtypes
new_dtypes = ud_continents.dtypes
print(f"Old Data Types: \n{old_dtypes}\n")
print(f"New Data Types: \n{new_dtypes}")

Old Data Types: 
Location     object
Code         object
Year          int64
Continent    object
dtype: object

New Data Types: 
Location     string[python]
Code         string[python]
Year                  Int64
Continent    string[python]
dtype: object


In [8]:
# Flag every row that has at least one missing value
has_null = ud_continents.isna().any(axis=1)
null_rows_continents = ud_continents.loc[has_null]

# Report how many values are missing in each column
null_counts = ud_continents.isna().sum()
print(null_counts)

# Write the incomplete rows to disk so they can be reviewed by hand
null_rows_continents.to_csv("data/nullrowscontinents.csv")




Location      0
Code         14
Year          0
Continent     0
dtype: int64


In [9]:
# Keep only the rows whose Code value is missing
code_missing = ud_continents["Code"].isna()
null_rows = ud_continents.loc[code_missing]

# Preview the first five of them
null_rows.head(5)

Unnamed: 0,Location,Code,Year,Continent
61,Democratic Republic of Vietnam,,2023,Asia
66,Duchy of Modena and Reggio,,2023,Europe
67,Duchy of Parma and Piacenza,,2023,Europe
81,Federal Republic of Central America,,2023,North America
93,Grand Duchy of Baden,,2023,Europe


In [10]:
# Drop every row that contains a null value.
# DataFrame.dropna() returns a new frame and leaves the original untouched, so
# the result must be bound back to the name; otherwise the rows with a missing
# Code are still present when ud_continents is exported later on.
ud_continents = ud_continents.dropna()

# Display the frame with the incomplete rows removed
ud_continents

Unnamed: 0,Location,Code,Year,Continent
0,Afghanistan,AFG,2023,Asia
1,Aland Islands,ALA,2023,Europe
2,Albania,ALB,2023,Europe
3,Algeria,DZA,2023,Africa
4,American Samoa,ASM,2023,Oceania
...,...,...,...,...
266,Yemen Arab Republic,OWID_YAR,2023,Asia
267,Yemen People's Republic,OWID_YPR,2023,Asia
268,Yugoslavia,OWID_YGS,2023,Europe
269,Zambia,ZMB,2023,Africa


In [11]:
# Save the frame to CSV.
# index=False keeps the positional row index out of the file; with the default
# (index=True) it is written as an unnamed first column that shows up as
# "Unnamed: 0" when the file is read back with read_csv.
ud_continents.to_csv("data/continents_clean.csv", index=False)


In [12]:
import sqlite3
import csv

# Open (or create) the project database
con = sqlite3.connect("women_in_the_workforce.db", timeout=30)

# Used as a context manager, a sqlite3 connection only scopes the transaction
# (commit on success, rollback on error); it does not close the connection,
# which is why `con` is still usable after the block.
with con:
    # Write ud_continents to the `continents` table, replacing any existing one
    ud_continents.to_sql(name='continents', con=con, if_exists='replace', index=False)

# List every table currently defined in the database as a sanity check
cur = con.cursor()
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cur.fetchall())

[('participation',), ('continents',)]
