# Data Importing and Saving

In [1]:
import csv

import numpy as np
import pandas as pd

In [2]:
from pathlib import Path
# Home directory of the current user, stored as a plain string;
# it is used as the prefix of the data file paths built below
home = str(Path.home())

## JSON Data

In [3]:
# Hand-written, JSON-like records: a list holding one dict per laureate.
# Every value is a string, including the year.
nobel_winners = [
    {
        "category": "Physics",
        "name": "Albert Einstein",
        "nationality": "Swiss",
        "sex": "male",
        "year": "1921",
    },
    {
        "category": "Physics",
        "name": "Paul Dirac",
        "nationality": "British",
        "sex": "male",
        "year": "1933",
    },
    {
        "category": "Chemistry",
        "name": "Marie Curie",
        "nationality": "Polish",
        "sex": "female",
        "year": "1911",
    },
]


### Converting JSON Data to CSV

In [4]:
# Derive the CSV header from the first record (assumes every record has the same keys)
cols = nobel_winners[0].keys()
columns = sorted(cols)

winners_path = f'{home}/data/nobel_winners.csv'

# Create the target directory if it is missing, so the cell also works on a
# machine where the data folder does not exist yet
Path(winners_path).parent.mkdir(parents=True, exist_ok=True)

# Write the header and one row per record with the csv module: unlike a manual
# ','.join, it quotes any value that itself contains a comma, quote or newline.
# newline='' turns off Python's own line-ending translation and
# lineterminator='\n' keeps exactly one '\n' per row.
# utf-8 is the encoding pd.read_csv assumes by default.
with open(winners_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(columns)
    for o in nobel_winners:
        row = [str(o[col]) for col in columns]
        writer.writerow(row)


In [5]:
# Read the file back to verify what was written; the with-block closes the
# file for us once the loop has finished.
# Each line still ends with its own newline and print() adds another one,
# which is why the output shows a blank line after every row.
with open(winners_path) as f:
    for line in f:
        print(line)

category,name,nationality,sex,year

Physics,Albert Einstein,Swiss,male,1921

Physics,Paul Dirac,British,male,1933

Chemistry,Marie Curie,Polish,female,1911



### Using pandas to open a CSV file and specify the delimiter

In [6]:
# Let's open the file we wrote again, but this time using Pandas
# (no separator is passed, so read_csv falls back to its default, the comma)
content = pd.read_csv(winners_path)

In [7]:
# Check what kind of object read_csv handed back
type(content)

pandas.core.frame.DataFrame

In [8]:
# Display the DataFrame that read_csv produced from the comma-separated file
content

Unnamed: 0,category,name,nationality,sex,year
0,Physics,Albert Einstein,Swiss,male,1921
1,Physics,Paul Dirac,British,male,1933
2,Chemistry,Marie Curie,Polish,female,1911


In [9]:
# We're going to write a new version of the file, but with a semicolon delimiter this time
# Path of that second file; it sits in the same data folder as the first one
winners_path1 = f'{home}/data/nobel_winners1.csv'

In [10]:
# Write the same header and rows as before, but separated by semicolons.
# csv.writer quotes any value that contains the delimiter, a quote or a newline,
# which a manual ';'.join would write out unescaped.
# newline='' plus lineterminator='\n' gives exactly one '\n' per row, and
# utf-8 is the encoding pd.read_csv assumes by default.
with open(winners_path1, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=';', lineterminator='\n')
    writer.writerow(columns)
    for o in nobel_winners:
        row = [str(o[col]) for col in columns]
        writer.writerow(row)

In [11]:
# Read the semicolon-delimited file without telling pandas about the delimiter,
# to see what read_csv does when it falls back to its default (the comma)
content_no_delimiter_specified = pd.read_csv(winners_path1)

In [12]:
# Displaying this DataFrame shows that the parsing went wrong: the lines contain
# no commas, so each whole line ended up in one single column
content_no_delimiter_specified

Unnamed: 0,category;name;nationality;sex;year
0,Physics;Albert Einstein;Swiss;male;1921
1,Physics;Paul Dirac;British;male;1933
2,Chemistry;Marie Curie;Polish;female;1911


In [13]:
# Parse the same file once more, this time telling read_csv that the fields
# are separated by semicolons, and keep the result in its own DataFrame
content1 = pd.read_csv(winners_path1, sep=';')

In [14]:
# With the separator specified, every field gets its own column again
content1

Unnamed: 0,category,name,nationality,sex,year
0,Physics,Albert Einstein,Swiss,male,1921
1,Physics,Paul Dirac,British,male,1933
2,Chemistry,Marie Curie,Polish,female,1911


In [19]:
# Especially useful with big data sets, we can display the first few rows of data
# Optionally, we can specify how many rows to show by passing an integer (2 here);
# called without an argument, head() shows the first 5 rows
content.head(2)

Unnamed: 0,category,name,nationality,sex,year
0,Physics,Albert Einstein,Swiss,male,1921
1,Physics,Paul Dirac,British,male,1933


In [22]:
# Mapping of existing column name -> new column name
updatedColumns = dict(category="field")

In [23]:
# Here we will rename a column
# rename() only changes the columns listed in the mapping; all other columns keep their names
# This is reminiscent of React's method of merging state changes, rather than overwriting all state
# Similar behavior is also found in Git's patching
# The result is assigned back to `content`, so the cells below see the column as 'field'
content = content.rename(columns=updatedColumns)

In [24]:
# The former 'category' column is now shown as 'field'
content

Unnamed: 0,field,name,nationality,sex,year
0,Physics,Albert Einstein,Swiss,male,1921
1,Physics,Paul Dirac,British,male,1933
2,Chemistry,Marie Curie,Polish,female,1911


In [25]:
# Filtering with a boolean mask plays the role of a WHERE clause in SQL:
# keep only the rows whose field is "Physics"
physicists = content.loc[content["field"] == "Physics"]

# Show the rows that matched
physicists

Unnamed: 0,field,name,nationality,sex,year
0,Physics,Albert Einstein,Swiss,male,1921
1,Physics,Paul Dirac,British,male,1933


In [26]:
# A single column can be pulled out on its own; the result is a pandas Series
sexes_of_winners = content["sex"]
sexes_of_winners

0      male
1      male
2    female
Name: sex, dtype: object

In [29]:
# .loc takes a row filter and a column selection in a single call: .loc[rows, columns]
# See this website for more information:
#   https://www.ritchieng.com/pandas-selecting-multiple-rows-and-columns/
# Example of the pattern (on a different dataset): ufo.loc[ufo.City=='Oakland', 'State']
# Here: rows whose year is 1920 or later (1920 itself included), 'sex' column only.
# Passing the column inside a list (['sex']) gives back a DataFrame instead of a Series.
sex_of_winners_after_1920 = content.loc[content["year"] >= 1920, ["sex"]]
sex_of_winners_after_1920

Unnamed: 0,sex
0,male
1,male


In [30]:
# Filter the rows and choose several columns to return in the same .loc call
nationality_and_year = content.loc[content["year"] >= 1920, ["nationality", "year"]]
nationality_and_year

Unnamed: 0,nationality,year
0,Swiss,1921
1,British,1933


In [31]:
# Several conditions can be combined with & (each one wrapped in parentheses),
# together with a list of columns to return.
# The result is empty for this data: the only non-male winner received the prize in 1911.
nationality_and_year_female = content.loc[
    (content["year"] >= 1920) & (content["sex"] != "male"),
    ["nationality", "year"],
]
nationality_and_year_female

Unnamed: 0,nationality,year
