# Manipulating data

## Recap

In [1]:
import pandas as pd

# Load the artists table from the CSV file into a dataframe.
df = pd.read_csv("artists.csv")

# Leaving `df` as the last expression of the cell makes the notebook display it.
df

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year
0,1,Robert Arneson,American,Male,1930.0,1992.0
1,2,Doroteo Arnaiz,Spanish,Male,1936.0,
2,3,Bill Arnold,American,Male,1941.0,
3,4,Charles Arnoldi,American,Male,1946.0,
4,5,Per Arnoldi,Danish,Male,1941.0,
...,...,...,...,...,...,...
15086,67452,Liu Jianhua,Chinese,Male,1962.0,
15087,67453,Leng Lin,Chinese,Male,1965.0,
15088,67652,Ellie Nagler,,,,
15089,67694,Glenn Williams,,Male,,


## Creating a dataframe with fewer columns

In [2]:
# Indexing with a list of column names creates a new dataframe containing
# only those columns; here the result is displayed but not stored anywhere.
df[["Name", "Nationality"]]

Unnamed: 0,Name,Nationality
0,Robert Arneson,American
1,Doroteo Arnaiz,Spanish
2,Bill Arnold,American
3,Charles Arnoldi,American
4,Per Arnoldi,Danish
...,...,...
15086,Liu Jianhua,Chinese
15087,Leng Lin,Chinese
15088,Ellie Nagler,
15089,Glenn Williams,


In [3]:
# Display df again: it still has all of its original columns.
df

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year
0,1,Robert Arneson,American,Male,1930.0,1992.0
1,2,Doroteo Arnaiz,Spanish,Male,1936.0,
2,3,Bill Arnold,American,Male,1941.0,
3,4,Charles Arnoldi,American,Male,1946.0,
4,5,Per Arnoldi,Danish,Male,1941.0,
...,...,...,...,...,...,...
15086,67452,Liu Jianhua,Chinese,Male,1962.0,
15087,67453,Leng Lin,Chinese,Male,1965.0,
15088,67652,Ellie Nagler,,,,
15089,67694,Glenn Williams,,Male,,


In [4]:
# If I want the dataframe with fewer columns to be used again, I should assign it to a variable.
# Remember: we are often creating new dataframes rather than modifying the ones previously created

nn_df = df[["Name", "Nationality"]]
nn_df

Unnamed: 0,Name,Nationality
0,Robert Arneson,American
1,Doroteo Arnaiz,Spanish
2,Bill Arnold,American
3,Charles Arnoldi,American
4,Per Arnoldi,Danish
...,...,...
15086,Liu Jianhua,Chinese
15087,Leng Lin,Chinese
15088,Ellie Nagler,
15089,Glenn Williams,


## Querying columns of a dataframe

In [6]:
# Attribute-style access; this selects the same column as df["Nationality"]
df.Nationality

0        American
1         Spanish
2        American
3        American
4          Danish
           ...   
15086     Chinese
15087     Chinese
15088         NaN
15089         NaN
15090         NaN
Name: Nationality, Length: 15091, dtype: object

In [7]:
df["Nationality"].unique() # Which distinct values appear in the column? (the missing value nan is listed too)

array(['American', 'Spanish', 'Danish', 'Italian', 'French', 'Estonian',
       'Mexican', 'Swedish', nan, 'Israeli', 'British', 'Finnish',
       'Polish', 'Japanese', 'Guatemalan', 'Colombian', 'Romanian',
       'Russian', 'German', 'Argentine', 'Kuwaiti', 'Various', 'Belgian',
       'Dutch', 'Norwegian', 'Nationality unknown', 'Chilean', 'Swiss',
       'Costa Rican', 'Czech', 'Brazilian', 'Austrian', 'Canadian',
       'Australian', 'Ukrainian', 'Hungarian', 'Haitian', 'Congolese',
       'Bolivian', 'Cuban', 'Yugoslav', 'Portuguese', 'Indian',
       'Icelandic', 'Irish', 'Guyanese', 'Uruguayan', 'Slovak',
       'Croatian', 'Greek', 'Peruvian', 'Chinese', 'Venezuelan',
       'Turkish', 'Panamanian', 'Algerian', 'Ecuadorian', 'South African',
       'Iranian', 'Korean', 'Canadian Inuit', 'Paraguayan',
       'Luxembourgish', 'Nicaraguan', 'Zimbabwean', 'Moroccan',
       'Tanzanian', 'Bulgarian', 'Tunisian', 'Sudanese', 'Taiwanese',
       'Ethiopian', 'Slovenian', 'Scottish', 

In [8]:
# Number of distinct values in the column. unique() also returns nan, so the
# missing value is counted as one of them (describe() reports one fewer).
len(df["Nationality"].unique())

126

In [9]:
df["Nationality"].describe() # Categorical (text) variable: describe() reports count, unique, top and freq

count        12603
unique         125
top       American
freq          5198
Name: Nationality, dtype: object

In [10]:
df["Death Year"].describe() # Numerical variable: describe() reports count, mean, std, min, quartiles and max

count    4579.000000
mean     1974.287399
std        31.153665
min      1795.000000
25%      1958.000000
50%      1980.000000
75%      1998.000000
max      2017.000000
Name: Death Year, dtype: float64

In [11]:
# Largest (most recent) value in the Death Year column
df["Death Year"].max()

2017.0

In [12]:
df.describe() # When the dataframe has numerical columns, only those columns are summarised

Unnamed: 0,Artist ID,Birth Year,Death Year
count,15091.0,11237.0,4579.0
mean,18297.556027,1930.852719,1974.287399
std,16632.963898,34.531997,31.153665
min,1.0,1730.0,1795.0
25%,4195.5,1910.0,1958.0
50%,8593.0,1936.0,1980.0
75%,33088.5,1956.0,1998.0
max,67695.0,2012.0,2017.0


In [13]:
# Boolean Series: True where the birth year is missing (NaN)
df["Birth Year"].isna()

0        False
1        False
2        False
3        False
4        False
         ...  
15086    False
15087    False
15088     True
15089     True
15090     True
Name: Birth Year, Length: 15091, dtype: bool

In [14]:
# Summing the booleans counts the True values, i.e. the number of missing birth years
df["Birth Year"].isna().sum()

3854

In [15]:
# The same missing-value check applied to every cell of the dataframe
df.isna()

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year
0,False,False,False,False,False,False
1,False,False,False,False,False,True
2,False,False,False,False,False,True
3,False,False,False,False,False,True
4,False,False,False,False,False,True
...,...,...,...,...,...,...
15086,False,False,False,False,False,True
15087,False,False,False,False,False,True
15088,False,False,True,True,True,True
15089,False,False,True,False,True,True


In [16]:
# Number of missing values in each column
df.isna().sum()

Artist ID          0
Name               0
Nationality     2488
Gender          3072
Birth Year      3854
Death Year     10512
dtype: int64

In [17]:
# The largest of the per-column missing-value counts
df.isna().sum().max()

10512

In [18]:
# Count the entries whose birth year is greater than 2001

df["Birth Year"].gt(2001).sum()

60

## Filtering for recent birth years with gt()

In [19]:
# Keep only the rows where the birth year is greater than 2001
df.loc[df["Birth Year"].gt(2001)]

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year
9562,26398,United Architects,American,,2002.0,
9859,27783,Vlieger & Vandam,Dutch,,2004.0,
9896,27963,"RedStart Design, LLC",American,,2004.0,
9941,28148,"molo design, ltd., vancouver, canada",,,2004.0,
9958,28211,"Plus Minus Zero Co., Ltd.",Japanese,,2003.0,
9987,28352,Neo Human Toys,Dutch,,2003.0,
9995,28387,Architecture and Vision,German,,2003.0,
10378,28944,"SQUINT/OPERA, London, England",,,2002.0,
10968,31430,Demakersvan,,,2004.0,
10985,31614,Established & Sons,British,,2005.0,


## Calculating the z-Score of a column with mean() and std()

In [20]:
# Age at death; the result is NaN whenever either year is missing.
df["Death Age"] = df["Death Year"] - df["Birth Year"]
# z-score: distance of each death age from the mean, measured in standard deviations.
df["Death Age z-Score"] = (df["Death Age"] - df["Death Age"].mean())/df["Death Age"].std()
# Both new columns are added to df itself, so they appear from here on.
df

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year,Death Age,Death Age z-Score
0,1,Robert Arneson,American,Male,1930.0,1992.0,62.0,-0.64739
1,2,Doroteo Arnaiz,Spanish,Male,1936.0,,,
2,3,Bill Arnold,American,Male,1941.0,,,
3,4,Charles Arnoldi,American,Male,1946.0,,,
4,5,Per Arnoldi,Danish,Male,1941.0,,,
...,...,...,...,...,...,...,...,...
15086,67452,Liu Jianhua,Chinese,Male,1962.0,,,
15087,67453,Leng Lin,Chinese,Male,1965.0,,,
15088,67652,Ellie Nagler,,,,,,
15089,67694,Glenn Williams,,Male,,,,


## Creating filtered dataframes based on more complex queries

In [21]:
# Boolean Series: True for the rows whose Gender is "Female"
df["Gender"] == "Female"

0        False
1        False
2        False
3        False
4        False
         ...  
15086    False
15087    False
15088    False
15089    False
15090    False
Name: Gender, Length: 15091, dtype: bool

In [22]:
# Indexing with the boolean Series keeps only the rows where it is True
df[df["Gender"] == "Female"]

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year,Death Age,Death Age z-Score
8,10,Irene Aronson,American,Female,1918.0,,,
16,21,Ruth Asawa,American,Female,1926.0,2013.0,87.0,0.920138
17,22,Isidora Aschheim,Israeli,Female,,,,
23,28,Geneviève Asse,French,Female,1923.0,,,
25,31,Dana Atchley,American,Female,1941.0,2000.0,59.0,-0.835493
...,...,...,...,...,...,...,...,...
15013,50154,Ann Magnuson,American,Female,1956.0,,,
15022,67012,Ka Markelius,,Female,,,,
15034,67122,Giorgia Lupi,Italian,Female,1981.0,,,
15044,67272,Toyin Ojih Odutola,American,Female,1985.0,,,


In [23]:
# Boolean Series: True for the rows whose Nationality is "American"
df["Nationality"] == "American"

0         True
1        False
2         True
3         True
4        False
         ...  
15086    False
15087    False
15088    False
15089    False
15090    False
Name: Nationality, Length: 15091, dtype: bool

In [24]:
# Build one boolean mask per condition, then combine them with & (logical AND)
# so that only artists who are both Female and American are kept.
is_female = df["Gender"] == "Female"
is_american = df["Nationality"] == "American"
df[is_female & is_american]

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year,Death Age,Death Age z-Score
8,10,Irene Aronson,American,Female,1918.0,,,
16,21,Ruth Asawa,American,Female,1926.0,2013.0,87.0,0.920138
25,31,Dana Atchley,American,Female,1941.0,2000.0,59.0,-0.835493
35,41,Berenice Abbott,American,Female,1898.0,1991.0,93.0,1.296345
53,61,Alice Adams,American,Female,1930.0,,,
...,...,...,...,...,...,...,...,...
14976,49723,Stephanie Rothman,American,Female,1936.0,,,
14977,49733,Sharon Johnston,American,Female,1965.0,,,
15013,50154,Ann Magnuson,American,Female,1956.0,,,
15044,67272,Toyin Ojih Odutola,American,Female,1985.0,,,


In [25]:
# The vertical bar | is the logical OR: an artist is kept when at least one
# of the two conditions holds ("Female", "American", or both).
is_female = df["Gender"] == "Female"
is_american = df["Nationality"] == "American"
df[is_female | is_american]

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year,Death Age,Death Age z-Score
0,1,Robert Arneson,American,Male,1930.0,1992.0,62.0,-0.647390
2,3,Bill Arnold,American,Male,1941.0,,,
3,4,Charles Arnoldi,American,Male,1946.0,,,
6,7,Bill Aron,American,Male,1941.0,,,
7,9,David Aronson,American,Male,1923.0,,,
...,...,...,...,...,...,...,...,...
15078,67349,Julius Klein,American,,1951.0,,,
15079,67350,Howard Guttenplan,American,,1934.0,2015.0,81.0,0.543931
15080,67352,Neelon Crawford,American,,1946.0,,,
15082,67379,N. Dash,American,Female,1980.0,,,
