In [2]:
import pandas as pd
import numpy as np

In [11]:
# Toy dataset that deliberately mixes real missing markers (np.nan, None)
# with the placeholder strings 'NA' and 'Missing', which are plain text.
Arendelle = dict(
    first=['Elsa', 'Anna', 'Kristoff', 'Olaf', np.nan, 'NA', None],
    last=['Queen', 'Princess', 'Friend', 'Snowman', np.nan, 'Missing', np.nan],
    email=[
        'elsa@arendelle.com',
        'anna@arendelle.com',
        'kristoff@arendelle.com',
        None,
        None,
        np.nan,
        'NA',
    ],
    # Ages are stored as strings on purpose; they are converted to float later.
    age=['28', '24', 'Missing', '5', None, None, 'Missing'],
)

In [12]:
# Build the demo frame: each dict key becomes a column, each list its 7 row values.
df = pd.DataFrame(Arendelle)
df

Unnamed: 0,first,last,email,age
0,Elsa,Queen,elsa@arendelle.com,28
1,Anna,Princess,anna@arendelle.com,24
2,Kristoff,Friend,kristoff@arendelle.com,Missing
3,Olaf,Snowman,,5
4,,,,
5,,Missing,,
6,,,,Missing


In [14]:
# Drop every row that contains at least one missing value (NaN or None).
# The strings 'NA' and 'Missing' are not treated as missing, which is why
# row 2 (age 'Missing') is kept.
df.dropna()

Unnamed: 0,first,last,email,age
0,Elsa,Queen,elsa@arendelle.com,28
1,Anna,Princess,anna@arendelle.com,24
2,Kristoff,Friend,kristoff@arendelle.com,Missing


In [17]:
# axis tells pandas whether to drop rows ('index') or columns ('columns').
# how='any' drops a row as soon as any one of its values is missing (NaN).
# Alternatively, how='all' drops a row only if all of its values are missing.
# Spelled out like this, the call gives the same result as the bare df.dropna().
df.dropna(axis='index',how='any')

Unnamed: 0,first,last,email,age
0,Elsa,Queen,elsa@arendelle.com,28
1,Anna,Princess,anna@arendelle.com,24
2,Kristoff,Friend,kristoff@arendelle.com,Missing


In [19]:
# Column-wise version: drop any column that contains at least one missing value.
# Every column here has one, so only the index is left.
df.dropna(axis='columns',how='any')

0
1
2
3
4
5
6


In [20]:
# Drop only columns that are missing in every row; no column qualifies, so the frame is unchanged.
df.dropna(axis='columns',how='all')

Unnamed: 0,first,last,email,age
0,Elsa,Queen,elsa@arendelle.com,28
1,Anna,Princess,anna@arendelle.com,24
2,Kristoff,Friend,kristoff@arendelle.com,Missing
3,Olaf,Snowman,,5
4,,,,
5,,Missing,,
6,,,,Missing


In [18]:
# Drop only rows in which every value is missing: just row 4.
# Rows 5 and 6 survive because 'NA' and 'Missing' are ordinary strings to pandas.
df.dropna(axis='index',how='all')

Unnamed: 0,first,last,email,age
0,Elsa,Queen,elsa@arendelle.com,28
1,Anna,Princess,anna@arendelle.com,24
2,Kristoff,Friend,kristoff@arendelle.com,Missing
3,Olaf,Snowman,,5
5,,Missing,,
6,,,,Missing


In [21]:
# Remove a row only if both 'last' and 'email' are missing (NaN/None); the other columns are ignored.
# Row 5 ('Missing' in last) and row 6 ('NA' in email) are kept because those strings count as values.
df.dropna(axis='index',how='all', subset=['last','email'])

Unnamed: 0,first,last,email,age
0,Elsa,Queen,elsa@arendelle.com,28
1,Anna,Princess,anna@arendelle.com,24
2,Kristoff,Friend,kristoff@arendelle.com,Missing
3,Olaf,Snowman,,5
5,,Missing,,
6,,,,Missing


In [26]:
# Turn the placeholder strings into real missing values in a single pass so
# that dropna()/isna()/fillna() recognise them. The result is assigned back
# instead of using inplace=True, which keeps the cell chainable and avoids
# mutating the frame that earlier cells displayed.
df = df.replace(['NA', 'Missing'], np.nan)


In [27]:
# Re-display df to check that the placeholder strings now show up as missing.
df

Unnamed: 0,first,last,email,age
0,Elsa,Queen,elsa@arendelle.com,28.0
1,Anna,Princess,anna@arendelle.com,24.0
2,Kristoff,Friend,kristoff@arendelle.com,
3,Olaf,Snowman,,5.0
4,,,,
5,,,,
6,,,,


In [28]:
# Same subset filter as before the replacement; rows 5 and 6 are now dropped as well,
# because their placeholder strings have become NaN.
df.dropna(axis='index',how='all', subset=['last','email'])

Unnamed: 0,first,last,email,age
0,Elsa,Queen,elsa@arendelle.com,28.0
1,Anna,Princess,anna@arendelle.com,24.0
2,Kristoff,Friend,kristoff@arendelle.com,
3,Olaf,Snowman,,5.0


In [29]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,True
3,False,False,True,False
4,True,True,True,True
5,True,True,True,True
6,True,True,True,True


In [30]:
# Return a copy with every missing value replaced by the string 'Missing'; df itself is not modified.
df.fillna('Missing')

Unnamed: 0,first,last,email,age
0,Elsa,Queen,elsa@arendelle.com,28
1,Anna,Princess,anna@arendelle.com,24
2,Kristoff,Friend,kristoff@arendelle.com,Missing
3,Olaf,Snowman,Missing,5
4,Missing,Missing,Missing,Missing
5,Missing,Missing,Missing,Missing
6,Missing,Missing,Missing,Missing


In [31]:
# Same idea with a numeric fill value; note that 0 is also written into the text columns.
df.fillna(0)

Unnamed: 0,first,last,email,age
0,Elsa,Queen,elsa@arendelle.com,28
1,Anna,Princess,anna@arendelle.com,24
2,Kristoff,Friend,kristoff@arendelle.com,0
3,Olaf,Snowman,0,5
4,0,0,0,0
5,0,0,0,0
6,0,0,0,0


In [32]:
# Inspect column types: 'age' is still object because its non-missing values are strings.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [34]:
# Convert the age strings to numbers. float is used rather than int because
# the column contains NaN, which is itself a float value.
df['age'] = df['age'].astype(float)

In [35]:
# Confirm the conversion: 'age' should now be float64.
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [37]:
# Average of the 'age' column. NaN entries are skipped, so this is (28 + 24 + 5) / 3.
df['age'].mean()

np.float64(19.0)

In [41]:
# Strings that should be parsed as missing values while reading the survey CSV,
# so no separate replace() pass is needed afterwards.
na_vals = ['NA', 'Missing']

df2 = pd.read_csv(
    'data/survey_results_public.csv',
    na_values=na_vals,
)
schema_df = pd.read_csv('data/survey_results_schema.csv')

# Raise the display limits so wide frames are not truncated when shown.
pd.set_option('display.max_columns', 90)
pd.set_option('display.max_rows', 50)

In [43]:
# Peek at the first ten YearsCode answers; the object dtype shows the column is not numeric yet.
df2['YearsCode'].head(10)

0    NaN
1     20
2     37
3      4
4      9
5     10
6      7
7      1
8     20
9     15
Name: YearsCode, dtype: object

In [47]:
# List the distinct answers to find the entries that are not plain numbers.
df2['YearsCode'].unique()

array([nan, '20', '37', '4', '9', '10', '7', '1', '15', '30', '31', '6',
       '12', '22', '5', '36', '25', '44', '24', '18', '3', '8',
       'More than 50 years', '11', '29', '40', '39', '2', '42', '34',
       '19', '35', '16', '33', '13', '23', '14', '28', '17', '21', '43',
       '46', '26', '32', '41', '45', '27', '38', '50', '48', '47',
       'Less than 1 year', '49'], dtype=object)

In [48]:
# Map the text bucket 'Less than 1 year' to the number 0 so the column can be cast to float.
# The result is assigned back to the column: replace(..., inplace=True) on
# df2['YearsCode'] is chained assignment, which acts on a temporary Series
# under pandas Copy-on-Write and would leave df2 unchanged.
df2['YearsCode'] = df2['YearsCode'].replace('Less than 1 year', 0)

In [49]:
# Map the text bucket 'More than 50 years' to 51 so the column can be cast to float.
# 51 is only a stand-in for an open-ended answer, so it slightly understates those respondents.
# The result is assigned back to the column: replace(..., inplace=True) on
# df2['YearsCode'] is chained assignment, which acts on a temporary Series
# under pandas Copy-on-Write and would leave df2 unchanged.
df2['YearsCode'] = df2['YearsCode'].replace('More than 50 years', 51)

In [52]:
# Cast to float so numeric statistics can be computed. This assumes the two text
# buckets were replaced beforehand; any remaining non-numeric string would make the cast fail.
df2['YearsCode']=df2['YearsCode'].astype(float)

In [53]:
# Mean years of coding; NaN responses are ignored by default.
df2['YearsCode'].mean()

np.float64(14.197497870350265)

In [54]:
# Median for comparison; it is less sensitive than the mean to a few very large values.
df2['YearsCode'].median()

np.float64(11.0)