In [3]:
import pandas as pd
import numpy as np

In [4]:
# Sample table with deliberately messy data: alongside real values it mixes
# true missing markers (np.nan, None) with placeholder strings
# ('NA', 'Missing', 'None'), and every age is stored as text.

people = dict(
    first=['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    last=['Schafer', 'Doe', 'Schafer', np.nan, np.nan, 'Missing', None],
    email=[
        'CoreyMSchafer@gmail.com',
        'JaneDoe@email.com',
        'JohnDoe@email.com',
        None,
        np.nan,
        'Anonymous@email.com',
        'None',
    ],
    age=['33', '55', '63', '36', None, None, 'Missing'],
)

In [5]:
# Build a DataFrame from the `people` dict: each key becomes a column and
# each list supplies that column's values, one row per list position.
df = pd.DataFrame(people)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Schafer,JohnDoe@email.com,63
3,Chris,,,36
4,,,,
5,,Missing,Anonymous@email.com,
6,,,,Missing


In [7]:
# Before converting the string ages to numbers, inspect each column's dtype.
# Columns that hold Python strings are reported by pandas as `object`.

df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [8]:
# The 'age' values are strings, so they must be converted to numbers before
# doing arithmetic on them.
# A column that contains NaN has to be converted to float rather than int,
# because NaN itself is a float, as the type check below shows.

type(np.nan)

float

In [13]:
# Clean the custom placeholders first, then cast 'age' to a numeric dtype.
# Order matters: while the placeholder strings are still in the column,
# astype(float) raises
#   ValueError: could not convert string to float: 'Missing'
# astype(int) is not an option either, because NaN is a float and cannot be
# stored in an int column. Filling NaN with 0 via fillna() would make the int
# cast work, but it is a bad idea because the zeros would distort statistics
# such as the mean.

# Start again from the raw data so this cell can be re-run safely.
df = pd.DataFrame(people)

# Turn the placeholder strings 'NA' and 'Missing' into real NaN values.
df = df.replace(['NA', 'Missing'], np.nan)

# Every remaining age is either a numeric string or missing, so the cast works.
df['age'] = df['age'].astype(float)

In [15]:
# Confirm the conversion: 'age' should now be reported as float64.
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [16]:
# With 'age' stored as floats, mean() works and skips the NaN entries.

df['age'].mean()

46.75

In [17]:
# If every column of a DataFrame holds numeric data, you can convert the whole
# frame in one call with df.astype(), e.g. df.astype(float).

In [18]:
# now let's work on the csv file

import pandas as pd

In [22]:
# Custom placeholder strings can be turned into NaN while the CSV is loaded:
# pass them to read_csv through the na_values argument, and every cell that
# matches one of them is read as NaN instead of being kept as text.

na_vals = ['NA','Missing']

# Keep the data folder in one place so it only has to be edited once when the
# notebook is run on another machine.
DATA_DIR = '/Users/jean110284/Desktop/Everything/MJUPython'

# Read in the files; na_values applies the placeholder list defined above.
df = pd.read_csv(f'{DATA_DIR}/survey_results_public.csv', na_values=na_vals)
schema_df = pd.read_csv(f'{DATA_DIR}/survey_results_schema.csv')

In [29]:
# Preview the first two rows of the survey data.
df.head(2)

Unnamed: 0,ResponseId,MainBranch,Employment,Country,US_State,UK_Country,EdLevel,Age1stCode,LearnCode,YearsCode,...,Age,Gender,Trans,Sexuality,Ethnicity,Accessibility,MentalHealth,SurveyLength,SurveyEase,ConvertedCompYearly
0,1,I am a developer by profession,"Independent contractor, freelancer, or self-em...",Slovakia,,,"Secondary school (e.g. American high school, G...",18 - 24 years,Coding Bootcamp;Other online resources (ex: vi...,,...,25-34 years old,Man,No,Straight / Heterosexual,White or of European descent,None of the above,None of the above,Appropriate in length,Easy,62268.0
1,2,I am a student who is learning to code,"Student, full-time",Netherlands,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,"Other online resources (ex: videos, blogs, etc...",7.0,...,18-24 years old,Man,No,Straight / Heterosexual,White or of European descent,None of the above,None of the above,Appropriate in length,Easy,


In [23]:
# Casting example: the goal is the average number of years of coding experience.
# Start by looking at the first 10 values of the 'YearsCode' column.

df['YearsCode'].head(10)

# The first 10 values look harmless, but the column's dtype is object, so the
# numbers are stored as strings and calling mean() on it raises an error
# along the lines of:
# TypeError: can only concatenate str (not "int") to str

0    NaN
1      7
2    NaN
3    NaN
4     17
5    NaN
6      3
7      4
8      6
9      7
Name: YearsCode, dtype: object

In [28]:
# The column needs to be converted to float, e.g.
# df['YearsCode'] = df['YearsCode'].astype(float)
# but that still fails, because some entries are text rather than numbers:
# ValueError: could not convert string to float: 'Less than 1 year'

In [30]:
# List every distinct value in the column with the unique() method
# to see exactly which entries are not numeric.

In [31]:
df['YearsCode'].unique()
# As expected, the numbers are stored as strings, and two entries are
# descriptive text: 'Less than 1 year' and 'More than 50 years'.

array([nan, '7', '17', '3', '4', '6', '16', '12', '15', '10', '40', '9',
       '26', '14', '39', '20', '8', '19', '5', 'Less than 1 year', '22',
       '2', '1', '34', '21', '13', '25', '24', '30', '31', '18', '38',
       'More than 50 years', '27', '41', '42', '35', '23', '28', '11',
       '37', '44', '43', '36', '33', '45', '29', '50', '46', '32', '47',
       '49', '48'], dtype=object)

In [33]:
# Map the two text answers to numeric stand-ins so the column can be cast:
# 'Less than 1 year' -> 0 and 'More than 50 years' -> 51.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['YearsCode']: that chained in-place call
# operates on an intermediate Series, and under pandas Copy-on-Write it
# leaves df itself unchanged.

df['YearsCode'] = df['YearsCode'].replace(
    {'Less than 1 year': 0, 'More than 50 years': 51}
)

In [34]:
# Run unique() again: the two text answers are gone, but the dtype is still
# object because the remaining numbers are stored as strings, not floats.

df['YearsCode'].unique()

array([nan, '7', '17', '3', '4', '6', '16', '12', '15', '10', '40', '9',
       '26', '14', '39', '20', '8', '19', '5', 0, '22', '2', '1', '34',
       '21', '13', '25', '24', '30', '31', '18', '38', '27', '41', '42',
       '35', '23', '28', '11', '37', '44', '43', '36', '33', '45', '29',
       '50', '46', '32', '47', '49', '48'], dtype=object)

In [35]:
# Cast the column to float; float (rather than int) is needed because the
# column still contains NaN.

df['YearsCode'] = df['YearsCode'].astype(float)

In [36]:
# Display the DataFrame after the cast.
df

# The conversion ran without errors; 'YearsCode' now shows float values
# such as 7.0 and 17.0.

Unnamed: 0,ResponseId,MainBranch,Employment,Country,US_State,UK_Country,EdLevel,Age1stCode,LearnCode,YearsCode,...,Age,Gender,Trans,Sexuality,Ethnicity,Accessibility,MentalHealth,SurveyLength,SurveyEase,ConvertedCompYearly
0,1,I am a developer by profession,"Independent contractor, freelancer, or self-em...",Slovakia,,,"Secondary school (e.g. American high school, G...",18 - 24 years,Coding Bootcamp;Other online resources (ex: vi...,,...,25-34 years old,Man,No,Straight / Heterosexual,White or of European descent,None of the above,None of the above,Appropriate in length,Easy,62268.0
1,2,I am a student who is learning to code,"Student, full-time",Netherlands,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,"Other online resources (ex: videos, blogs, etc...",7.0,...,18-24 years old,Man,No,Straight / Heterosexual,White or of European descent,None of the above,None of the above,Appropriate in length,Easy,
2,3,"I am not primarily a developer, but I write co...","Student, full-time",Russian Federation,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,"Other online resources (ex: videos, blogs, etc...",,...,18-24 years old,Man,No,Prefer not to say,Prefer not to say,None of the above,None of the above,Appropriate in length,Easy,
3,4,I am a developer by profession,Employed full-time,Austria,,,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",11 - 17 years,,,...,35-44 years old,Man,No,Straight / Heterosexual,White or of European descent,I am deaf / hard of hearing,,Appropriate in length,Neither easy nor difficult,
4,5,I am a developer by profession,"Independent contractor, freelancer, or self-em...",United Kingdom of Great Britain and Northern I...,,England,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",5 - 10 years,Friend or family member,17.0,...,25-34 years old,Man,No,,White or of European descent,None of the above,,Appropriate in length,Easy,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83434,83435,I am a developer by profession,Employed full-time,United States of America,Texas,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,"Other online resources (ex: videos, blogs, etc...",6.0,...,25-34 years old,Man,No,Straight / Heterosexual,White or of European descent,None of the above,I have a concentration and/or memory disorder ...,Appropriate in length,Easy,160500.0
83435,83436,I am a developer by profession,"Independent contractor, freelancer, or self-em...",Benin,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,"Other online resources (ex: videos, blogs, etc...",4.0,...,18-24 years old,Man,No,Straight / Heterosexual,Black or of African descent,None of the above,None of the above,Appropriate in length,Easy,3960.0
83436,83437,I am a developer by profession,Employed full-time,United States of America,New Jersey,,"Secondary school (e.g. American high school, G...",11 - 17 years,School,10.0,...,25-34 years old,Man,No,,White or of European descent,None of the above,None of the above,Appropriate in length,Neither easy nor difficult,90000.0
83437,83438,I am a developer by profession,Employed full-time,Canada,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,Online Courses or Certification;Books / Physic...,5.0,...,25-34 years old,Man,No,Straight / Heterosexual,White or of European descent,None of the above,I have a mood or emotional disorder (e.g. depr...,Appropriate in length,Neither easy nor difficult,816816.0


In [37]:
# With 'YearsCode' stored as floats, the mean can now be computed.

df['YearsCode'].mean()

12.232003527639298

In [38]:
# The median works on the float column as well.
df['YearsCode'].median()

10.0

In [None]:
# END! Credits to the Brilliant Data Analysis site and to Corey Schafer on YouTube, my instructor.