# Part 9: Data Cleaning

In [59]:
# Sample data with several kinds of "missing" entries mixed in:
# np.nan, None, and the placeholder strings 'NA' / 'Missing'.
authors = {
    'first': ['Ernest', 'Stephen', 'Harper', 'J.K.', np.nan, None, 'NA'],
    'last': ['Hemingway', 'King', 'Lee', 'Rowling', np.nan, np.nan, 'Missing'],
    'email': ['ehemingway@email.com', 'sking@email.com', 'hlee@email.com', None, np.nan, 'mangelou@email.com', 'NA'],
    # ages are given as strings, not numbers
    'age': ['33', '55', '63', '36', None, None, 'Missing']
}

# Build the Polars DataFrame and display it
ff = pl.DataFrame(authors)
ff

first,last,email,age
str,str,str,str
"""Ernest""","""Hemingway""","""ehemingway@email.com""","""33"""
"""Stephen""","""King""","""sking@email.com""","""55"""
"""Harper""","""Lee""","""hlee@email.com""","""63"""
"""J.K.""","""Rowling""",,"""36"""
,,,
,,"""mangelou@email.com""",
"""NA""","""Missing""","""NA""","""Missing"""


In [60]:
# Drop every row that has a null in any column (first, last, email or age)
ff.drop_nulls()

first,last,email,age
str,str,str,str
"""Ernest""","""Hemingway""","""ehemingway@email.com""","""33"""
"""Stephen""","""King""","""sking@email.com""","""55"""
"""Harper""","""Lee""","""hlee@email.com""","""63"""
"""NA""","""Missing""","""NA""","""Missing"""


In [61]:
# Keep every row that has at least one non-null value,
# i.e. drop only the rows in which all the values are missing
ff.filter(pl.any_horizontal(pl.all().is_not_null()))

first,last,email,age
str,str,str,str
"""Ernest""","""Hemingway""","""ehemingway@email.com""","""33"""
"""Stephen""","""King""","""sking@email.com""","""55"""
"""Harper""","""Lee""","""hlee@email.com""","""63"""
"""J.K.""","""Rowling""",,"""36"""
,,"""mangelou@email.com""",
"""NA""","""Missing""","""NA""","""Missing"""


In [62]:
# Drop missing values in specific column
# Target email to Drop JK Rowling
ff.drop_nulls(subset='email')
# ff.drop_nulls(subset=('last','email')) #either column

first,last,email,age
str,str,str,str
"""Ernest""","""Hemingway""","""ehemingway@email.com""","""33"""
"""Stephen""","""King""","""sking@email.com""","""55"""
"""Harper""","""Lee""","""hlee@email.com""","""63"""
,,"""mangelou@email.com""",
"""NA""","""Missing""","""NA""","""Missing"""


In [63]:
# Drop a row only when BOTH `last` and `email` are missing.
# Test for nulls with is_null() instead of `== None`: an equality comparison
# against null is not a reliable null check in Polars (in current versions
# it yields null rather than True/False, so the filter would misbehave).
ff.filter(~(pl.col('last').is_null() & pl.col('email').is_null()))

first,last,email,age
str,str,str,str
"""Ernest""","""Hemingway""","""ehemingway@email.com""","""33"""
"""Stephen""","""King""","""sking@email.com""","""55"""
"""Harper""","""Lee""","""hlee@email.com""","""63"""
"""J.K.""","""Rowling""",,"""36"""
,,"""mangelou@email.com""",
"""NA""","""Missing""","""NA""","""Missing"""


In [64]:
# Same sample data as before, plus a column that is None in every row
authors_ = {
    'first': ['Ernest', 'Stephen', 'Harper', 'J.K.', np.nan, None, 'NA'],
    'last': ['Hemingway', 'King', 'Lee', 'Rowling', np.nan, np.nan, 'Missing'],
    'email': ['ehemingway@email.com', 'sking@email.com', 'hlee@email.com', None, np.nan, 'mangelou@email.com', 'NA'],
    'age': ['33', '55', '63', '36', None, None, 'Missing'],
    # entirely empty column, used below to demonstrate dropping all-null columns
    'bestseller': [None,None,None,None,None,None,None]
}

# Build the Polars DataFrame and display it
gg = pl.DataFrame(authors_)
gg

first,last,email,age,bestseller
str,str,str,str,f32
"""Ernest""","""Hemingway""","""ehemingway@email.com""","""33""",
"""Stephen""","""King""","""sking@email.com""","""55""",
"""Harper""","""Lee""","""hlee@email.com""","""63""",
"""J.K.""","""Rowling""",,"""36""",
,,,,
,,"""mangelou@email.com""",,
"""NA""","""Missing""","""NA""","""Missing""",


In [65]:
# Keep only the columns whose null count differs from the row count,
# i.e. drop the columns that are null in every row (here: `bestseller`)
gg[[col.name for col in gg if col.null_count() != gg.height]]

first,last,email,age
str,str,str,str
"""Ernest""","""Hemingway""","""ehemingway@email.com""","""33"""
"""Stephen""","""King""","""sking@email.com""","""55"""
"""Harper""","""Lee""","""hlee@email.com""","""63"""
"""J.K.""","""Rowling""",,"""36"""
,,,
,,"""mangelou@email.com""",
"""NA""","""Missing""","""NA""","""Missing"""


In [36]:
# Show original dataframe
gg

first,last,email,age,bestseller
str,str,str,str,f32
"""Ernest""","""Hemingway""","""ehemingway@email.com""","""33""",
"""Stephen""","""King""","""sking@email.com""","""55""",
"""Harper""","""Lee""","""hlee@email.com""","""63""",
"""J.K.""","""Rowling""",,"""36""",
,,,,
,,"""mangelou@email.com""",,
"""NA""","""Missing""","""NA""","""Missing""",


In [37]:
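# Explanation: the null count of `bestseller` equals the number of rows
# (gg.height), so the column is null in every row and gets dropped above.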
gg['bestseller'].null_count()
# gg.height

7

In [38]:
# Goal: turn the 'NA' placeholder in `first` into a real missing value (null).

# Attempt 1: swap 'NA' for an empty string
# ff['first'].str.replace('NA','')#.drop_nulls()

# Attempt 2: swap 'NA' for np.nan
# ff['first'].str.replace('NA', np.nan)#.drop_nulls()

# This works: when/then/otherwise writes None where `first` equals 'NA'
# and keeps the original value everywhere else.
(ff
 .select(pl.when(pl.col('first') == 'NA')
                .then(None)
                .otherwise(pl.col('first'))
                .alias('first'))
#  .drop_nulls()
 )



first
str
"""Ernest"""
"""Stephen"""
"""Harper"""
"""J.K."""
""
""
""


In [66]:
# Make function to change in every single column.
def na_to_null(ff, markers=('NA', 'Missing')):
    """Replace placeholder strings with nulls in every column of `ff`.

    Any cell whose value is one of `markers` becomes null; all other cells
    (including cells that are already null) are left unchanged.

    Parameters
    ----------
    ff : pl.DataFrame
        Frame to clean. Every column is compared against the string
        markers, so the columns are expected to hold strings.
    markers : collection of str, optional
        Placeholder values that stand for "missing". Defaults to
        ('NA', 'Missing').

    Returns
    -------
    pl.DataFrame
        A new frame with the same columns, placeholders replaced by null.
    """
    placeholders = list(markers)

    # Build one expression per column and hand them all to a single
    # with_columns call instead of creating a new frame per column.
    return ff.with_columns(
        [
            pl.when(pl.col(column).is_in(placeholders))
            .then(None)
            .otherwise(pl.col(column))
            .alias(column)
            for column in ff.columns
        ]
    )

ff = ff.pipe(na_to_null)
ff


first,last,email,age
str,str,str,str
"""Ernest""","""Hemingway""","""ehemingway@email.com""","""33"""
"""Stephen""","""King""","""sking@email.com""","""55"""
"""Harper""","""Lee""","""hlee@email.com""","""63"""
"""J.K.""","""Rowling""",,"""36"""
,,,
,,"""mangelou@email.com""",
,,,


In [67]:
# Check data type
ff.dtypes

[Utf8, Utf8, Utf8, Utf8]

In [42]:
# Change datatype for age (str -> Int8) so it can be averaged.
# The placeholder strings must already have been replaced by nulls
# (see na_to_null); null ages are skipped by mean().
ff['age'].cast(pl.Int8).mean()

46.75

In [43]:
df

Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,EduOther,OrgSize,DevType,YearsCode,Age1stCode,YearsCodePro,CareerSat,JobSat,MgrIdiot,MgrMoney,MgrWant,JobSeek,LastHireDate,LastInt,FizzBuzz,JobFactors,ResumeUpdate,CurrencySymbol,CurrencyDesc,CompTotal,CompFreq,ConvertedComp,WorkWeekHrs,WorkPlan,WorkChallenge,WorkRemote,WorkLoc,…,PlatformDesireNextYear,WebFrameWorkedWith,WebFrameDesireNextYear,MiscTechWorkedWith,MiscTechDesireNextYear,DevEnviron,OpSys,Containers,BlockchainOrg,BlockchainIs,BetterLife,ITperson,OffOn,SocialMedia,Extraversion,ScreenName,SOVisit1st,SOVisitFreq,SOVisitTo,SOFindAnswer,SOTimeSaved,SOHowMuchTime,SOAccount,SOPartFreq,SOJobs,EntTeams,SOComm,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
1,"""I am a student who is learning to code""","""Yes""","""Never""","""The quality of OSS and closed source software is …","""Not employed, and not looking for work""","""United Kingdom""","""No""","""Primary/elementary school""","""NA""","""Taught yourself a new language, framework, or too…","""NA""","""NA""","""4""","""10""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",…,"""Android;Arduino;Windows""","""Django;Flask""","""Flask;jQuery""","""Node.js""","""Node.js""","""IntelliJ;Notepad++;PyCharm""","""Windows""","""I do not use containers""","""NA""","""NA""","""Yes""","""Fortunately, someone else has that title""","""Yes""","""Twitter""","""Online""","""Username""","""2017""","""A few times per month or weekly""","""Find answers to specific questions;Learn how to d…","""3-5 times per week""","""Stack Overflow was much faster""","""31-60 minutes""","""No""","""NA""","""No, I didn't know that Stack Overflow had a job b…","""No, and I don't know what those are""","""Neutral""","""Just as welcome now as I felt last year""","""Tech articles written by other developers;Industr…","""14""","""Man""","""No""","""Straight / Heterosexual""","""NA""","""No""","""Appropriate in length""","""Neither easy nor difficult"""
2,"""I am a student who is learning to code""","""No""","""Less than once per year""","""The quality of OSS and closed source software is …","""Not employed, but looking for work""","""Bosnia and Herzegovina""","""Yes, full-time""","""Secondary school (e.g. American high school, Germ…","""NA""","""Taken an online course in programming or software…","""NA""","""Developer, desktop or enterprise applications;Dev…","""NA""","""17""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""I am actively looking for a job""","""I've never had a job""","""NA""","""NA""","""Financial performance or funding status of the co…","""Something else changed (education, award, media, …","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",…,"""Windows""","""Django""","""Django""","""NA""","""NA""","""Atom;PyCharm""","""Windows""","""I do not use containers""","""NA""","""Useful across many domains and could change many …","""Yes""","""Yes""","""Yes""","""Instagram""","""Online""","""Username""","""2017""","""Daily or almost daily""","""Find answers to specific questions;Learn how to d…","""3-5 times per week""","""Stack Overflow was much faster""","""11-30 minutes""","""Yes""","""A few times per month or weekly""","""No, I knew that Stack Overflow had a job board bu…","""No, and I don't know what those are""","""Yes, somewhat""","""Just as welcome now as I felt last year""","""Tech articles written by other developers;Industr…","""19""","""Man""","""No""","""Straight / Heterosexual""","""NA""","""No""","""Appropriate in length""","""Neither easy nor difficult"""
3,"""I am not primarily a developer, but I write code …","""Yes""","""Never""","""The quality of OSS and closed source software is …","""Employed full-time""","""Thailand""","""No""","""Bachelor’s degree (BA, BS, B.Eng., etc.)""","""Web development or web design""","""Taught yourself a new language, framework, or too…","""100 to 499 employees""","""Designer;Developer, back-end;Developer, front-end…","""3""","""22""","""1""","""Slightly satisfied""","""Slightly satisfied""","""Not at all confident""","""Not sure""","""Not sure""","""I’m not actively looking, but I am open to new op…","""1-2 years ago""","""Interview with people in peer roles""","""No""","""Languages, frameworks, and other technologies I'd…","""I was preparing for a job search""","""THB""","""Thai baht""","""23000""","""Monthly""","""8820""","""40""","""There's no schedule or spec; I work on what seems…","""Distracting work environment;Inadequate access to…","""Less than once per month / Never""","""Home""",…,"""NA""","""NA""","""Other(s):""","""NA""","""NA""","""Vim;Visual Studio Code""","""Linux-based""","""I do not use containers""","""NA""","""NA""","""Yes""","""Yes""","""Yes""","""Reddit""","""In real life (in person)""","""Username""","""2011""","""A few times per week""","""Find answers to specific questions;Learn how to d…","""6-10 times per week""","""They were about the same""","""NA""","""Yes""","""Less than once per month or monthly""","""Yes""","""No, I've heard of them, but I am not part of a pr…","""Neutral""","""Just as welcome now as I felt last year""","""Tech meetups or events in your area;Courses on te…","""28""","""Man""","""No""","""Straight / Heterosexual""","""NA""","""Yes""","""Appropriate in length""","""Neither easy nor difficult"""
4,"""I am a developer by profession""","""No""","""Never""","""The quality of OSS and closed source software is …","""Employed full-time""","""United States""","""No""","""Bachelor’s degree (BA, BS, B.Eng., etc.)""","""Computer science, computer engineering, or softwa…","""Taken an online course in programming or software…","""100 to 499 employees""","""Developer, full-stack""","""3""","""16""","""Less than 1 year""","""Very satisfied""","""Slightly satisfied""","""Very confident""","""No""","""Not sure""","""I am not interested in new job opportunities""","""Less than a year ago""","""Write code by hand (e.g., on a whiteboard);Interv…","""No""","""Languages, frameworks, and other technologies I'd…","""I was preparing for a job search""","""USD""","""United States dollar""","""61000""","""Yearly""","""61000""","""80""","""There's no schedule or spec; I work on what seems…","""NA""","""Less than once per month / Never""","""Home""",…,"""Linux;Windows""","""NA""","""NA""",""".NET""",""".NET""","""Eclipse;Vim;Visual Studio;Visual Studio Code""","""Windows""","""I do not use containers""","""Not at all""","""Useful for decentralized currency (i.e., Bitcoin)…","""Yes""","""SIGH""","""Yes""","""Reddit""","""In real life (in person)""","""Username""","""2014""","""Daily or almost daily""","""Find answers to specific questions;Pass the time …","""1-2 times per week""","""Stack Overflow was much faster""","""31-60 minutes""","""Yes""","""Less than once per month or monthly""","""Yes""","""No, and I don't know what those are""","""No, not really""","""Just as welcome now as I felt last year""","""Tech articles written by other developers;Industr…","""22""","""Man""","""No""","""Straight / Heterosexual""","""White or of European descent""","""No""","""Appropriate in length""","""Easy"""
5,"""I am a developer by profession""","""Yes""","""Once a month or more often""","""OSS is, on average, of HIGHER quality than propri…","""Employed full-time""","""Ukraine""","""No""","""Bachelor’s degree (BA, BS, B.Eng., etc.)""","""Computer science, computer engineering, or softwa…","""Taken an online course in programming or software…","""10,000 or more employees""","""Academic researcher;Developer, desktop or enterpr…","""16""","""14""","""9""","""Very dissatisfied""","""Slightly dissatisfied""","""Somewhat confident""","""Yes""","""No""","""I am not interested in new job opportunities""","""Less than a year ago""","""Write any code;Write code by hand (e.g., on a whi…","""No""","""Industry that I'd be working in;Languages, framew…","""I was preparing for a job search""","""UAH""","""Ukrainian hryvnia""","""NA""","""NA""","""NA""","""55""","""There is a schedule and/or spec (made by me or by…","""Being tasked with non-development work;Inadequate…","""A few days each month""","""Office""",…,"""Android;Docker;Kubernetes;Linux;Slack""","""Django;Express;Flask;jQuery;React.js;Spring""","""Flask;jQuery;React.js;Spring""","""Cordova;Node.js""","""Apache Spark;Hadoop;Node.js;React Native""","""IntelliJ;Notepad++;Vim""","""Linux-based""","""Outside of work, for personal projects""","""Not at all""","""NA""","""Yes""","""Also Yes""","""Yes""","""Facebook""","""In real life (in person)""","""Username""","""I don't remember""","""Multiple times per day""","""Find answers to specific questions""","""More than 10 times per week""","""Stack Overflow was much faster""","""NA""","""Yes""","""A few times per month or weekly""","""No, I knew that Stack Overflow had a job board bu…","""No, I've heard of them, but I am not part of a pr…","""Yes, definitely""","""Just as welcome now as I felt last year""","""Tech meetups or events in your area;Courses on te…","""30""","""Man""","""No""","""Straight / Heterosexual""","""White or of European 
descent;Multiracial""","""No""","""Appropriate in length""","""Easy"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
88377,"""NA""","""Yes""","""Less than once a month but more than once per yea…","""The quality of OSS and closed source software is …","""Not employed, and not looking for work""","""Canada""","""No""","""Primary/elementary school""","""NA""","""Taught yourself a new language, framework, or too…","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",…,"""Google Cloud Platform;Linux""","""jQuery""","""jQuery;Vue.js""","""Node.js""","""React Native;Unity 3D;Unreal Engine""","""Atom;Visual Studio;Visual Studio Code""","""Windows""","""I do not use containers""","""NA""","""Useful across many domains and could change many …","""Yes""","""Yes""","""What?""","""YouTube""","""NA""","""Username""","""I don't remember""","""A few times per week""","""Find answers to specific questions;Learn how to d…","""3-5 times per week""","""Stack Overflow was slightly faster""","""11-30 minutes""","""Yes""","""I have never participated in Q&A on Stack Overflo…","""No, I knew that Stack Overflow had a job board bu…","""No, I've heard of them, but I am not part of a pr…","""No, not at all""","""NA""","""Tech articles written by other developers;Tech me…","""NA""","""Man""","""No""","""NA""","""NA""","""No""","""Appropriate in length""","""Easy"""
88601,"""NA""","""No""","""Never""","""The quality of OSS and closed source software is …","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",…,"""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA"""
88802,"""NA""","""No""","""Never""","""NA""","""Employed full-time""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",…,"""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA"""
88816,"""NA""","""No""","""Never""","""OSS is, on average, of HIGHER quality than propri…","""Independent contractor, freelancer, or self-emplo…","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",…,"""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA"""


In [49]:
# Calculation doesn't work: YearsCode is a str column (see the schema above),
# so there is nothing numeric to average yet.
df['YearsCode'].mean()

In [56]:
# Convert the YearsCode answers to integers and take the median:
#   'NA'                 -> null
#   'Less than 1 year'   -> 0
#   'More than 50 years' -> 51
#   anything else        -> kept as-is, then cast to Int8
(df
 .select(
     pl.when(pl.col('YearsCode') == 'NA')
     .then(None)
     .when(pl.col('YearsCode') == 'Less than 1 year')
     .then(0)
     .when(pl.col('YearsCode') == 'More than 50 years')
     .then(51)
     .otherwise(pl.col('YearsCode'))
     .cast(pl.Int8)
     .alias('YearsCode')
 )
 .median()
 )

YearsCode
f64
9.0
