# Part 8: Grouping & Aggregating

In [7]:
df['ConvertedComp'].head(15)

ConvertedComp
str
"""NA"""
"""NA"""
"""8820"""
"""61000"""
"""NA"""
…
"""NA"""
"""NA"""
"""90000"""
"""57060"""


In [8]:
# Check median salary
# Doesn't work. Datatype is string
df['ConvertedComp'].median()

In [9]:
# Convert to float
(df
 .with_columns(pl.when(pl.col('ConvertedComp') == 'NA')
               .then(None)
               .otherwise(pl.col('ConvertedComp'))
               .alias('ConvertedComp')
               .cast(pl.Float64))
 .select(pl.col('ConvertedComp'))
 .median() #now it works
)

ConvertedComp
f64
57287.0


In [10]:
# Change all cols that must be numerical
columns_to_process = ['CompTotal', 'ConvertedComp', 'WorkWeekHrs', 'CodeRevHrs', 'Age']

# Loop through the columns and apply the transformation
for col_name in columns_to_process:
    df = df.with_columns(
        pl.when(pl.col(col_name) == 'NA')
        .then(None)
        .when(pl.col(col_name) == 'Unknown')
        .then(None)
        .otherwise(pl.col(col_name))
        .alias(col_name)
        .cast(pl.Float64)
    )

In [11]:
# Check to see their datatypes
df[['CompTotal', 'ConvertedComp', 'WorkWeekHrs', 'CodeRevHrs', 'Age']]

CompTotal,ConvertedComp,WorkWeekHrs,CodeRevHrs,Age
f64,f64,f64,f64,f64
,,,,14.0
,,,,19.0
23000.0,8820.0,40.0,,28.0
61000.0,61000.0,80.0,,22.0
,,55.0,,30.0
…,…,…,…,…
,,,,
,,,,
,,,,
,,,,


In [12]:
# Median for entire dataframe
df.median()
df[['CompTotal', 'ConvertedComp', 'WorkWeekHrs', 'CodeRevHrs', 'Age']].median()

CompTotal,ConvertedComp,WorkWeekHrs,CodeRevHrs,Age
f64,f64,f64,f64,f64
62000.0,57287.0,40.0,4.0,29.0


In [13]:
# Use describe to get summary stats
df.describe()
df[['CompTotal', 'ConvertedComp', 'WorkWeekHrs', 'CodeRevHrs', 'Age']].describe()

describe,CompTotal,ConvertedComp,WorkWeekHrs,CodeRevHrs,Age
str,f64,f64,f64,f64,f64
"""count""",88883.0,88883.0,88883.0,88883.0,88883.0
"""null_count""",32938.0,33060.0,24380.0,39093.0,9673.0
"""mean""",551900000000.0,127110.738423,42.127197,5.084308,30.336699
"""std""",73319000000000.0,284152.303842,37.28761,5.513931,9.17839
"""min""",0.0,0.0,1.0,0.0,1.0
"""25%""",20000.0,25776.0,40.0,2.0,24.0
"""50%""",62000.0,57287.0,40.0,4.0,29.0
"""75%""",120000.0,100000.0,45.0,6.0,35.0
"""max""",1e+16,2000000.0,4850.0,99.0,99.0


In [14]:
# How many people answered yes/no
df['Hobbyist'].value_counts()

Hobbyist,counts
str,u32
"""Yes""",71257
"""No""",17626


In [15]:
# Show which social media is used most
df['SocialMedia']

SocialMedia
str
"""Twitter"""
"""Instagram"""
"""Reddit"""
"""Reddit"""
"""Facebook"""
…
"""YouTube"""
"""NA"""
"""NA"""
"""NA"""


In [16]:
# See the actual question that was asked
schema_df.filter(pl.col('Column') == 'SocialMedia')

Column,QuestionText
str,str
"""SocialMedia""","""What social media site do you use the most?"""


In [17]:
# Show which social media is used most
df['SocialMedia'].value_counts(sort=True)

SocialMedia,counts
str,u32
"""Reddit""",14374
"""YouTube""",13830
"""WhatsApp""",13347
"""Facebook""",13178
"""Twitter""",11398
…,…
"""Snapchat""",628
"""VK ВКонта́кте""",603
"""Weibo 新浪微博""",56
"""Youku Tudou 优酷""",21


In [18]:
# Which social media platform is used the most in a specific country
# Combine country and social media cols. This is where groupby comes in handy
# First find which country answered this survey the most.

df['Country'].value_counts(sort=True)

Country,counts
str,u32
"""United States""",20949
"""India""",9061
"""Germany""",5866
"""United Kingdom""",5737
"""Canada""",3395
…,…
"""Papua New Guinea""",1
"""Saint Kitts and Nevis""",1
"""Saint Vincent and the Grenadines""",1
"""Sao Tome and Principe""",1


In [19]:
# Group by country
country_grp = df.group_by('Country')

In [20]:
# Most widely used social media platform in India
(df
 .filter(pl.col('Country') == 'India')
 ['SocialMedia'].value_counts(sort=True)
 )

SocialMedia,counts
str,u32
"""WhatsApp""",2990
"""YouTube""",1820
"""LinkedIn""",955
"""Facebook""",841
"""Instagram""",822
…,…
"""Hello""",5
"""WeChat 微信""",5
"""VK ВКонта́кте""",4
"""Youku Tudou 优酷""",2


In [21]:
df[['Country','SocialMedia']]

Country,SocialMedia
str,str
"""United Kingdom""","""Twitter"""
"""Bosnia and Herzegovina""","""Instagram"""
"""Thailand""","""Reddit"""
"""United States""","""Reddit"""
"""Ukraine""","""Facebook"""
…,…
"""Canada""","""YouTube"""
"""NA""","""NA"""
"""NA""","""NA"""
"""NA""","""NA"""


In [22]:
# Finally!
(df
 .group_by(['Country', 'SocialMedia'])
 .agg(pl.col('SocialMedia').count().alias('Count'))
 .sort('Country')
#  .filter(pl.col('Country') == 'India')
#  .sort('Count', descending=True)
 )

Country,SocialMedia,Count
str,str,u32
"""Afghanistan""","""I don't use social media""",6
"""Afghanistan""","""Twitter""",1
"""Afghanistan""","""WhatsApp""",4
"""Afghanistan""","""Instagram""",1
"""Afghanistan""","""Facebook""",15
…,…,…
"""Zimbabwe""","""Reddit""",1
"""Zimbabwe""","""WhatsApp""",20
"""Zimbabwe""","""Instagram""",2
"""Zimbabwe""","""Facebook""",3


In [23]:
# See median salary for each country
(df
 .group_by('Country')
 .agg(pl.col('ConvertedComp').median())
#  .filter(pl.col('Country') == 'Germany')
 )

Country,ConvertedComp
str,f64
"""Dominica""",
"""Hungary""",26412.0
"""Luxembourg""",74364.0
"""Mali""",10476.0
"""Democratic Republic of the Congo""",110484.0
…,…
"""Swaziland""",62371.0
"""San Marino""",301788.0
"""India""",10080.0
"""North Korea""",


In [24]:
# Multiple aggregates
(df
 .group_by('Country')
#  .agg(pl.median('ConvertedComp'), 
#       pl.mean('ConvertedComp')) #GOTCHA!
 .agg([pl.median('ConvertedComp').prefix('median_'),
       pl.mean('ConvertedComp').prefix('mean_')])
 )

Country,median_ConvertedComp,mean_ConvertedComp
str,f64,f64
"""United Republic of Tanzania""",5192.0,34691.117647
"""Bosnia and Herzegovina""",18360.0,24996.283333
"""Mongolia""",8562.0,8745.5
"""Saudi Arabia""",43200.0,46463.408163
"""Guinea""",1956.0,1956.0
…,…,…
"""Georgia""",13548.0,42008.204082
"""Singapore""",57758.5,120621.50641
"""South Africa""",38354.0,63550.602871
"""Spain""",40101.0,88724.357262


In [25]:
# Median and Mean for Canada
(df
 .group_by('Country')
 .agg([pl.median('ConvertedComp').prefix('median_'),
       pl.mean('ConvertedComp').prefix('mean_')])
 .filter(pl.col('Country') == 'Canada')
#  .melt()
 )

Country,median_ConvertedComp,mean_ConvertedComp
str,f64,f64
"""Canada""",68705.0,134018.564909


In [26]:
# How many people from India know Python
(df
 .filter((pl.col('Country') == 'India')
         & pl.col('LanguageWorkedWith').str.contains('Python'))
 .height
 )

3105

In [27]:
## CHALLENGE
# Percentage of people from each country who know Python
country_respondents = df['Country'].value_counts().rename({'counts': 'NumRespondents'})
country_respondents

Country,NumRespondents
str,u32
"""Turkey""",949
"""Australia""",1903
"""Andorra""",7
"""Indonesia""",507
"""Lesotho""",3
…,…
"""Tunisia""",130
"""Costa Rica""",84
"""Sri Lanka""",372
"""Sierra Leone""",2


In [29]:
# worked!
(df
 .filter(pl.col('LanguageWorkedWith').str.contains('Python'))
 .group_by('Country')
 .agg(pl.col('LanguageWorkedWith').str.contains('Python').count().alias('NumKnowsPython'))
 )

Country,NumKnowsPython
str,u32
"""Pakistan""",251
"""Belarus""",69
"""Armenia""",28
"""Chile""",102
"""Ukraine""",246
…,…
"""Zambia""",4
"""Iran""",268
"""Côte d'Ivoire""",11
"""Libyan Arab Jamahiriya""",2


In [30]:
# Create dataframe for people who know python
python_df = (df
 .filter(pl.col('LanguageWorkedWith').str.contains('Python'))
 .group_by('Country')
 .agg(pl.col('LanguageWorkedWith').str.contains('Python').count().alias('NumKnowsPython'))
 )
python_df

Country,NumKnowsPython
str,u32
"""Malta""",20
"""Democratic Republic of the Congo""",3
"""Slovakia""",101
"""Ireland""",209
"""Qatar""",5
…,…
"""Hong Kong (S.A.R.)""",88
"""Cameroon""",16
"""Azerbaijan""",17
"""Peru""",47


In [31]:
# Join the dataframes
python_df = country_respondents.join(python_df, on='Country', how='left')
python_df

Country,NumRespondents,NumKnowsPython
str,u32,u32
"""Turkey""",949,371
"""Australia""",1903,790
"""Andorra""",7,
"""Indonesia""",507,148
"""Lesotho""",3,1
…,…,…
"""Tunisia""",130,40
"""Costa Rica""",84,30
"""Sri Lanka""",372,137
"""Sierra Leone""",2,1


In [35]:
(python_df
 .with_columns(((pl.col('NumKnowsPython') / pl.col('NumRespondents'))*100).alias('PctKnowsPython'))
 .sort('PctKnowsPython', descending=True) #A little misleading for 1 respondent.
 .filter(pl.col('Country') == 'United States') #see pct for USA
 )

Country,NumRespondents,NumKnowsPython,PctKnowsPython
str,u32,u32,f64
"""United States""",20949,10083,48.131176
