In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
import altair as alt

df = pd.read_csv('./developer_survey_2020/survey_results_public.csv')
schema_df = pd.read_csv('./developer_survey_2020/survey_results_schema.csv',encoding= 'unicode_escape')
df.head()

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
0,1,I am a developer by profession,Yes,,13,Monthly,,,Germany,European Euro,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET;ASP.NET Core,Just as welcome now as I felt last year,50.0,36,27.0
1,2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,...,,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4.0
2,3,I code primarily as a hobby,Yes,,15,,,,Russian Federation,,...,Neither easy nor difficult,Appropriate in length,,,,,Somewhat more welcome now than last year,,4,
3,4,I am a developer by profession,Yes,25.0,18,,,,Albania,Albanian lek,...,,,No,"Computer science, computer engineering, or sof...",,,Somewhat less welcome now than last year,40.0,7,4.0
4,5,"I used to be a developer by profession, but no...",Yes,31.0,16,,,,United States,,...,Easy,Too short,No,"Computer science, computer engineering, or sof...",Django;Ruby on Rails,Ruby on Rails,Just as welcome now as I felt last year,,15,8.0


In [2]:
df = df.dropna(subset= ['DevType'])

## Objective:  To Do -- create tests for the functions

In [3]:
def clean_counts(df, col, values, col1, col2='Number_of_Developers'):
    raw_df = df[col].value_counts().reset_index()
    clean_df = defaultdict(int)
    for val in values:
        for index, row in raw_df.iterrows():
            if val in list(row)[0]:
                clean_df[val] += int(list(row)[1])
    clean_df = pd.DataFrame(pd.Series(clean_df)).reset_index()
    clean_df.columns = [col1, col2]
    clean_df = clean_df.reset_index(drop=True)
    return clean_df
    
    
def create_plot(df1, df2, y_axis):
    past_df = df1.copy()
    next_df = df2.copy()
    past_df['usage'] = 'Worked with in PAST year'
    next_df['usage'] = 'Want to work with NEXT year'
    concat_df = pd.concat([past_df, next_df])
    plot = alt.Chart(concat_df).mark_bar(opacity=0.9).encode(
        x=alt.X("Number_of_Developers", title="Number of Developers"),
        y=alt.Y(y_axis, sort='-x'),
        color='usage'
    ).properties(width=650)
    return plot
   
def diff_plot(df1, df2, x_axis):
    copy = df1.copy()
    copy['Number_of_Developers'] = \
        df2['Number_of_Developers'] - df1['Number_of_Developers']
    diff_plot = alt.Chart(copy).mark_bar().encode(
        x=x_axis,
        y=alt.Y("Number_of_Developers", title="Number of Developers"),
        color=alt.condition(
            alt.datum.Number_of_Developers > 0,
            alt.value("steelblue"),  # The positive color
            alt.value("orange")  # The negative color
        )
    ).properties(width=650)
    return diff_plot
    

In [4]:
data_jobs = ['Data or business analyst', 'Data or business analyst', 'Engineer, data']

In [5]:
def job_filter(devtype, data_jobs):
    #data_jobs = ['Data or business analyst', 'Data or business analyst', 'Engineer, data']
    for job in data_jobs:
        if job in devtype:
            return 1
    return 0

In [6]:
test1 =  'Developer, back-end;Educator'
test2 = 'Database administrator;Developer, back-end;Developer, front-end;Developer, full-stack'
test3 = 'Data scientist or machine learning specialist;Developer, back-end;Engineer, data'
job_filter(test1, data_jobs)
job_filter(test2, data_jobs)
job_filter(test3, data_jobs)

1

## How many people answering the survey have data science related jobs?

In [7]:
df['DevType'].apply(job_filter, data_jobs= data_jobs).sum()

6653

In [8]:
df['dataJob'] = df['DevType'].apply(job_filter, data_jobs= data_jobs)

## Select data related jobs:

In [11]:
data_jobs_df = df[df['dataJob'] == 1] 

# What are the most popular programing languages being used by data scientist, data analysts and data engineers in 2020?

In [12]:
languages = ['Assembly', 'Bash/Shell/PowerShell', 'C','C#','C++','Dart' ,'Go', 'Haskell',
             'HTML/CSS', 'Java', 'JavaScript', 'Julia','Kotlin', 'Objective-C', 'Perl',
             'PHP', 'Python', 'R']

In [17]:
LanguageWorkedWith = clean_counts(data_jobs_df,'LanguageWorkedWith', languages, 'language')

In [15]:
LanguageDesireNextYear = clean_counts(data_jobs_df,'LanguageDesireNextYear', languages, 'language')

In [20]:
create_plot(LanguageWorkedWith, LanguageDesireNextYear, 'language')

In [21]:
diff_plot(LanguageWorkedWith, LanguageDesireNextYear, 'language')

# What are the most popular database environments used by Data Scientists in 2020?

In [22]:
databases = ['Cassandra', 'Couchbase', 'DynamoDB', 'Elasticsearch', 'Firebase',
             'IBM DB2', 'MariaDB', 'Microsoft', 'SQL Server', 'MongoDB', 'MySQL', 
             'Oracle']

In [25]:
DatabaseWorkedWith = clean_counts(data_jobs_df,'DatabaseWorkedWith', databases, 'database')

# What are the most desired databases for next year?

In [24]:
DatabaseDesireNextYear = clean_counts(data_jobs_df,'DatabaseDesireNextYear', databases, 'database')

In [27]:
create_plot(DatabaseWorkedWith, DatabaseDesireNextYear, 'database')

In [28]:
diff_plot(DatabaseWorkedWith, DatabaseDesireNextYear, 'database')

# What are the most used platforms by data scientists in 2020?

In [29]:
platforms = ['Android','Arduino', 'AWS', 'Docker', 'Google Cloud Platform', 'Heroku', 
             'IBM Cloud or Watson iOS', 'Kubernetes', 'Linux', 'MacOS', 'Microsoft Azure', 
             'Raspberry Pi', 'Slack Apps and Integrations Windows'] 

In [30]:
PlatformWorkedWith = clean_counts(data_jobs_df,'PlatformWorkedWith', platforms, 'platforms')

# What platforms do data scientists want to work with NEXT year?

In [31]:
PlatformDesireNextYear= clean_counts(data_jobs_df,'PlatformDesireNextYear', platforms, 'platforms')

In [32]:
create_plot(PlatformWorkedWith, PlatformDesireNextYear, 'platforms')

In [33]:
diff_plot(PlatformWorkedWith, PlatformDesireNextYear, 'platforms')

# what are the most used frameworks for data scientists?

In [34]:
other_framworks = ['.NET', '.NET', 'Core', 'Ansible', 'Apache Spark', 'Chef', 
                 'Cordova', 'Flutter', 'Hadoop', 'Keras', 'Node.js', 'Pandas', 
                 'Puppet', 'React', 'Native', 'TensorFlow', 'Terraform', 'Torch/PyTorch', 'Unity', 
                 '3D Unreal Engine'] 

In [38]:
MiscTechWorkedWith = clean_counts(data_jobs_df,'MiscTechWorkedWith', other_framworks, 'frameworks')

# what are the most desired frameworks for data scientists for the future?

In [39]:
MiscTechDesireNextYear = clean_counts(data_jobs_df,'MiscTechDesireNextYear', other_framworks, 'frameworks')

In [41]:
create_plot(MiscTechWorkedWith, MiscTechDesireNextYear, 'frameworks')