In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [2]:
df = pd.read_csv('survey_results_public.csv')

In [3]:
df.head(5)

Unnamed: 0,ResponseId,MainBranch,Employment,Country,US_State,UK_Country,EdLevel,Age1stCode,LearnCode,YearsCode,...,Age,Gender,Trans,Sexuality,Ethnicity,Accessibility,MentalHealth,SurveyLength,SurveyEase,ConvertedCompYearly
0,1,I am a developer by profession,"Independent contractor, freelancer, or self-em...",Slovakia,,,"Secondary school (e.g. American high school, G...",18 - 24 years,Coding Bootcamp;Other online resources (ex: vi...,,...,25-34 years old,Man,No,Straight / Heterosexual,White or of European descent,None of the above,None of the above,Appropriate in length,Easy,62268.0
1,2,I am a student who is learning to code,"Student, full-time",Netherlands,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,"Other online resources (ex: videos, blogs, etc...",7.0,...,18-24 years old,Man,No,Straight / Heterosexual,White or of European descent,None of the above,None of the above,Appropriate in length,Easy,
2,3,"I am not primarily a developer, but I write co...","Student, full-time",Russian Federation,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,"Other online resources (ex: videos, blogs, etc...",,...,18-24 years old,Man,No,Prefer not to say,Prefer not to say,None of the above,None of the above,Appropriate in length,Easy,
3,4,I am a developer by profession,Employed full-time,Austria,,,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",11 - 17 years,,,...,35-44 years old,Man,No,Straight / Heterosexual,White or of European descent,I am deaf / hard of hearing,,Appropriate in length,Neither easy nor difficult,
4,5,I am a developer by profession,"Independent contractor, freelancer, or self-em...",United Kingdom of Great Britain and Northern I...,,England,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",5 - 10 years,Friend or family member,17.0,...,25-34 years old,Man,No,,White or of European descent,None of the above,,Appropriate in length,Easy,


In [4]:
df = df[["Country","EdLevel","YearsCodePro","Employment","ConvertedCompYearly"]]
df = df.rename({"ConvertedCompYearly":"Salary"},axis=1)

In [5]:
df.head()

Unnamed: 0,Country,EdLevel,YearsCodePro,Employment,Salary
0,Slovakia,"Secondary school (e.g. American high school, G...",,"Independent contractor, freelancer, or self-em...",62268.0
1,Netherlands,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",,"Student, full-time",
2,Russian Federation,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",,"Student, full-time",
3,Austria,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",,Employed full-time,
4,United Kingdom of Great Britain and Northern I...,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",10.0,"Independent contractor, freelancer, or self-em...",


In [6]:
df = df[df["Salary"].notnull()]

In [7]:
df.head()

Unnamed: 0,Country,EdLevel,YearsCodePro,Employment,Salary
0,Slovakia,"Secondary school (e.g. American high school, G...",,"Independent contractor, freelancer, or self-em...",62268.0
9,Sweden,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",4.0,Employed full-time,51552.0
11,Spain,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",5.0,Employed full-time,46482.0
12,Germany,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",6.0,Employed full-time,77290.0
16,Turkey,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",2.0,Employed full-time,17748.0


In [8]:
df = df.dropna()

In [9]:
df.isnull().sum()

Country         0
EdLevel         0
YearsCodePro    0
Employment      0
Salary          0
dtype: int64

In [10]:
df = df[df["Employment"] == "Employed, full-time"]

In [11]:
df = df.drop("Employment",axis=1)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country       0 non-null      object 
 1   EdLevel       0 non-null      object 
 2   YearsCodePro  0 non-null      object 
 3   Salary        0 non-null      float64
dtypes: float64(1), object(3)
memory usage: 0.0+ bytes


In [13]:
df['Country'].value_counts()

Series([], Name: Country, dtype: int64)

In [14]:
def shorten_category(categories,cutoff):
    categorical_map = {}
    for i in range(len(categories)):
        if categories.values[i] >= cutoff:
            categorical_map[categories.index[i]] = categories.index[i]
        else:
            categorical_map[categories.index[i]] = 'Other'
    return categorical_map

In [15]:
country_map = shorten_category(df.Country.value_counts(),400)
df['Country'] = df['Country'].map(country_map)
df.Country.value_counts()

Series([], Name: Country, dtype: int64)

In [16]:
df = df[df['Salary'] <= 250000]
df = df[df['Salary'] <=10000]
df = df[df['Country'] != 'Other']

In [17]:
df['YearsCodePro'].unique()

array([], dtype=object)

In [18]:
def clean_experience(x):
    if x == 'More than 50 years':
        return 50
    if x == 'Less than 1 year':
        return 0.5
    return float(x)

df['YearsCodePro'] = df['YearsCodePro'].apply(clean_experience)

In [19]:
df['YearsCodePro'].unique()

array([], dtype=object)

In [20]:
df['EdLevel'].unique()

array([], dtype=object)

In [21]:
def clean_education(x):
    if x == 'Bachelor’s degree (B.A., B.S., B.Eng., etc.)':
        return 'Bachelor’s Degree'
    if x == 'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)':
        return 'Master’s degree'
    if x == 'Other doctoral degree' or x == 'Professional degree (JD, MD, etc.)':
        return 'Post Graduate'
    return 'Less then a Bachelor'


In [22]:
df['EdLevel'] = df['EdLevel'].apply(clean_education)

In [23]:
df['EdLevel'].unique()

array([], dtype=object)

In [24]:
from sklearn.preprocessing import LabelEncoder

In [25]:
le_education = LabelEncoder()
df['EdLevel'] = le_education.fit_transform(df['EdLevel'])

In [26]:
df['EdLevel'].unique()

array([], dtype=float64)

In [27]:
le_country = LabelEncoder()
df['Country'] = le_country.fit_transform(df['Country'])
df['Country'].unique()

array([], dtype=int64)

In [28]:
x = df.drop('Salary',axis=1)
y = df['Salary']

In [29]:
from sklearn.linear_model import LinearRegression
linear_regres = LinearRegression()
linear_regres.fit(x,y.values)

ValueError: Found array with 0 sample(s) (shape=(0, 3)) while a minimum of 1 is required.

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

max_depth = [None,2,4,6,8,10,12]
parameter = {"max_depth" :max_depth}

regressor = DecisionTreeRegressor(random_state = 0)
gs = GridSearchCV(regressor,parameter,scoring='neg_mean_squared_error')
gs.fit(x,y)

ValueError: Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=0.

In [None]:
regressor = gs.best_estimator_

regressor.fit(x,y.values)

In [None]:
y_pred = regressor.predict(x)

In [None]:
from sklearn.metrics import mean_squared_error
error = np.sqrt(mean_squared_error(y,y_pred))

In [None]:
print("${:,.02f}".format(error))

In [None]:
X = np.array([["United States of America", "Master’s degree",15]])
X

In [None]:
X[:,0] = le_country.transform(X[:,0])
X[:,1] = le_education.transform(X[:,1])
X = X.astype(float)
x

In [None]:
y_pred = regressor.predict(X)
y_pred

In [None]:
import pickle

In [None]:
data =  {"model" : regressor, "le_country":le_country, "le_education":le_education}
with open('saved_steps.pkl','wb') as file:
    pickle.dump(data,file)

In [None]:
pip install streamlit

In [None]:
df.Country


In [None]:
Other                                                   
United States of America                                
Germany                                                 
United Kingdom of Great Britain and Northern Ireland    
India                                                   
Canada                                                  
France                                                  
Brazil                                                  
Spain                                                   
Netherlands                                             
Australia                                               
Italy                                                   
Poland                                                  
Sweden                                                  
Russian Federation                                      
Switzerland                                             