In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as seabornInstance
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Load the raw survey responses and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='survey_results_public.csv')
df.head(n=5)

In [0]:
# Narrow the survey down to the three columns this analysis works with.
f_cols = [
    'YearsCodingProf',
    'ConvertedSalary',
    'SalaryType',
]
f_df = df.loc[:, f_cols]

In [0]:
# Rows with no YearsCodingProf answer are discarded.
has_years = f_df['YearsCodingProf'].notna()
f_df = f_df[has_years]
f_df.head(n=12)

Unnamed: 0,YearsCodingProf,ConvertedSalary,SalaryType
0,3-5 years,,Monthly
1,18-20 years,70841.0,Yearly
2,6-8 years,,
3,12-14 years,,
4,0-2 years,21426.0,Yearly
5,3-5 years,41671.0,
6,0-2 years,120000.0,Yearly
7,3-5 years,,
8,21-23 years,250000.0,Yearly
10,0-2 years,,


In [0]:
# Reduce a YearsCodingProf answer to its bare "start-end" bracket.
# The open-ended "30 or more years" answer is treated as exactly 30 years
# (according to sir Roy), so it becomes the degenerate bracket '30-30'.
def checkyears(x):
    """Return the 'start-end' year bracket for one survey row.

    x is a row (or any mapping) with a 'YearsCodingProf' entry such as
    '3-5 years'; everything after the first whitespace is discarded.
    """
    years_text = x['YearsCodingProf']
    if years_text != '30 or more years':
        return years_text.split()[0]
    return '30-30'
# Derive the bare year bracket for every respondent, row by row.
year_brackets = f_df.apply(checkyears, axis=1)
f_df['YProfExp'] = year_brackets

# Sanity check: list the distinct brackets that were produced.
f_df['YProfExp'].unique()

array(['3-5', '18-20', '0-2', '21-23', '12-14', '24-26', '6-8', '9-11',
       '15-17', '27-29', '30-30'], dtype=object)

In [0]:
# Turn each "start-end" bracket into numeric bounds and take their midpoint.
# Vectorised string and column operations are used instead of row-by-row
# apply calls; the resulting columns are the same.
bracket_bounds = f_df['YProfExp'].str.split('-', expand=True)
f_df['YProfExpStart'] = pd.to_numeric(bracket_bounds[0])
f_df['YProfExpEnd'] = pd.to_numeric(bracket_bounds[1])
# Midpoint of the bracket, e.g. '3-5' -> 4.0 and '30-30' -> 30.0.
f_df['AverageYears'] = (f_df['YProfExpStart'] + f_df['YProfExpEnd']) / 2
f_df.head()

Unnamed: 0,YearsCodingProf,ConvertedSalary,SalaryType,YProfExp,YProfExpStart,YProfExpEnd,AverageYears
0,3-5 years,,Monthly,3-5,3,5,4.0
1,18-20 years,70841.0,Yearly,18-20,18,20,19.0
2,6-8 years,,,6-8,6,8,7.0
3,12-14 years,,,12-14,12,14,13.0
4,0-2 years,21426.0,Yearly,0-2,0,2,1.0


In [0]:
# Impute missing yearly salaries (ConvertedSalary) from respondents who have
# the same experience midpoint (AverageYears).
def fillSalary(x, frame=None):
    """Return the row's salary, imputing it when it is missing.

    Parameters
    ----------
    x : row (pandas Series or mapping) with 'ConvertedSalary' and
        'AverageYears' entries.
    frame : DataFrame the group mean is computed from. Defaults to the
        notebook-level ``f_df``.

    Returns
    -------
    The row's own ConvertedSalary when it is present; otherwise the mean
    ConvertedSalary of all rows in ``frame`` with the same AverageYears
    (NaN when none of those rows has a salary).
    """
    if frame is None:
        frame = f_df
    if not pd.isna(x['ConvertedSalary']):
        return x['ConvertedSalary']
    # Rows that share this row's experience midpoint.
    same_experience = frame['AverageYears'] == x['AverageYears']
    # Average the salary column only: .mean() on the whole filtered frame
    # yields one value per column rather than a single salary.
    # Note: every call with a missing salary rescans ``frame``.
    return frame.loc[same_experience, 'ConvertedSalary'].mean()
# Run the imputation over every row, then store and preview the result.
filled_salary = f_df.apply(fillSalary, axis=1)
f_df['ConvertedSalary'] = filled_salary
f_df.head(n=15)

Unnamed: 0,YearsCodingProf,ConvertedSalary,SalaryType,YProfExp,YProfExpStart,YProfExpEnd,AverageYears
0,3-5 years,89099.571429,Monthly,3-5,3,5,4.0
1,18-20 years,70841.0,Yearly,18-20,18,20,19.0
2,6-8 years,95494.0,,6-8,6,8,7.0
3,12-14 years,79810.5,,12-14,12,14,13.0
4,0-2 years,21426.0,Yearly,0-2,0,2,1.0
5,3-5 years,41671.0,,3-5,3,5,4.0
6,0-2 years,120000.0,Yearly,0-2,0,2,1.0
7,3-5 years,89099.571429,,3-5,3,5,4.0
8,21-23 years,250000.0,Yearly,21-23,21,23,22.0
10,0-2 years,88290.389313,,0-2,0,2,1.0


In [0]:
# Keep only the respondents who stated a SalaryType.
has_salary_type = f_df['SalaryType'].notna()
f_df = f_df.loc[has_salary_type]
f_df.head(n=12)

Unnamed: 0,YearsCodingProf,ConvertedSalary,SalaryType,YProfExp,YProfExpStart,YProfExpEnd,AverageYears
0,3-5 years,89099.571429,Monthly,3-5,3,5,4.0
1,18-20 years,70841.0,Yearly,18-20,18,20,19.0
4,0-2 years,21426.0,Yearly,0-2,0,2,1.0
6,0-2 years,120000.0,Yearly,0-2,0,2,1.0
8,21-23 years,250000.0,Yearly,21-23,21,23,22.0
14,0-2 years,0.0,Monthly,0-2,0,2,1.0
17,0-2 years,47904.0,Monthly,0-2,0,2,1.0
18,3-5 years,89099.571429,Monthly,3-5,3,5,4.0
20,12-14 years,95968.0,Yearly,12-14,12,14,13.0
21,24-26 years,85561.285714,Monthly,24-26,24,26,25.0


In [0]:
# ConvertedSalary is the yearly salary converted to US dollars (according to
# sir Roy): convert it to pesos, then divide by 12 to get a monthly figure.
USD_TO_PHP = 52.3767   # dollar-to-peso rate used by this notebook
MONTHS_PER_YEAR = 12
# The salary column in f_df is 'ConvertedSalary'; no 'MonthlySalary' column
# is created anywhere above.
# astype('int64') truncates toward zero like int() did, and likewise raises
# if a salary is still NaN.
f_df['ConvertedMS'] = (
    (f_df['ConvertedSalary'] * USD_TO_PHP) / MONTHS_PER_YEAR
).astype('int64')
f_df.head(15)

In [0]:

# Quick look at the cleaned data: dimensions first, then summary statistics.
for summary in (f_df.shape, f_df.describe()):
    print(summary)

# Feature matrix (single column: experience midpoint) and target vector
# (ConvertedMS).
X = f_df.loc[:, ['AverageYears']].values
y = f_df.loc[:, 'ConvertedMS'].values

# Distribution of ConvertedMS on a 15x10 inch figure.
salary_fig = plt.figure(figsize=(15, 10))
salary_fig.tight_layout()
seabornInstance.distplot(a=f_df['ConvertedMS'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Fit ordinary least squares on the training rows, then score the held-out rows.
regressor = LinearRegression().fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Side-by-side table of the held-out salaries and the model's predictions.
# It gets its own name so the raw survey frame `df` loaded in the first cell
# is not overwritten (re-running earlier cells keeps working).
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Chart only the first 25 test rows.
comparison_head = comparison.head(25)

comparison_head.plot(kind='bar', figsize=(10, 8))
plt.grid(which='major', linestyle='-', linewidth=0.5, color='green')
plt.grid(which='minor', linestyle=':', linewidth=0.5, color='black')
plt.show()

# Training points in red, with the model's predictions for them drawn in blue.
fitted_train = regressor.predict(X_train)
plt.scatter(X_train, y_train, color='red')
plt.plot(X_train, fitted_train, color='blue')
plt.title('Average Years of Profession vs Monthly Salary (in Php)')
plt.xlabel('Average Years of Profession')
plt.ylabel('Monthly Salary')
plt.show()

