In [10]:
import pandas as pd
import numpy as np

In [20]:
def map_yoe_range(x):
        if x < 1:
            return "0 - 1 year"
        elif x < 3:
            return "1 - 3 years"
        elif x < 5:
            return "3 - 5 years"
        elif x < 10:
            return "5 - 10 years"
        return "10+ years"

class dfs_provider:
    data = pd.read_csv("../data/txl_data.csv")
    city_code = pd.read_csv('../data/provinces.csv')
    def get_job_levels(self):
        return self.data['level'].unique()
    
    def get_industry_counts(self):
        industry_counts = pd.DataFrame(self.data['mapped_industry'].str.split(', ')
                                       .explode().value_counts()).reset_index()
        industry_counts.columns = ['industry', 'count']
        return industry_counts
    
    def search_mean_salaries_by_level(self, level):
        temp_data = self.data.copy()
        temp_data['mapped_industry_ls'] = temp_data['mapped_industry'].str.split(', ')
        temp_data = temp_data.explode('mapped_industry_ls').copy()
        try:
            mean_slr_by_level = temp_data.groupby(['mapped_industry_ls', 'level']).mean()[['min_salary', 'max_salary']].reset_index()
            mean_slr_by_level['min_salary_rd'] = mean_slr_by_level['min_salary'].apply(lambda x: np.round(x / 1000000, decimals=2))
            mean_slr_by_level['max_salary_rd'] = mean_slr_by_level['max_salary'].apply(lambda x: np.round(x / 1000000, decimals=2))
            return mean_slr_by_level[mean_slr_by_level['level'] == level].reset_index(drop=True)
        except Exception as e:
            print(e)
            st.dataframe(temp_data)
    
    def get_mean_salaries(self):
        temp_data = self.data.copy()
        temp_data['mapped_industry_ls'] = temp_data['mapped_industry'].str.split(', ')
        temp_data = temp_data.explode('mapped_industry_ls').copy()[['mapped_industry_ls', 'min_salary', 'max_salary']]
        display(temp_data)
        mean_salaries = temp_data.groupby('mapped_industry_ls').mean()[['min_salary', 'max_salary']].reset_index()
        mean_salaries['min_salary_rd'] = mean_salaries['min_salary'].apply(lambda x: np.round(x / 1000000, decimals=2))
        mean_salaries['max_salary_rd'] = mean_salaries['max_salary'].apply(lambda x: np.round(x / 1000000, decimals=2))
        return mean_salaries
    
    def search_mean_salaries_by_industry(self, industry):
        full_df = self.get_mean_salaries()
        return full_df[full_df['mapped_industry_ls'] == industry]
    
    def search_mean_salaries_by_industry_level(self, industry, level):
        full_df = self.search_mean_salaries_by_level(level)
        return full_df[full_df['mapped_industry_ls'] == industry]
    
    def get_industries(self):
        temp_data = self.data.copy()
        temp_data['mapped_industry_ls'] = temp_data['mapped_industry'].str.split(', ')
        temp_data = temp_data.explode('mapped_industry_ls').copy()
        return temp_data['mapped_industry_ls'].unique()
    
    def get_mean_salaries_by_industry_group_by_yoe(self, industry):
        temp_data = self.data.copy()
        temp_data['mapped_industry_ls'] = temp_data['mapped_industry'].str.split(', ')
        temp_data = temp_data.explode('mapped_industry_ls').copy()
        mean_slr_yoe = temp_data.groupby(['mapped_industry_ls','min_year']).mean()[['min_salary', 'max_salary']]
        mean_slr_yoe['min_salary_rd'] = mean_slr_yoe['min_salary'].apply(lambda x: np.round(x / 1000000, decimals=2))
        mean_slr_yoe['max_salary_rd'] = mean_slr_yoe['max_salary'].apply(lambda x: np.round(x / 1000000, decimals=2))
        mean_slr_yoe_of_ind = mean_slr_yoe.loc[industry].reset_index()
        mean_slr_yoe_of_ind.min_year = mean_slr_yoe_of_ind.min_year.apply(map_yoe_range)
        df = mean_slr_yoe_of_ind.groupby('min_year').mean()[['min_salary_rd', 'max_salary_rd']].reset_index()
        return df
    
    def get_mean_min_years_for_each_level(self, industry):
        temp_data = self.data.copy()
        temp_data['mapped_industry_ls'] = temp_data['mapped_industry'].str.split(', ')
        temp_data = temp_data.explode('mapped_industry_ls').copy()
        mean_min_year_by_level = temp_data[temp_data['mapped_industry_ls'] == industry].groupby('level').mean()[['min_year']].reset_index()
        mean_min_year_by_level = mean_min_year_by_level[mean_min_year_by_level['level'].apply(lambda x: 'Quản lý' in x)].copy()
        mean_min_year_by_level['min_year_rd'] = mean_min_year_by_level.min_year.apply(lambda x: np.round(x, 2))
        return mean_min_year_by_level.sort_values('min_year_rd')
    
    def get_mean_salary_by_province(self):
        map = pd.DataFrame()
        map['industry_list'] = [x.split(', ') for x in self.data['mapped_industry']]
        map['average'] = [(x+y)/2 for x, y in zip(self.data['min_salary'],self.data['max_salary'])]
        map['city'] = [x.split(', ') for x in self.data['city']]
        map = map.explode('industry_list').explode('city').groupby(['industry_list','city']).mean().reset_index()
        map['average'] = [round(x,-5) for x in map['average']]

        lst = list(set(map['industry_list']))
        output = self.city_code.copy()
        for x in lst:
            filter = map.loc[map['industry_list'] == x]
            filter.drop(columns = ['industry_list'], inplace=True)
            filter.columns = ['Provinces', x]
            
            output = output.merge(filter, on='Provinces', how='outer', indicator=True)\
                .query('_merge!="right_only"').drop(columns=['_merge'])
        output = output.fillna(-1)

        return output

In [21]:
dfs = dfs_provider()
dfs.get_mean_salaries()

Unnamed: 0,mapped_industry_ls,min_salary,max_salary
0,Tư vấn,10000000,30000000
0,Ngân hàng,10000000,30000000
0,Chăm sóc khách hàng,10000000,30000000
1,IT Phần mềm,14759739,27302264
2,Cơ khí,7000000,10000000
...,...,...,...
40087,Môi trường - Xử lý chất thải,16422000,25806000
40088,Sản phẩm công nghiệp,11730000,18768000
40088,Sản xuất - Lắp ráp - Chế biến,11730000,18768000
40089,Bán buôn - Bán lẻ - Quản lý cửa hàng,16422000,21114000


Unnamed: 0,mapped_industry_ls,min_salary,max_salary,min_salary_rd,max_salary_rd
0,AI - Data Science - Business Intelligence,1.382915e+07,2.584214e+07,13.83,25.84
1,An toàn lao động,1.332733e+07,2.212362e+07,13.33,22.12
2,Biên phiên dịch,1.411233e+07,2.225816e+07,14.11,22.26
3,Bác sĩ,2.662725e+07,5.825997e+07,26.63,58.26
4,Bán buôn - Bán lẻ - Quản lý cửa hàng,1.777042e+07,3.081144e+07,17.77,30.81
...,...,...,...,...,...
67,Xuất Nhập Khẩu,1.278365e+07,2.182585e+07,12.78,21.83
68,Xây dựng,1.491260e+07,2.522589e+07,14.91,25.23
69,Y tế - Chăm sóc sức khỏe,1.338179e+07,2.403075e+07,13.38,24.03
70,Điện - Điện tử,1.287428e+07,2.232179e+07,12.87,22.32
