In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

import pylab as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import statsmodels.api as sm
from statsmodels.formula.api import ols

from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression 

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2

In [2]:
def _simplify_job_title(title):
    """Collapse a raw job title into one of six coarse categories.

    Rules are checked in order on the lower-cased title and the first
    matching substring wins; titles matching no rule become 'Other'.
    """
    rules = (
        ('scien', 'Data Scientist'),
        ('engine', 'Data Engineer'),
        ('analy', 'Data Analyst'),
        ('archi', 'Data Architect'),
        ('machi', 'Machine Learning'),
    )
    lowered = title.lower()
    for fragment, label in rules:
        if fragment in lowered:
            return label
    return 'Other'


def analisis(cota = 0.25, paises = 10):
    """Fit and score a baseline salary model built from 'salaries_data.csv'.

    Pipeline: drop salary outliers with a quantile fence, encode the training
    and 'test.csv' features together (so both share the same dummy columns),
    fit a LogisticRegression on an 80/20 split of the training rows, then
    print the hold-out accuracy and RMSE.

    Parameters
    ----------
    cota : float
        Lower quantile of the outlier fence; the upper quantile is 1 - cota.
        Rows outside [q_low - 1.5 * range, q_high + 1.5 * range] are dropped
        (with cota=0.25 this is the usual 1.5 * IQR rule).
    paises : int
        Number of most frequent company_location values kept as their own
        category; every other location is replaced by 'OTHER'.

    Returns
    -------
    float
        Root mean squared error of the predictions on the hold-out split.
    """
    salaries_data = pd.read_csv('salaries_data.csv')
    test = pd.read_csv('test.csv')

    # Quantile-based outlier fence on the target.
    salary = salaries_data['salary_in_usd']
    q_low, q_high = salary.quantile(cota), salary.quantile(1 - cota)
    cutoff = (q_high - q_low) * 1.5
    lower, upper = q_low - cutoff, q_high + cutoff
    is_outlier = (salary < lower) | (salary > upper)
    salaries_data = salaries_data[~is_outlier]

    X_salaries_data = salaries_data.drop('salary_in_usd', axis=1)
    y_salaries_data = salaries_data['salary_in_usd']
    # Remember where the training rows end so the stacked frame can be split
    # back correctly regardless of how many outliers were removed.
    n_train = len(X_salaries_data)

    data = pd.concat([X_salaries_data, test])
    data = data.drop(['employee_residence'], axis = 1)

    # Treat the year as a category so get_dummies one-hot encodes it.
    data['work_year'] = data['work_year'].map(str)
    data['job_title'] = data['job_title'].map(_simplify_job_title)
    # Explicit assignment instead of a chained inplace replace, which does not
    # modify the frame under pandas Copy-on-Write.
    data['remote_ratio'] = data['remote_ratio'].replace({0: 'CERO', 50: 'HALF', 100: 'FULL'})

    top_paises = data['company_location'].value_counts().index[:paises]
    data['company_location'] = data['company_location'].where(
        data['company_location'].isin(top_paises), 'OTHER'
    )

    data = pd.get_dummies(data)

    X_salaries_data = data.iloc[:n_train]
    # Encoded features of test.csv; not consumed by the evaluation below.
    test = data.iloc[n_train:]

    X_train, X_test, y_train, y_test = tts(
        X_salaries_data, y_salaries_data,
        train_size=0.8, test_size=0.2, random_state=22,
    )

    # NOTE(review): the target is a continuous salary, so this classifier
    # treats every distinct salary as its own class — confirm this is intended
    # rather than a regression model.
    logreg = LogisticRegression(max_iter=2000)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)

    accuracy = (y_pred == y_test).mean() * 100
    # sqrt of the MSE instead of squared=False, which newer scikit-learn
    # releases no longer accept.
    rmse = np.sqrt(mse(y_test, y_pred))

    print('cota:', cota, 'paises:', paises)
    print('Acierto: ', round(accuracy, 2), '%')
    # NOTE(review): the label says '€' but the target column is salary_in_usd.
    print('RMSE:', rmse, '€ [0, inf)')

    return rmse
    

In [3]:
# Baseline run with the function's default settings spelled out.
analisis(cota=0.25, paises=10)

Acierto:  4.04 %
RMSE: 46544.54792264323 € [0, inf)
MSE: 2166394941.323232 € [0, inf)
RMSLE: 0.6678776568610786 € [0, inf)
MAE: 38077.52525252525 € [0, inf)
R2: 0.30860338080578964 € (-inf, 1]
