In [12]:
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [13]:
maaslar = pd.read_csv('data/maaslar_yeni.csv')

In [14]:
df = maaslar.copy()
df.head()

Unnamed: 0,Calisan ID,unvan,UnvanSeviyesi,Kidem,Puan,maas
0,1,Cayci,1,5,70,2250
1,2,Sekreter,2,5,70,2500
2,3,Uzman Yardimcisi,3,5,70,3000
3,4,Uzman,4,5,70,4000
4,5,Proje Yoneticisi,5,5,70,5500


In [15]:
def replaces_spaces_with_underscore(df):
    if isinstance(df, pd.core.frame.DataFrame):
        return df.columns.str.strip().str.replace(" ", "_")
    elif isinstance(df, str):
        return df.strip().replace(" ", "_")     

In [16]:
def turkish_to_english_translation(text):
    if isinstance(text, str):
        translationTable = str.maketrans("ğĞıİöÖüÜşŞçÇ", "gGiIoOuUsScC")
        translated_text = text.translate(translationTable)
        return translated_text

In [17]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def custom_combiner(feature, category):
    print("")
    return str(feature) + "_" + type(category).__name__ + "_" + str(category)

def hot_encoded(df, column):
    custom_fnames_enc = OneHotEncoder(sparse=False, handle_unknown='ignore',
                                      feature_name_combiner=custom_combiner, dtype=int).fit(df[column])
    one_hot_encoded = custom_fnames_enc.transform(df[column])
    return pd.DataFrame(one_hot_encoded, columns=custom_fnames_enc.get_feature_names_out(input_features=column))

In [18]:
df.columns = replaces_spaces_with_underscore(df)
df.columns = df.columns.map(turkish_to_english_translation)
df

Unnamed: 0,Calisan_ID,unvan,UnvanSeviyesi,Kidem,Puan,maas
0,1,Cayci,1,5,70,2250
1,2,Sekreter,2,5,70,2500
2,3,Uzman Yardimcisi,3,5,70,3000
3,4,Uzman,4,5,70,4000
4,5,Proje Yoneticisi,5,5,70,5500
5,6,Sef,6,5,70,7500
6,7,Mudur,7,5,70,10000
7,8,Direktor,8,5,70,15000
8,9,C-level,9,5,70,25000
9,10,CEO,10,5,70,50000


In [21]:
df_ = df.iloc[:,2:]
X   = df_.drop(columns=['maas', 'Kidem', 'Puan'])
Y   = df_[['maas']]

In [22]:
from sklearn.linear_model  import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X, Y)

In [23]:
import statsmodels.api as sm
lin_model = sm.OLS(lin_reg.predict(X), X).fit()
lin_model.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.942
Model:,OLS,Adj. R-squared (uncentered):,0.94
Method:,Least Squares,F-statistic:,468.1
Date:,"Thu, 12 Oct 2023",Prob (F-statistic):,1.93e-19
Time:,08:40:07,Log-Likelihood:,-287.43
No. Observations:,30,AIC:,576.9
Df Residuals:,29,BIC:,578.3
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
UnvanSeviyesi,2269.9134,104.919,21.635,0.000,2055.330,2484.497

0,1,2,3
Omnibus:,5.87,Durbin-Watson:,0.6
Prob(Omnibus):,0.053,Jarque-Bera (JB):,1.873
Skew:,-0.0,Prob(JB):,0.392
Kurtosis:,1.776,Cond. No.,1.0


In [24]:
def getOLS(func, arr):
    return sm.OLS(func.predict(arr), arr).fit().summary()

In [25]:
from sklearn.preprocessing import PolynomialFeatures
poly_reg  = PolynomialFeatures(degree=2)
x_poly    = poly_reg.fit_transform(X)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(x_poly, Y)

poly_model = sm.OLS(lin_reg_2.predict(poly_reg.fit_transform(X)), X).fit()
poly_model.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.81
Model:,OLS,Adj. R-squared (uncentered):,0.803
Method:,Least Squares,F-statistic:,123.4
Date:,"Thu, 12 Oct 2023",Prob (F-statistic):,5.78e-12
Time:,08:40:08,Log-Likelihood:,-307.43
No. Observations:,30,AIC:,616.9
Df Residuals:,29,BIC:,618.3
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
UnvanSeviyesi,2269.9134,204.360,11.107,0.000,1851.949,2687.878

0,1,2,3
Omnibus:,4.609,Durbin-Watson:,0.493
Prob(Omnibus):,0.1,Jarque-Bera (JB):,4.047
Skew:,0.892,Prob(JB):,0.132
Kurtosis:,2.766,Cond. No.,1.0


In [26]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

sc1 = StandardScaler()
sc2 = StandardScaler()
x_olcekli = sc1.fit_transform(X)
y_olcekli = sc2.fit_transform(Y)

svr_reg = SVR(kernel='rbf')
svr_reg.fit(x_olcekli, y_olcekli)

svr_model = sm.OLS(svr_reg.predict(x_olcekli), x_olcekli).fit()
svr_model.summary()

  y = column_or_1d(y, warn=True)


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.77
Model:,OLS,Adj. R-squared (uncentered):,0.762
Method:,Least Squares,F-statistic:,97.19
Date:,"Thu, 12 Oct 2023",Prob (F-statistic):,9.11e-11
Time:,08:40:09,Log-Likelihood:,-1.368
No. Observations:,30,AIC:,4.736
Df Residuals:,29,BIC:,6.137
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.4636,0.047,9.858,0.000,0.367,0.560

0,1,2,3
Omnibus:,9.456,Durbin-Watson:,0.304
Prob(Omnibus):,0.009,Jarque-Bera (JB):,2.433
Skew:,0.179,Prob(JB):,0.296
Kurtosis:,1.652,Cond. No.,1.0


In [27]:
from sklearn.tree import DecisionTreeRegressor
dt_reg = DecisionTreeRegressor(random_state=0)
dt_reg.fit(X, Y)
dt_model = sm.OLS(dt_reg.predict(X), X).fit()
dt_model.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.751
Model:,OLS,Adj. R-squared (uncentered):,0.742
Method:,Least Squares,F-statistic:,87.29
Date:,"Thu, 12 Oct 2023",Prob (F-statistic):,3.01e-10
Time:,08:40:10,Log-Likelihood:,-312.62
No. Observations:,30,AIC:,627.2
Df Residuals:,29,BIC:,628.6
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
UnvanSeviyesi,2269.9134,242.950,9.343,0.000,1773.025,2766.802

0,1,2,3
Omnibus:,27.769,Durbin-Watson:,1.364
Prob(Omnibus):,0.0,Jarque-Bera (JB):,45.942
Skew:,2.292,Prob(JB):,1.06e-10
Kurtosis:,6.968,Cond. No.,1.0


In [28]:
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(random_state=0)
rf_reg.fit(X, Y)
rf_model = sm.OLS(rf_reg.predict(X), X).fit()
rf_model.summary()

  return fit_method(estimator, *args, **kwargs)


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.752
Model:,OLS,Adj. R-squared (uncentered):,0.744
Method:,Least Squares,F-statistic:,88.15
Date:,"Thu, 12 Oct 2023",Prob (F-statistic):,2.71e-10
Time:,08:40:11,Log-Likelihood:,-312.05
No. Observations:,30,AIC:,626.1
Df Residuals:,29,BIC:,627.5
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
UnvanSeviyesi,2237.7615,238.343,9.389,0.000,1750.296,2725.227

0,1,2,3
Omnibus:,27.686,Durbin-Watson:,1.353
Prob(Omnibus):,0.0,Jarque-Bera (JB):,45.657
Skew:,2.287,Prob(JB):,1.22e-10
Kurtosis:,6.95,Cond. No.,1.0


In [29]:
result = []
result.append(lin_model.rsquared)
result.append(svr_model.rsquared)
result.append(poly_model.rsquared)
result.append(dt_model.rsquared)
result.append(rf_model.rsquared)

In [30]:
result.sort(reverse=True)
result

[0.9416580080048553,
 0.8096791836800286,
 0.77018277782798,
 0.7524549850574087,
 0.7506319083486271]