In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from preprocess import date_formatter, month_fixer, year_fixer, renamer, string_fixer, place_calculator, extract_numbers
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from fuzzywuzzy import process
import pickle 

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)



### PREPROCESS FINAL

#### OUTSOURCE DATA

In [2]:
yok_atlas = pd.read_excel("data/yok_atlas_programlar.xls")

new_cols = []

for col in yok_atlas.columns:
    if "Unnamed" in col:
        new_col = yok_atlas.loc[0,col]
    else:
        new_col = col + " " + yok_atlas.loc[0,col]
    new_cols.append(new_col)

yok_atlas.columns = new_cols
yok_atlas = yok_atlas.drop(index=[0], axis=0).reset_index(drop=True)

new_cols = []

for col in yok_atlas.columns:
    if pd.isna(yok_atlas.loc[0,col]):
        new_col = col
    else:
        new_col = col + " " + yok_atlas.loc[0,col]
    new_cols.append(new_col)

yok_atlas.columns = new_cols
yok_atlas = yok_atlas.drop(index=[0], axis=0).reset_index(drop=True)

university_df = pd.DataFrame(columns=["universite_adi","akreditasyon","fakulte_adi","bolum_adi",
                                              "ogr_sure","puan_turu",
                                              "genel_kontr","ok_bir_kont","basari_sirasi","yks_puan_min",
                                              "pdr_count","ddr_count","drogr_count","kpss_1","kpss_2"])

cols_to_scrape = ["PROGRAM ADI (2)","ÖĞR. SÜRE  (3)","PUAN TÜRÜ  (4)","GENEL KONT.  (5)",
                "OK.BİR KONT. (6)","2023-YKS BAŞARI SIRASI (12)","2023-YKS.1 EN KÜÇÜK PUANI (13)",
                "P.DR. SAYI (14)","D.DR. SAYI (15)","DR.ÖĞR. ÜYE SAYI (16)","KPSS-1 (25)","KPSS-2 (26)"]

universite_adi = yok_atlas.loc[0,"PROGRAM ADI (2)"]
akreditasyon = yok_atlas.loc[0,"AKREDİ- TASYON (19)"]
fakulte_adi = yok_atlas.loc[1,"PROGRAM ADI (2)"]
values = [universite_adi,akreditasyon,fakulte_adi]

for row in yok_atlas.index[1:-1]:
    if pd.isna(yok_atlas.loc[row,"PROGRAM KODU (1)"]) and pd.isna(yok_atlas.loc[row+1,"PROGRAM KODU (1)"]):
        universite_adi = yok_atlas.loc[row,"PROGRAM ADI (2)"]
        akreditasyon = yok_atlas.loc[row,"AKREDİ- TASYON (19)"]
        fakulte_adi = yok_atlas.loc[row+1,"PROGRAM ADI (2)"]
        values = [universite_adi,akreditasyon,fakulte_adi]
    if pd.isna(yok_atlas.loc[row,"PROGRAM KODU (1)"]):
        if yok_atlas.loc[row-1,"PROGRAM ADI (2)"] == universite_adi:
            pass
        else:
            fakulte_adi = yok_atlas.loc[row,"PROGRAM ADI (2)"]
            values = [universite_adi,akreditasyon,fakulte_adi]
    else:
        scraped_values = yok_atlas.loc[row,cols_to_scrape].values.tolist()
        passed_values = values + scraped_values
        university_df.loc[university_df.shape[0],:] = passed_values

university_df.loc[university_df["basari_sirasi"]=="...","basari_sirasi"] = np.nan
university_df = university_df.drop(index=university_df.loc[university_df["basari_sirasi"].isnull(),:].index)
university_df = university_df.reset_index(drop=True)
university_df["akreditasyon"] = university_df["akreditasyon"].fillna("diger")
university_df["akreditasyon"] = university_df["akreditasyon"].apply(string_fixer)
university_df["genel_kontr"] = university_df["genel_kontr"].astype(int)
university_df["ok_bir_kont"] = university_df["ok_bir_kont"].fillna(0).astype(int)
university_df["basari_sirasi"] = university_df["basari_sirasi"].astype(int)
university_df["yks_puan_min"] = university_df["yks_puan_min"].astype(float)
university_df["pdr_count"] = university_df["pdr_count"].fillna(0).astype(int)
university_df["ddr_count"] = university_df["ddr_count"].fillna(0).astype(int)
university_df["drogr_count"] = university_df["drogr_count"].fillna(0).astype(int)
university_df["kpss_1"] = university_df["kpss_1"].astype(float)
university_df["kpss_2"] = university_df["kpss_2"].astype(float)

def university_df_name_fixer(df, col):
    df[col] = df[col].apply(string_fixer)
    df[col] = df[col].apply(lambda x: x.split("(")[0][:-1] if "(" in x else x)
    for _ in range(3):
        df[col] = df[col].apply(lambda x: x[1:] if " " == x[0] else x)
    return df

fix_cols = ["universite_adi","fakulte_adi","bolum_adi"]
for fix_col in fix_cols:
    university_df = university_df_name_fixer(university_df, fix_col)

university_df = university_df.loc[university_df[["universite_adi","bolum_adi"]].drop_duplicates(keep="first").index,:]
university_df = university_df.reset_index(drop=True)

#### PREPROCESS ON GIVEN DATA

In [3]:
train = pd.read_csv("data/train.csv")
train = renamer(train)

test = pd.read_csv("data/test_x.csv")
test = renamer(test)

  train = pd.read_csv("data/train.csv")


In [4]:
def fuzzy_uni(x, list_item, threshold=80):
    new_x, th_new = process.extractOne(x, list_item)
    if th_new >= threshold:
        return new_x
    return "diger"

list_item = university_df["universite_adi"].unique()

item_col = "universite_adi"

train[item_col] = train[item_col].apply(string_fixer)
train[item_col].fillna("diger", inplace=True)
manuel_fix = {"istanbul sehir universitesi":"kapatilan universiteler",
            "fatih universitesi":"kapatilan universiteler",
            "izmir universitesi":"kapatilan universiteler",
            "gediz universitesi":"kapatilan universiteler",
            "sifa universitesi":"kapatilan universiteler",
            "mevlana universitesi":"kapatilan universiteler",
            "meliksah universitesi":"kapatilan universiteler",
            "canik basari universitesi":"kapatilan universiteler",
            "istanbul kemerburgaz universitesi":"altinbas universitesi",
            "zirve universitesi":"gaziantep islam bilim ve teknoloji universitesi",
            "odtu":"orta dogu teknik universitesi",
            "uluslararasi antalya universitesi":"antalya bilim universitesi",
            "istanbul kavram meslek":"istanbul kavram meslek yuksekokulu",
            'istanbul bilim universitesi':"demiroglu bilim universitesi",
            'ipek universitesi':"kapatilan universiteler",
            'gebze yuksek teknoloji enstitusu':"gebze teknik universitesi",
            'faruk sarac tasarim meslek yuksekokulu (istanbul)':'faruk sarac tasarim meslek yuksekokulu',
            'alanya hamdullah emin pasa universitesi':"alanya universitesi",
            'faruk sarac tasarim meslek':'faruk sarac tasarim meslek yuksekokulu',
            'icisleri bakanligi ve milli savunma bakanligi adina saglik bilimleri universitesinde egitim alacaklar':"saglik bilimleri universitesi",
            "beykoz lojistik meslek":"beykoz lojistik meslek yuksekokulu"
            }

train.loc[(train[item_col]!="diger") & 
          (train[item_col].isin(list_item)==False),
          item_col] = train.loc[(train[item_col]!="diger") & 
          (train[item_col].isin(list_item)==False),
          item_col].apply(lambda x: manuel_fix[x] if x in manuel_fix.keys() else x)

train.loc[(train[item_col].isin(["diger","kapatilan universiteler"])==False) & 
          (train[item_col].str.contains("yuksekokulu")==False) & 
          (train[item_col].isin(list_item)==False),
          item_col
          ] = train.loc[(train[item_col]!="diger") & 
          (train[item_col].str.contains("yuksekokulu")==False) & 
          (train[item_col].isin(list_item)==False),
          item_col].apply(lambda x: fuzzy_uni(x, list_item))

test[item_col] = test[item_col].apply(string_fixer)
test[item_col].fillna("diger", inplace=True)

test.loc[(test[item_col]!="diger") & 
          (test[item_col].isin(list_item)==False),
          item_col] = test.loc[(test[item_col]!="diger") & 
          (test[item_col].isin(list_item)==False),
          item_col].apply(lambda x: manuel_fix[x] if x in manuel_fix.keys() else x)

test.loc[(test[item_col].isin(["diger","kapatilan universiteler"])==False) & 
          (test[item_col].str.contains("yuksekokulu")==False) & 
          (test[item_col].isin(list_item)==False),
          item_col
          ] = test.loc[(test[item_col]!="diger") & 
          (test[item_col].str.contains("yuksekokulu")==False) & 
          (test[item_col].isin(list_item)==False),
          item_col].apply(lambda x: fuzzy_uni(x, list_item))

train["universite_teknik"] = train["universite_adi"].apply(lambda x: 1 if "teknik" in x else 0)
train["universite_yuksekokulu"] = train["universite_adi"].apply(lambda x: 1 if "yuksekokulu" in x else 0)
test["universite_teknik"] = test["universite_adi"].apply(lambda x: 1 if "teknik" in x else 0)
test["universite_yuksekokulu"] = test["universite_adi"].apply(lambda x: 1 if "yuksekokulu" in x else 0)

In [5]:
item_dict_fixed = {"genetics and bioengineering":"genetik ve biyomuhendislik",
    "computer science engineering":"bilgisayar muhendisligi",
    "computer engineering":"bilgisayar muhendisligi",
    "master of business administration":"isletme",
    "management information systems":'yonetim bilisim sistemleri',
    "political science and international relations":'siyaset bilimi ve uluslararasi iliskiler',
    "biemf - bachelor of international economics, management and finance":"ekonomi ve finans",
                    "construction management-master":'insaat muhendisligi',
                    "laboratuvar teknoloji":"diger",
                    'sahne ve gosteri sanatlari yonetimi':"diger",
                   'bilgisayar teknolojileri':'bilgisayar teknolojisi ve bilisim sistemleri',
                   'spor yonetimi':'spor yoneticiligi',
                   'fizik tedavi ve rehabilitasyon':'fizyoterapi ve rehabilitasyon',
                   'insaat teknolojileri':"diger",
                   'ulualararasi lojistik ve tasimacilikingilizce':'uluslararasi ticaret ve lojistik',
                   'dis ticaretingilizce':'uluslararasi ticaret',
                   'muhasebe bilgi sistemleri':'muhasebe ve finans yonetimi',
                   'uluslararasi ticaretingilizce':'uluslararasi ticaret',
                   'sanat sosyal fakultesi':'sanat ve sosyal bilimler programlari',
                   'geomatik muhendisligi':'harita muhendisligi',
                   'radyo ve televizyon ingilizce':'radyo, televizyon ve sinema',
                   'management yonetim bilimleri':'yonetim bilimleri programlari',
                   'avrupa birligi iliskileri':'kuresel siyaset ve uluslararasi iliskiler',
                   'ascilik':'gastronomi ve mutfak sanatlari',
                   'tip(ingilizce)(tam burslu)':"tip",
                   'gorme engelliler ogretmenligi':'sosyal hizmet',
                   'avrupa birligi iliskileri(ingilizce)':'kuresel siyaset ve uluslararasi iliskiler',
                   'endustri muhendis':'endustri muhendisligi',
                   'soru gozukmuyor ama devlet universitesinde egitim goruyorum':"diger",
                   'sahne sanatlari':"diger",
                   'yok':"diger",
                   'yakin dogu universitesi tip fakultesi':"tip",
                   'biyomedikal cihaz teknolojisi':'biyosistem muhendisligi',
                   'makine egitimi':'makine muhendisligi',
                   'tutun teknoloji muhendisligi':"diger",
                   'muzik ogretmenligi':'sinif ogretmenligi',
                   'ing dili ve edebiyati':'ingiliz dili ve edebiyati',
                   'iktisadi idari bilimler':'iktisadi ve idari bilimler programlari',
                   'genetik muh':'genetik ve biyomuhendislik',
                   'ibrani dili ve edebiyati':'ibrani dili ve kulturu',
                   'ingiliz dili egitimi':'ingiliz dili ve edebiyati',
                   'yasli bakim':"diger",
                   'to':"diger",
                   "muzik":"diger",
                   "yuzde sifir":"diger",
                   "computer education and educational technologies":'bilgisayar ve ogretim teknolojileri ogretmenligi',}

In [6]:
with open('data/bolum_combinations_new_dict.pkl', 'rb') as f:
    new_item_dict = pickle.load(f)

def fuzzy_bolum(x, list_item, threshold=80):

    new_x, th_new = process.extractOne(x, list_item)
    if th_new >= threshold:
        return new_x
    return x

list_item = university_df["bolum_adi"].unique()

item_col = "bolum"

train[item_col] = train[item_col].apply(string_fixer)
train[item_col].fillna("diger", inplace=True)
train["bolum_ingilizce"] = train[item_col].apply(lambda x: 1 if "ingiliz" in x else 0)
test[item_col] = test[item_col].apply(string_fixer)
test[item_col].fillna("diger", inplace=True)
test["bolum_ingilizce"] = test[item_col].apply(lambda x: 1 if "ingiliz" in x else 0)
print("here")
train.loc[(train[item_col]!="diger") & 
          (train[item_col].isin(list_item)==False),
          item_col] = train.loc[(train[item_col]!="diger") & 
          (train[item_col].isin(list_item)==False),
          item_col].apply(lambda x: item_dict_fixed[x] if x in item_dict_fixed.keys() else x)
test.loc[(test[item_col]!="diger") & 
          (test[item_col].isin(list_item)==False),
          item_col] = test.loc[(test[item_col]!="diger") & 
          (test[item_col].isin(list_item)==False),
          item_col].apply(lambda x: item_dict_fixed[x] if x in item_dict_fixed.keys() else x)

print("here2")
train.loc[(train[item_col]!="diger") & 
          (train[item_col].isin(list_item)==False),
          item_col] = train.loc[(train[item_col]!="diger") & 
          (train[item_col].isin(list_item)==False),
          item_col].apply(lambda x: new_item_dict[x] if x in new_item_dict.keys() else x)
test.loc[(test[item_col]!="diger") & 
          (test[item_col].isin(list_item)==False),
          item_col] = test.loc[(test[item_col]!="diger") & 
          (test[item_col].isin(list_item)==False),
          item_col].apply(lambda x: new_item_dict[x] if x in new_item_dict.keys() else x)

here
here2


In [7]:
## takes too long
#train.loc[(train[item_col]!="diger") & 
#          (train[item_col].isin(list_item)==False),
#          item_col] = train.loc[(train[item_col]!="diger") & 
#          (train[item_col].isin(list_item)==False),
#          item_col].apply(lambda x: fuzzy_bolum(x, list_item))
train_bolum_info = pd.read_csv("data/train_bolum_adi_preprocessed.csv")
train["bolum"] = train_bolum_info["bolum"].values

test.loc[(test[item_col]!="diger") & 
          (test[item_col].isin(list_item)==False),
          item_col] = test.loc[(test[item_col]!="diger") & 
          (test[item_col].isin(list_item)==False),
          item_col].apply(lambda x: fuzzy_bolum(x, list_item))
print("here4")

  train_bolum_info = pd.read_csv("data/train_bolum_adi_preprocessed.csv")


here4


In [8]:
def fuzzy_sehir(x, list_sehir, threshold=80):

    if x == "------" or pd.isna(x):

        return x
    
    manuel_fix = {"antakya":"hatay","izmit":"kocaeli","konak":"izmir",
                  "corlu":"tekirdag","sarikaya":"yozgat","inegol":"bursa",
                  "saray":"tekirdag","kaman":"kirsehir","aksehir":"konya",
                  "yalvac":"isparta","emirdag":"afyonkarahisar","karasu":"sakarya"}

    if x in manuel_fix.keys():

        return manuel_fix[x]

    new_x, th_new = process.extractOne(x, list_sehir)

    if th_new >= threshold:
        return new_x
    return x

ilce_file = pd.read_csv("data/il_ilce.csv")
ilce_file["il"] = ilce_file["il"].apply(string_fixer)
ilce_file["ilce"] = ilce_file["ilce"].apply(string_fixer)

for sehir_col in ["lise_sehir","ikametgah_sehri","dogum_yeri"]:
    train[sehir_col] = train[sehir_col].apply(string_fixer)
    test[sehir_col] = test[sehir_col].apply(string_fixer)

    fixed_col = sehir_col + "_fixed"
    train[fixed_col] = train[sehir_col].apply(lambda x: place_calculator(x, ilce_file)).fillna("bilinmiyor")
    test[fixed_col] = test[sehir_col].apply(lambda x: place_calculator(x, ilce_file)).fillna("bilinmiyor")

    train.loc[train[fixed_col]=="bilinmiyor",
            fixed_col] = train.loc[train[fixed_col]=="bilinmiyor",
            sehir_col].apply(lambda x: fuzzy_sehir(x, ilce_file["il"].unique()))
    test.loc[test[fixed_col]=="bilinmiyor",
            fixed_col] = test.loc[test[fixed_col]=="bilinmiyor",
            sehir_col].apply(lambda x: fuzzy_sehir(x, ilce_file["il"].unique()))

    train[sehir_col] = train[fixed_col]
    test[sehir_col] = test[fixed_col]

    train = train.drop(columns=[fixed_col])
    test = test.drop(columns=[fixed_col])

In [29]:
def preprocess(data, type="train", drop_id=True):

    # data read
    ilce_file = pd.read_csv("data/il_ilce.csv")
    ilce_file["il"] = ilce_file["il"].apply(string_fixer)
    ilce_file["ilce"] = ilce_file["ilce"].apply(string_fixer)

    # renaming data
    data = renamer(data)

    # setting lists
    string_fix_cols = ["cinsiyet",
                       "universite_turu","burs_aliyor_mu",
                       "universite_kacinci_sinif","universite_not_ortalamasi",'lise_adi',
                       'lise_turu',"lise_bolumu","lise_mezuniyet_notu","baska_bir_kurumdan_burs_aliyor_mu",
                       "burs_aldigi_baska_kurum","baska_kurumdan_aldigi_burs_miktari",
                       'anne_egitim_durumu','anne_calisma_durumu','anne_sektor',
                       'baba_egitim_durumu','baba_calisma_durumu','baba_sektor',
                       'girisimcilik_kulupleri_tarzi_bir_kulube_uye_misiniz',
                       'profesyonel_bir_spor_daliyla_mesgul_musunuz','spor_dalindaki_rolunuz_nedir',
                       "spor_dalindaki_rolunuz_nedir","aktif_olarak_bir_stk_uyesi_misiniz","hangi_stk'nin_uyesisiniz",
                       "girisimcilikle_ilgili_deneyiminiz_var_mi","girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz",
                       "ingilizce_biliyor_musunuz"]
    
    drop_cols = ["dogum_tarihi","burslu_ise_burs_yuzdesi",'daha_once_baska_bir_universiteden_mezun_olmus',
                 "lise_adi_diger","lise_bolum_diger",'stk_projesine_katildiniz_mi','ingilizce_seviyeniz',
                 "daha_onceden_mezun_olunduysa,_mezun_olunan_universite","id",'uye_oldugunuz_kulubun_ismi',
                 "new_lise_adi"]
    
    drop_index_list = []

    for col in string_fix_cols:
        data[col] = data[col].apply(string_fixer)

    # preprocess operations for train specific
    if type == "train":

        drop_index_list += data.loc[data["degerlendirme_puani"].isnull(),:].index.tolist()
        data.loc[data["kardes_sayisi"] == "Kardeş Sayısı 1 Ek Bilgi Aile Hk. Anne Vefat","kardes_sayisi"] = 1
        data["kardes_sayisi"] = data["kardes_sayisi"].apply(lambda x: 0 if pd.isna(x) else int(x))
    
    # cinsiyet
    data["cinsiyet"].fillna("belirtmek istemiyorum", inplace=True)

    # dogum tarihi
    data["dogum_tarihi"] = data["dogum_tarihi"].apply(lambda x: date_formatter(x))
    data["dogum_yil"] = data["dogum_tarihi"].apply(lambda x: x[0])
    data["dogum_ay"] = data["dogum_tarihi"].apply(lambda x: x[1])
    data["dogum_yil"] = data["dogum_yil"].apply(year_fixer)
    data.loc[(data["dogum_yil"] <= 1970) | (data["dogum_yil"] >= 2007), "dogum_yil"] = np.nan
    data["dogum_ay"] = data["dogum_ay"].apply(month_fixer).fillna(0)
    data["dogum_ay"] = data["dogum_ay"].map({0:"bilinmiyor",
                                            1:"ocak",
                                            2:"subat",
                                            3:"mart",
                                            4:"nisan",
                                            5:"mayis",
                                            6:"haziran",
                                            7:"temmuz",
                                            8:"agustos",
                                            9:"eylul",
                                            10:"ekim",
                                            11:"kasim",
                                            12:"aralik"})
    
    # universite_turu
    data["universite_turu"].fillna("devlet", inplace=True)

    # burs_aliyor_mu
    data["burs_aliyor_mu"].fillna("hayir", inplace=True)

    # universite_kacinci_sinif
    data.loc[data["universite_kacinci_sinif"].isin(["hazirlik"]), "universite_kacinci_sinif"] = 0
    for sinif in range(7):
        data.loc[data["universite_kacinci_sinif"]==str(sinif), "universite_kacinci_sinif"] = sinif
    data["universite_kacinci_sinif"].fillna(0, inplace=True)
    drop_index_list += data.loc[data["universite_kacinci_sinif"].isin(["tez","yuksek lisans","mezun"]),:].index.tolist()

    # universite_not_ortalamasi
    lower_group = ["2.50 ve alti","2.00 - 2.50","1.80 - 2.49","1.00 - 2.50","0 - 1.79",]
    data.loc[data["universite_not_ortalamasi"].isin(lower_group), "universite_not_ortalamasi"] = "2.50 - 0.00"

    lower_mid_group = ["3.00-2.50","2.50 - 3.00","2.50 - 2.99","2.50 -3.00","0 - 1.79"]
    data.loc[data["universite_not_ortalamasi"].isin(lower_mid_group),"universite_not_ortalamasi"] = "3.00 - 2.50"
    mid_group = ["3.00 - 3.49", "3.00 - 3.50", "3.50-3", "3.00 - 4.00"]

    data.loc[data["universite_not_ortalamasi"].isin(mid_group), "universite_not_ortalamasi"] = "3.50 - 3.00"

    upper_group = ["3.50 - 4.00", "4.0-3.5", "4-3.5"]
    data.loc[data["universite_not_ortalamasi"].isin(upper_group), "universite_not_ortalamasi"] = "4.00 - 3.50"
    
    hazirlik_group = ["ortalama bulunmuyor","hazirligim","not ortalamasi yok"]
    data.loc[data["universite_not_ortalamasi"].isin(hazirlik_group),"universite_not_ortalamasi"] = "ortalama_yok"

    data.loc[(data["universite_not_ortalamasi"].isnull()),"universite_not_ortalamasi"] = "ortalama_yok"

    # lise_adi
    data["lise_adi"].fillna("diger", inplace=True)
    data.loc[data["lise_adi"]=="------","lise_adi"] = "diger"
    lise_turleri = ["sosyal","fen","anadolu","acik","imam","meslek","teknik"]

    data["new_lise_adi"] = "lisesi"

    for lise in lise_turleri:

        data["new_lise_adi"] = data["lise_adi"].apply(lambda x: lise if lise in x else "") + " " + data["new_lise_adi"]
        data["new_lise_adi"] = data["new_lise_adi"].apply(lambda x: x.replace("  "," "))

    data["new_lise_adi"] = data["new_lise_adi"].apply(lambda x: x[1:] if x[0] == " " else x)
    data.loc[data["new_lise_adi"]=="lisesi","new_lise_adi"] = "diger"
    less_30_groups = data["new_lise_adi"].value_counts()[data["new_lise_adi"].value_counts()<30].keys()
    data.loc[data["new_lise_adi"].isin(less_30_groups),"new_lise_adi"] = "diger"

    data["lise_adi"] = data["new_lise_adi"].values

    # lise_turu
    devlet_liseleri = ["anadolu lisesi","diger","duz lise","devlet",
                        "meslek lisesi","fen lisesi","meslek","imam hatip lisesi"]
    data.loc[data["lise_turu"].isin(devlet_liseleri),"lise_turu"] = "devlet"
    data.loc[data["lise_turu"].isin(["ozel", "ozel lisesi", "ozel lise"]), "lise_turu"] = "ozel"
    data["lise_turu"].fillna("devlet", inplace=True)

    # lise_bolumu
    data["lise_bolumu"].fillna("sayisal", inplace=True)
    esit_agirlik_values = ["tm", "esit", "ea", "turkce", "agirlik"]
    for val in esit_agirlik_values:
        data.loc[data["lise_bolumu"].str.contains(val), "lise_bolumu"] = "esit agirlik"
    sayisal_values = ["fen", "mf", "fm", "sayisal"]
    for val in sayisal_values:
        data.loc[data["lise_bolumu"].str.contains(val), "lise_bolumu"] = "sayisal"
    sozel_values = ["sozel", "ts"]
    for val in sozel_values:
        data.loc[data["lise_bolumu"].str.contains(val), "lise_bolumu"] = "sozel"
    dil_values = ["dil", "yabanci", "ingilizce", "ydl"]
    for val in dil_values:
        data.loc[data["lise_bolumu"].str.contains(val), "lise_bolumu"] = "dil"

    # lise_mezuniyet_notu
    upper_group = ["75 - 100","84-70","100-85","4.00-3.50","3.00 - 4.00","3.50-3.00","3.50-3"]
    data.loc[data["lise_mezuniyet_notu"].isin(upper_group),"lise_mezuniyet_notu"] = "75 - 100"

    mid_group = ["50 - 75","69-55","3.00-2.50","50 - 74","2.50 ve alti","54-45","not ortalamasi yok"]
    data.loc[data["lise_mezuniyet_notu"].isin(mid_group), "lise_mezuniyet_notu"] = "50 - 74"

    lower_group = ["25 - 50", "44-0", "0 - 25", "25 - 49" "0 - 24"]
    data.loc[data["lise_mezuniyet_notu"].isin(lower_group), "lise_mezuniyet_notu"] = "25 - 49"
    data.loc[data["lise_mezuniyet_notu"] == "0 - 24", "lise_mezuniyet_notu"] = "25 - 49"

    data["lise_mezuniyet_notu"].fillna("50 - 74", inplace=True)

    # baska_bir_kurumdan_burs_aliyor_mu
    data["baska_bir_kurumdan_burs_aliyor_mu"].fillna("hayir", inplace=True)

    # burs_aldigi_baska_kurum
    kyk = ["kyk","kredi","yurt"]
    vakif = ["vakif","vakfi"]

    data["burs_aldigi_baska_kurum"].fillna("almiyor", inplace=True)

    for kyk_val in kyk:
        data.loc[data["burs_aldigi_baska_kurum"].str.contains(kyk_val),"burs_aldigi_baska_kurum"] = "kyk"

    for vakif_val in vakif:
        data.loc[data["burs_aldigi_baska_kurum"].str.contains(vakif_val),"burs_aldigi_baska_kurum"] = "vakif"

    data.loc[data["burs_aldigi_baska_kurum"]=="-","burs_aldigi_baska_kurum"] = "almiyor"
    data.loc[(data["burs_aldigi_baska_kurum"].isin(["kyk","vakif","almiyor"])==False),"burs_aldigi_baska_kurum"] = "diger"

    # baska_kurumdan_aldigi_burs_miktari
    data['baska_kurumdan_aldigi_burs_miktari'] = data['baska_kurumdan_aldigi_burs_miktari'].apply(extract_numbers)

    # anne_egitim_durumu
    data.loc[data["anne_egitim_durumu"].isin(["egitim yok", "egitimi yok"]),"anne_egitim_durumu"] = "egitim_yok"
    data.loc[data["anne_egitim_durumu"].isin(["ilkokul mezunu", "ilkokul"]),"anne_egitim_durumu"] = "ilkokul"
    data.loc[data["anne_egitim_durumu"].isin(["ortaokul mezunu", "ortaokul"]),"anne_egitim_durumu"] = "ortaokul"
    data.loc[data["anne_egitim_durumu"].isin(["lise", "lise mezunu"]), "anne_egitim_durumu"] = "lise"
    data.loc[data["anne_egitim_durumu"].isin(["universite mezunu", "universite"]),"anne_egitim_durumu"] = "universite"
    data.loc[data["anne_egitim_durumu"].isin(["yuksek lisans","doktora","yuksek lisans / doktora","yuksek lisans / doktara"]),"anne_egitim_durumu"] = "yuksek_egitim"
    data["anne_egitim_durumu"].fillna("bilinmiyor", inplace=True)
    data.loc[data["anne_egitim_durumu"] == "bilinmiyor","anne_egitim_durumu"] = "ilkokul"

    # anne_calisma_durumu
    data.loc[data["anne_calisma_durumu"].isin(["emekli"]), "anne_calisma_durumu"] = "evet"
    data["anne_calisma_durumu"].fillna("hayir", inplace=True)

    # anne_sektor
    data.loc[data["anne_sektor"].isin(["0", "-"]), "anne_sektor"] = "calismiyor"
    data["anne_sektor"].fillna("calismiyor", inplace=True)

    # baba_egitim_durumu
    data.loc[data["baba_egitim_durumu"].isin(["egitim yok", "egitimi yok"]),"baba_egitim_durumu"] = "egitim_yok"
    data.loc[data["baba_egitim_durumu"].isin(["ilkokul mezunu", "ilkokul"]),"baba_egitim_durumu"] = "ilkokul"
    data.loc[data["baba_egitim_durumu"].isin(["ortaokul mezunu", "ortaokul"]),"baba_egitim_durumu",] = "ortaokul"
    data.loc[data["baba_egitim_durumu"].isin(["lise", "lise mezunu"]), "baba_egitim_durumu"] = "lise"
    data.loc[data["baba_egitim_durumu"].isin(["universite mezunu", "universite"]),"baba_egitim_durumu"] = "universite"
    data.loc[data["baba_egitim_durumu"].isin(["yuksek lisans","doktora","yuksek lisans / doktora","yuksek lisans / doktara"]),"baba_egitim_durumu"] = "yuksek_egitim"
    data["baba_egitim_durumu"].fillna("bilinmiyor", inplace=True)
    data.loc[data["baba_egitim_durumu"] == "0", "baba_egitim_durumu"] = "bilinmiyor"
    data.loc[data["baba_egitim_durumu"] == "bilinmiyor","baba_egitim_durumu"] = "ilkokul"

    # baba_calisma_durumu
    data.loc[data["baba_calisma_durumu"].isin(["emekli"]), "baba_calisma_durumu"] = "evet"
    data["baba_calisma_durumu"].fillna("hayir", inplace=True)

    # baba_sektor
    data.loc[data["baba_sektor"].isin(["0", "-"]), "baba_sektor"] = "calismiyor"
    data["baba_sektor"].fillna("calismiyor", inplace=True)

    # kardes_sayisi
    data.loc[data["kardes_sayisi"] >= 4, "kardes_sayisi"] = 4

    # girisimcilik_kulupleri_tarzi_bir_kulube_uye_misiniz
    data["girisimcilik_kulupleri_tarzi_bir_kulube_uye_misiniz"].fillna("hayir", inplace=True)

    # profesyonel_bir_spor_daliyla_mesgul_musunuz
    data["profesyonel_bir_spor_daliyla_mesgul_musunuz"].fillna("hayir", inplace=True)

    # spor_dalindaki_rolunuz_nedir
    data.loc[data["spor_dalindaki_rolunuz_nedir"].isin(["0", "-"]),"spor_dalindaki_rolunuz_nedir"] = "yok"
    data.loc[data["spor_dalindaki_rolunuz_nedir"].isin(["bireysel spor", "bireysel"]),"spor_dalindaki_rolunuz_nedir"] = "biyersel spor"
    data.loc[data["spor_dalindaki_rolunuz_nedir"].isin(["lider/kaptan", "kaptan", "kaptan / lider"]),"spor_dalindaki_rolunuz_nedir"] = "lider"
    data["spor_dalindaki_rolunuz_nedir"].fillna("yok", inplace=True)

    # aktif_olarak_bir_stk_uyesi_misiniz
    data["aktif_olarak_bir_stk_uyesi_misiniz"].fillna("hayir", inplace=True)

    # hangi_stk'nin_uyesisiniz
    data["hangi_stk'nin_uyesisiniz"].fillna("-", inplace=True)

    # girisimcilikle_ilgili_deneyiminiz_var_mi
    data["girisimcilikle_ilgili_deneyiminiz_var_mi"].fillna("hayir", inplace=True)
    
    # girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz
    data["girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz"].fillna("-", inplace=True)
    
    # ingilizce_biliyor_musunuz
    data["ingilizce_biliyor_musunuz"].fillna("hayir", inplace=True)

    # drops
    if drop_id==False:
        drop_cols = [col for col in drop_cols if col != "id"]
    data.drop(columns=drop_cols, inplace=True)
    data.drop(index=drop_index_list, inplace=True)
    data.reset_index(drop=True, inplace=True)

    # after drop process
    data.loc[data["dogum_yil"].isnull(),"dogum_yil"] = data.loc[data["dogum_yil"].isnull(),
                                                                ["basvuru_yili","universite_kacinci_sinif"]
                                                                ].apply(lambda x: x.basvuru_yili-18-x.universite_kacinci_sinif, axis=1)
    data["basvuru_yas"] = data["basvuru_yili"]-data["dogum_yil"]

    return data

In [10]:
def aciklama_combiner(train, test, col, words):

    aciklama_df = pd.DataFrame(data=[], columns=["word","count","mean_deger","median_deger"])

    for word in words:
        
        if type(word) is list:

            degers = []

            for word_t in word:

                temp = train.loc[(train[col].str.contains(word_t)), "degerlendirme_puani"].tolist()
                degers += temp
        
        else:

            degers = train.loc[(train[col].str.contains(word)), "degerlendirme_puani"].values

        mean_deger = np.mean(degers)
        median_deger = np.median(degers)
        count = len(degers)
        aciklama_df.loc[aciklama_df.shape[0],:] = [word, count, mean_deger, median_deger]
    
    words_sorted = aciklama_df.sort_values(by="mean_deger", ascending=False)["word"].values

    train[col+"_selected_word"] = "-"
    train[col+"_selected_count"] = 0
    test[col+"_selected_word"] = "-"
    test[col+"_selected_count"] = 0

    for selected_word in words_sorted:
        train.loc[(train[col].str.contains(selected_word)), col+"_selected_word"] = selected_word
        train.loc[(train[col].str.contains(selected_word)), col+"_selected_count"] = train.loc[(train[col].str.contains(selected_word)), col+"_selected_count"] + 1

        test.loc[(test[col].str.contains(selected_word)), col+"_selected_word"] = selected_word
        test.loc[(test[col].str.contains(selected_word)), col+"_selected_count"] = test.loc[(test[col].str.contains(selected_word)), col+"_selected_count"] + 1

    train.loc[(train[col+"_selected_word"]=="-") & (train[col]!="-"), col+"_selected_word"] = "diger"

    test.loc[(test[col+"_selected_word"]=="-") & (test[col]!="-"), col+"_selected_word"] = "diger"

    return train, test

In [32]:
def test_train_categorical_fixer(train, test):

    other_values_for_cat_cols = {
        "bolum":"diger",
        "universite_adi":"diger",
        "girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz":"diger",
        "hangi_stk'nin_uyesisiniz":"diger"
    }

    for col in other_values_for_cat_cols.keys():

        categories_not_in_train = [cat for cat in test[col].unique() if cat not in train[col].unique()]

        test.loc[test[col].isin(categories_not_in_train),col] = other_values_for_cat_cols[col]
    
    return train, test

In [12]:
def categorical_combiner(train, test, lists_dict):

    for cat_comb_col, val_cols in lists_dict.items():

        train[cat_comb_col] = ""
        test[cat_comb_col] = ""

        for col in val_cols:

            train[cat_comb_col] = train[cat_comb_col] + "_" + train[col]
            test[cat_comb_col] = test[cat_comb_col] + "_" + test[col]

    return train, test

In [66]:
def word_column_scorer(df, df_test, message_col, score_col, new_col):
    # Step 1: Tokenize each row in the message column and assign the score to each word
    word_scores = {}

    for i, row in df.iterrows():
        score = row[score_col]
        words = row[message_col].split()  # Split message into words
        for word in words:
            if word not in word_scores:
                word_scores[word] = []
            word_scores[word].append(score)

    # Step 2: Calculate the average score for each word across all rows and the overall average
    avg_word_scores = {word: np.mean(scores) for word, scores in word_scores.items()}
    
    # Step 3: Calculate the overall average score of all words
    overall_avg_score = np.mean([score for scores in word_scores.values() for score in scores])

    # Step 4: Replace the average score for words used less than 15 times with the overall average
    word_counts = {word: len(scores) for word, scores in word_scores.items()}
    for word, count in word_counts.items():
        if count < 30:
            avg_word_scores[word] = overall_avg_score

    # Step 5: Assign average scores to each word in each row and calculate row average
    def calculate_row_score(message, avg_word_scores):
        try:
            words = message.split()
            word_avg_scores = [avg_word_scores[word] for word in words]
            return np.mean(word_avg_scores)
        except KeyError:
            return 0
        
    # Apply the function to each row in the dataframe
    df[new_col] = df[message_col].apply(lambda msg: calculate_row_score(msg, avg_word_scores))
    df_test[new_col] = df_test[message_col].apply(lambda msg: calculate_row_score(msg, avg_word_scores))

    return df, df_test

In [14]:
lists_dict = {"ebeveyn_egitim_durumu":["anne_egitim_durumu","baba_egitim_durumu"],
              "ebeveyn_calisma_durumu":["anne_calisma_durumu","baba_calisma_durumu"],
              "ebeveyn_sektor":["anne_sektor","baba_sektor"],
              "universite_bolum":["universite_adi","bolum"],
              "lise_adi_ve_turu":["lise_adi","lise_turu"],
              "spor_ve_gonulluluk":["profesyonel_bir_spor_daliyla_mesgul_musunuz","aktif_olarak_bir_stk_uyesi_misiniz"],
              "spor_ve_girisim":["profesyonel_bir_spor_daliyla_mesgul_musunuz","girisimcilikle_ilgili_deneyiminiz_var_mi"],
              "spor_ve_girisim_kulup":["profesyonel_bir_spor_daliyla_mesgul_musunuz","girisimcilik_kulupleri_tarzi_bir_kulube_uye_misiniz"],
              "gonulluluk_ve_girisim":["aktif_olarak_bir_stk_uyesi_misiniz","girisimcilikle_ilgili_deneyiminiz_var_mi"],
              "gonulluluk_ve_girisim_kulup":["aktif_olarak_bir_stk_uyesi_misiniz","girisimcilik_kulupleri_tarzi_bir_kulube_uye_misiniz"],
              "girisim_ve_girisim_kulup":["girisimcilikle_ilgili_deneyiminiz_var_mi","girisimcilik_kulupleri_tarzi_bir_kulube_uye_misiniz"],
              "spor_gonulluluk_ve_girisim_kulup":["profesyonel_bir_spor_daliyla_mesgul_musunuz","aktif_olarak_bir_stk_uyesi_misiniz","girisimcilik_kulupleri_tarzi_bir_kulube_uye_misiniz"],
              "girisim_gonulluluk_ve_girisim_kulup":["girisimcilikle_ilgili_deneyiminiz_var_mi","aktif_olarak_bir_stk_uyesi_misiniz","girisimcilik_kulupleri_tarzi_bir_kulube_uye_misiniz"],
              "spor_girisim_kulup_ve_girisim":["profesyonel_bir_spor_daliyla_mesgul_musunuz","girisimcilik_kulupleri_tarzi_bir_kulube_uye_misiniz","girisimcilikle_ilgili_deneyiminiz_var_mi"],
              "spor_gonulluluk_ve_girisim":["profesyonel_bir_spor_daliyla_mesgul_musunuz","aktif_olarak_bir_stk_uyesi_misiniz","girisimcilikle_ilgili_deneyiminiz_var_mi"],
              "spor_gonulluluk_girisim_kulup_ve_girisim":["profesyonel_bir_spor_daliyla_mesgul_musunuz","aktif_olarak_bir_stk_uyesi_misiniz","girisimcilik_kulupleri_tarzi_bir_kulube_uye_misiniz","girisimcilikle_ilgili_deneyiminiz_var_mi"]}

girisim_kelimeler = ["e-ticaret","marka","patent","isletme","herbalife",
                "network","esnaf","sosyal","sorumluluk","web","teknoloji",
                "yazilim","ticar","kendim","uretim","tekstil","giyim",
                "drop","nft","bitcoin","borsa","pazarla","pazar","dukkan",
                "fellow","seminer","sertifika","bilgisayar","covid",
                "ieee","muhendis","medya","reklam","al sat","alim satim",
                "yurtdisi","amazon","startup","kosgeb","insta","youtube",
                "facebook","freelance","yarisma","taki","teknofest","teknopark",
                "finans","garson","turizm","baski","www","bigg","oyun",
                "sanat","gida"]

gonulluluk_kelimeler = ["losev","tegev","cydd","cagdas","afad","yesil","kizil","unicef",
                   "tema","akut","ahbap","leo","lion","green","hayvan","ihh","insan",
                   "doga","sivil","kadin","lider","cocuk","aiesec","aisec","ieee","saglik",
                   "genclik","giris","mezun","ted","wwf","kacuv","yarisma",
                   "teknoloji","dijital","dunya","toplum","tegv","egitim",
                   "tog","tugva","habitat","yga","young","ataturk","otesi",
                   "isletme","siyamder","unifeb","fener","galatasaray","ultraslan","rotaract",
                   "carsi","meclis","siyas","aegee","erasmus","turmepa","deniz",
                   "sosyalben","simurg","islam","bilim","ilim"]

In [15]:
# Download Turkish stopwords from nltk
nltk.download('stopwords')
nltk.download('punkt')

# Turkish stopwords
turkish_stopwords = stopwords.words('turkish')

# Preprocessing function for Turkish text
def preprocess_turkish_text(text):
    # Lowercasing
    text = text.lower()
    # Remove special characters (keep Turkish characters intact)
    text = re.sub(r'[^a-zA-ZçğıöşüÇĞİÖŞÜ\s]', ' ', text)
    # Tokenize
    words = word_tokenize(text)
    # Remove Turkish stopwords
    words = [word for word in words if word not in turkish_stopwords]
    return ' '.join(words)

# Load the Turkish model
nlp = spacy.load('tr_core_news_md')

# Function to lemmatize Turkish text using spaCy
def lemmatize_turkish_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc])
    
def text_scorer(train,test):

    train = renamer(train)
    test = renamer(test)

    print("girisimcilik start")
    text_col = "girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz"
    train[text_col] = train[text_col].fillna("-").apply(preprocess_turkish_text)
    test[text_col] = test[text_col].fillna("-").apply(preprocess_turkish_text)
    train[text_col] = train[text_col].apply(lemmatize_turkish_text)
    test[text_col] = test[text_col].apply(lemmatize_turkish_text)

    vectorizer = TfidfVectorizer(max_features=15000)
    X = vectorizer.fit_transform(train[text_col])
    X_test = vectorizer.transform(test[text_col])
    y = train["degerlendirme_puani"].fillna(0)

    print("girisimcilik tfidf start")
    model_tfidf = XGBRegressor(random_state=42)
    model_tfidf.fit(X, y)

    train["girisimcilik_tfidf"] = model_tfidf.predict(X)
    test["girisimcilik_tfidf"] = model_tfidf.predict(X_test)

    print("gonulluluk start")
    text_col = "hangi_stk'nin_uyesisiniz"
    train[text_col] = train[text_col].fillna("-").apply(preprocess_turkish_text)
    test[text_col] = test[text_col].fillna("-").apply(preprocess_turkish_text)
    train[text_col] = train[text_col].apply(lemmatize_turkish_text)
    test[text_col] = test[text_col].apply(lemmatize_turkish_text)

    vectorizer = TfidfVectorizer(max_features=15000)
    X = vectorizer.fit_transform(train[text_col])
    X_test = vectorizer.transform(test[text_col])
    y = train["degerlendirme_puani"].fillna(0)

    print("gonulluluk tfidf start")
    model_tfidf = XGBRegressor(random_state=42, verbose=0)
    model_tfidf.fit(X, y)

    train["gonulluluk_tfidf"] = model_tfidf.predict(X)
    test["gonulluluk_tfidf"] = model_tfidf.predict(X_test)

    return train, test

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bilalcanustabas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bilalcanustabas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
print("text scorer start")
train,test = text_scorer(train,test)
print("text scorer finished")

train = preprocess(train)
test = preprocess(test, type="test")

train, test = aciklama_combiner(train, test, 
                                "girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz", girisim_kelimeler)
train, test = aciklama_combiner(train, test, 
                                "hangi_stk'nin_uyesisiniz", gonulluluk_kelimeler)

train, test = categorical_combiner(train, test, lists_dict)

train, test = test_train_categorical_fixer(train, test)

score_col = 'degerlendirme_puani'

message_col = "girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz"
new_col = "girisimcilik_score"
train, test = word_column_scorer(train, test, message_col, score_col, new_col)

message_col = "hangi_stk'nin_uyesisiniz"
new_col = "gonulluluk_score"
train, test = word_column_scorer(train, test, message_col, score_col, new_col)

train_merged = train.merge(university_df, 
                           left_on=["universite_adi","bolum"], right_on=["universite_adi","bolum_adi"], 
                           how="left")
train_merged = train_merged.drop(columns=["bolum_adi"])

test_merged = test.merge(university_df, 
                         left_on=["universite_adi","bolum"], right_on=["universite_adi","bolum_adi"], 
                         how="left")
test_merged = test_merged.drop(columns=["bolum_adi"])

def university_missing_value_handler(df):
    df.loc[df["akreditasyon"].isnull(),"akreditasyon"] = "diger"
    df.loc[df["fakulte_adi"].isnull(),"fakulte_adi"] = "diger"
    df.loc[df["ogr_sure"].isnull(),"ogr_sure"] = -1
    df.loc[df["puan_turu"].isnull(),"puan_turu"] = -1
    df.loc[df["ok_bir_kont"].isnull(),"ok_bir_kont"] = -1
    df.loc[df["basari_sirasi"].isnull(),"basari_sirasi"] = -1
    df.loc[df["yks_puan_min"].isnull(),"yks_puan_min"] = -1
    df.loc[df["pdr_count"].isnull(),"pdr_count"] = -1
    df.loc[df["ddr_count"].isnull(),"ddr_count"] = -1
    df.loc[df["drogr_count"].isnull(),"drogr_count"] = -1
    df.loc[df["kpss_1"].isnull(),"kpss_1"] = -1
    df.loc[df["kpss_2"].isnull(),"kpss_2"] = -1
    return df

train_merged = university_missing_value_handler(train_merged)
test_merged = university_missing_value_handler(test_merged)

less_30_groups = train_merged["bolum"].value_counts()[train_merged["bolum"].value_counts()<30].keys()
train_merged.loc[train_merged["bolum"].isin(less_30_groups),"bolum"] = "diger"
test_merged.loc[test_merged["bolum"].isin(less_30_groups),"bolum"] = "diger"

less_30_groups = train_merged["universite_adi"].value_counts()[train_merged["universite_adi"].value_counts()<30].keys()
train_merged.loc[train_merged["universite_adi"].isin(less_30_groups),"universite_adi"] = "diger"
test_merged.loc[test_merged["universite_adi"].isin(less_30_groups),"universite_adi"] = "diger"

text scorer start
girisimcilik start
girisimcilik tfidf start
gonulluluk start
gonulluluk tfidf start


Parameters: { "verbose" } are not used.



text scorer finished


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [17]:
train_merged.isnull().sum()[train_merged.isnull().sum() > 0]

dogum_yeri              791
ikametgah_sehri        2000
lise_sehir             1054
girisimcilik_score    55598
gonulluluk_score      52297
genel_kontr           13299
dtype: int64

In [18]:
test_merged.isnull().sum()[test_merged.isnull().sum() > 0]

girisimcilik_score    7739
gonulluluk_score      4740
genel_kontr           1297
dtype: int64

In [67]:
train_for_score_text = pd.read_csv("data/train.csv")
train_for_score_text = renamer(train_for_score_text)
train_for_score_text["degerlendirme_puani"] = train_for_score_text["degerlendirme_puani"].fillna(0)

test_for_score_text = pd.read_csv("data/test_x.csv")
test_for_score_text = renamer(test_for_score_text)

train_for_score_text["girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz"
                     ] = train_for_score_text["girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz"].apply(string_fixer)
test_for_score_text["girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz"
                    ] = test_for_score_text["girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz"].apply(string_fixer)

train_for_score_text["hangi_stk'nin_uyesisiniz"].fillna("-", inplace=True)
train_for_score_text["girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz"].fillna("-", inplace=True)
test_for_score_text["hangi_stk'nin_uyesisiniz"].fillna("-", inplace=True)
test_for_score_text["girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz"].fillna("-", inplace=True)

train_for_score_text, test_for_score_text = aciklama_combiner(train_for_score_text, test_for_score_text, 
                                "girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz", girisim_kelimeler)
train_for_score_text, test_for_score_text = aciklama_combiner(train_for_score_text, test_for_score_text, 
                                "hangi_stk'nin_uyesisiniz", gonulluluk_kelimeler)

score_col = 'degerlendirme_puani'

message_col = "girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz"
new_col = "girisimcilik_score"
train_for_score_text, test_for_score_text = word_column_scorer(train_for_score_text, test_for_score_text, 
                                                message_col, score_col, new_col)

message_col = "hangi_stk'nin_uyesisiniz"
new_col = "gonulluluk_score"
train_for_score_text, test_for_score_text = word_column_scorer(train_for_score_text, test_for_score_text, 
                                                message_col, score_col, new_col)

  train_for_score_text = pd.read_csv("data/train.csv")
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [68]:
train_deneme = renamer(pd.read_csv("data/train.csv"))
valid_train_ids = preprocess(train_deneme, type="train", drop_id=False)["id"].values

  train_deneme = renamer(pd.read_csv("data/train.csv"))


In [69]:
len(valid_train_ids)

64924

In [70]:
selected_score_cols = ["girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz_selected_word",
                       "girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz_selected_count","hangi_stk'nin_uyesisiniz_selected_word",
                       "hangi_stk'nin_uyesisiniz_selected_count","girisimcilik_score","gonulluluk_score"]
train_for_score_text_selecteds = train_for_score_text.loc[train_for_score_text["id"].isin(valid_train_ids),selected_score_cols].copy()
test_for_score_text_selecteds = test_for_score_text.loc[:,selected_score_cols].copy()

In [71]:
train_for_score_text_selecteds.shape

(64924, 6)

In [72]:
test_for_score_text_selecteds.shape

(11049, 6)

In [37]:
train_merged.to_csv("train_preprocessed_all.csv", index=False)
test_merged.to_csv("test_preprocessed_all.csv", index=False)

In [73]:
train_merged[selected_score_cols] = train_for_score_text_selecteds.values
test_merged[selected_score_cols] = test_for_score_text_selecteds.values

In [74]:
train_merged.shape

(64924, 77)

In [75]:
train_merged.isnull().sum()[train_merged.isnull().sum()>0]

dogum_yeri           791
ikametgah_sehri     2000
lise_sehir          1054
genel_kontr        13299
dtype: int64

In [76]:
# Step 1: Group by class and calculate the median score for each class
bolum_kont_means = train_merged.groupby('bolum')['genel_kontr'].mean()

# Step 2: Fill missing values in the score column with the median score of the corresponding class
train_merged['genel_kontr'] = train_merged.apply(
    lambda row: bolum_kont_means[row['bolum']] if pd.isna(row['genel_kontr']) else row['genel_kontr'], 
    axis=1
)
test_merged['genel_kontr'] = test_merged.apply(
    lambda row: bolum_kont_means[row['bolum']] if pd.isna(row['genel_kontr']) else row['genel_kontr'], 
    axis=1
)

In [78]:
train_merged["dogum_yeri"] = train_merged["dogum_yeri"].apply(lambda x: place_calculator(x, ilce_file)).fillna("bilinmiyor")
train_merged["ikametgah_sehri"] = train_merged["ikametgah_sehri"].apply(lambda x: place_calculator(x, ilce_file)).fillna("bilinmiyor")
train_merged["lise_sehir"] = train_merged["lise_sehir"].apply(lambda x: place_calculator(x, ilce_file)).fillna("bilinmiyor")

In [79]:
test_merged["dogum_yeri"] = test_merged["dogum_yeri"].apply(lambda x: place_calculator(x, ilce_file)).fillna("bilinmiyor")
test_merged["ikametgah_sehri"] = test_merged["ikametgah_sehri"].apply(lambda x: place_calculator(x, ilce_file)).fillna("bilinmiyor")
test_merged["lise_sehir"] = test_merged["lise_sehir"].apply(lambda x: place_calculator(x, ilce_file)).fillna("bilinmiyor")

In [81]:
train_merged.isnull().sum()[train_merged.isnull().sum()>0]

genel_kontr    614
dtype: int64

In [82]:
test_merged.isnull().sum()[test_merged.isnull().sum()>0]

genel_kontr    49
dtype: int64

In [83]:
train_merged['genel_kontr'] = train_merged['genel_kontr'].fillna(train_merged['genel_kontr'].mean())
test_merged['genel_kontr'] = test_merged['genel_kontr'].fillna(test_merged['genel_kontr'].mean())

In [84]:
train_merged.isnull().sum()[train_merged.isnull().sum()>0]

Series([], dtype: int64)

In [85]:
test_merged.isnull().sum()[test_merged.isnull().sum()>0]

Series([], dtype: int64)

In [90]:
train_merged["girisimcilik_score"] = train_merged["girisimcilik_score"].astype(float)
train_merged["gonulluluk_score"] = train_merged["gonulluluk_score"].astype(float)
test_merged["girisimcilik_score"] = test_merged["girisimcilik_score"].astype(float)
test_merged["gonulluluk_score"] = test_merged["gonulluluk_score"].astype(float)

In [91]:
train_merged.to_csv("train_preprocessed_all_final.csv", index=False)
test_merged.to_csv("test_preprocessed_all_final.csv", index=False)

### MODELLING

In [92]:
cat_cols = [col for col in train_merged.columns if train_merged[col].dtype == "O" or train_merged[col].dtype=="category"]

train_merged[cat_cols] = train_merged[cat_cols].astype("category")
test_merged[cat_cols] = test_merged[cat_cols].astype("category")

X = train_merged.drop(columns=["degerlendirme_puani"])
y = train_merged["degerlendirme_puani"]

model = CatBoostRegressor(objective="RMSE", eval_metric="RMSE", iterations=1000,
                          random_state=42, cat_features=cat_cols)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =42)

model.fit(X_train, y_train, eval_set=(X_test, y_test))

y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

print(f"train rmse: {root_mean_squared_error(y_train, y_pred_train)}")
print(f"test rmse: {root_mean_squared_error(y_test, y_pred)}")

Learning rate set to 0.094673
0:	learn: 16.9574669	test: 16.8926971	best: 16.8926971 (0)	total: 145ms	remaining: 2m 24s
1:	learn: 15.8340612	test: 15.7660709	best: 15.7660709 (1)	total: 218ms	remaining: 1m 48s
2:	learn: 14.8334254	test: 14.7705692	best: 14.7705692 (2)	total: 281ms	remaining: 1m 33s
3:	learn: 13.9696978	test: 13.9060092	best: 13.9060092 (3)	total: 355ms	remaining: 1m 28s
4:	learn: 13.1839687	test: 13.1257066	best: 13.1257066 (4)	total: 442ms	remaining: 1m 27s
5:	learn: 12.5170230	test: 12.4574482	best: 12.4574482 (5)	total: 524ms	remaining: 1m 26s
6:	learn: 11.9064660	test: 11.8473921	best: 11.8473921 (6)	total: 597ms	remaining: 1m 24s
7:	learn: 11.3770598	test: 11.3184257	best: 11.3184257 (7)	total: 661ms	remaining: 1m 21s
8:	learn: 10.9337699	test: 10.8799269	best: 10.8799269 (8)	total: 727ms	remaining: 1m 20s
9:	learn: 10.5325552	test: 10.4804347	best: 10.4804347 (9)	total: 801ms	remaining: 1m 19s
10:	learn: 10.1852111	test: 10.1318853	best: 10.1318853 (10)	total: 86

In [93]:
cat_cols = [col for col in train_merged.columns if train_merged[col].dtype == "O" or train_merged[col].dtype=="category"]

train_merged[cat_cols] = train_merged[cat_cols].astype("category")
test_merged[cat_cols] = test_merged[cat_cols].astype("category")

X = train_merged.drop(columns=["degerlendirme_puani"])
y = train_merged["degerlendirme_puani"]

model = CatBoostRegressor(objective="RMSE", eval_metric="RMSE", iterations=3000,
                          random_state=42, cat_features=cat_cols)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =42)

model.fit(X_train, y_train, eval_set=(X_test, y_test))

y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

print(f"train rmse: {root_mean_squared_error(y_train, y_pred_train)}")
print(f"test rmse: {root_mean_squared_error(y_test, y_pred)}")

Learning rate set to 0.048438
0:	learn: 17.5434213	test: 17.4764188	best: 17.4764188 (0)	total: 80.1ms	remaining: 4m
1:	learn: 16.9305627	test: 16.8620975	best: 16.8620975 (1)	total: 157ms	remaining: 3m 54s
2:	learn: 16.3708171	test: 16.3074664	best: 16.3074664 (2)	total: 238ms	remaining: 3m 57s
3:	learn: 15.8304090	test: 15.7674490	best: 15.7674490 (3)	total: 318ms	remaining: 3m 58s
4:	learn: 15.3104371	test: 15.2485032	best: 15.2485032 (4)	total: 385ms	remaining: 3m 50s
5:	learn: 14.8335851	test: 14.7712390	best: 14.7712390 (5)	total: 467ms	remaining: 3m 53s
6:	learn: 14.3830621	test: 14.3234812	best: 14.3234812 (6)	total: 560ms	remaining: 3m 59s
7:	learn: 13.9568518	test: 13.8995110	best: 13.8995110 (7)	total: 633ms	remaining: 3m 56s
8:	learn: 13.5656185	test: 13.5227750	best: 13.5227750 (8)	total: 703ms	remaining: 3m 53s
9:	learn: 13.1826786	test: 13.1432801	best: 13.1432801 (9)	total: 783ms	remaining: 3m 53s
10:	learn: 12.8299906	test: 12.7916883	best: 12.7916883 (10)	total: 856ms

In [94]:
cat_cols = [col for col in train_merged.columns if train_merged[col].dtype == "O" or train_merged[col].dtype=="category"]

train_merged[cat_cols] = train_merged[cat_cols].astype("category")
test_merged[cat_cols] = test_merged[cat_cols].astype("category")

X = train_merged.drop(columns=["degerlendirme_puani"])
y = train_merged["degerlendirme_puani"]

model = CatBoostRegressor(objective="RMSE", eval_metric="RMSE", iterations=5000,
                          random_state=42, cat_features=cat_cols)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =42)

model.fit(X_train, y_train, eval_set=(X_test, y_test))

y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

print(f"train rmse: {root_mean_squared_error(y_train, y_pred_train)}")
print(f"test rmse: {root_mean_squared_error(y_test, y_pred)}")

Learning rate set to 0.03547
0:	learn: 17.7093822	test: 17.6415434	best: 17.6415434 (0)	total: 91.1ms	remaining: 7m 35s
1:	learn: 17.2528217	test: 17.1839493	best: 17.1839493 (1)	total: 196ms	remaining: 8m 9s
2:	learn: 16.8203433	test: 16.7536992	best: 16.7536992 (2)	total: 278ms	remaining: 7m 43s
3:	learn: 16.4050125	test: 16.3387329	best: 16.3387329 (3)	total: 363ms	remaining: 7m 33s
4:	learn: 16.0066079	test: 15.9393980	best: 15.9393980 (4)	total: 440ms	remaining: 7m 19s
5:	learn: 15.6247227	test: 15.5591261	best: 15.5591261 (5)	total: 519ms	remaining: 7m 12s
6:	learn: 15.2543774	test: 15.1895099	best: 15.1895099 (6)	total: 606ms	remaining: 7m 12s
7:	learn: 14.9079291	test: 14.8444457	best: 14.8444457 (7)	total: 699ms	remaining: 7m 16s
8:	learn: 14.5711205	test: 14.5094399	best: 14.5094399 (8)	total: 797ms	remaining: 7m 22s
9:	learn: 14.2466340	test: 14.1864219	best: 14.1864219 (9)	total: 930ms	remaining: 7m 43s
10:	learn: 13.9478080	test: 13.8898083	best: 13.8898083 (10)	total: 1.0

In [95]:
cat_cols = [col for col in train_merged.columns if train_merged[col].dtype == "O" or train_merged[col].dtype=="category"]

train_merged[cat_cols] = train_merged[cat_cols].astype("category")
test_merged[cat_cols] = test_merged[cat_cols].astype("category")

X = train_merged.drop(columns=["degerlendirme_puani"])
y = train_merged["degerlendirme_puani"]

model = CatBoostRegressor(objective="RMSE", eval_metric="RMSE", learning_rate=0.079166, iterations=3000,
                          random_state=42, cat_features=cat_cols)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =42)

model.fit(X_train, y_train, eval_set=(X_test, y_test))

y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

print(f"train rmse: {root_mean_squared_error(y_train, y_pred_train)}")
print(f"test rmse: {root_mean_squared_error(y_test, y_pred)}")

0:	learn: 17.1529602	test: 17.0875761	best: 17.0875761 (0)	total: 88ms	remaining: 4m 23s
1:	learn: 16.1926273	test: 16.1246522	best: 16.1246522 (1)	total: 173ms	remaining: 4m 18s
2:	learn: 15.3179180	test: 15.2545261	best: 15.2545261 (2)	total: 262ms	remaining: 4m 22s
3:	learn: 14.5442425	test: 14.4804772	best: 14.4804772 (3)	total: 355ms	remaining: 4m 26s
4:	learn: 13.8285001	test: 13.7655391	best: 13.7655391 (4)	total: 434ms	remaining: 4m 20s
5:	learn: 13.1908707	test: 13.1303462	best: 13.1303462 (5)	total: 522ms	remaining: 4m 20s
6:	learn: 12.6095835	test: 12.5520740	best: 12.5520740 (6)	total: 606ms	remaining: 4m 19s
7:	learn: 12.1039182	test: 12.0438492	best: 12.0438492 (7)	total: 708ms	remaining: 4m 24s
8:	learn: 11.6402508	test: 11.5831016	best: 11.5831016 (8)	total: 827ms	remaining: 4m 34s
9:	learn: 11.2277372	test: 11.1699036	best: 11.1699036 (9)	total: 919ms	remaining: 4m 34s
10:	learn: 10.8586105	test: 10.8023946	best: 10.8023946 (10)	total: 1.01s	remaining: 4m 34s
11:	learn

In [96]:
cat_cols = [col for col in train_merged.columns if train_merged[col].dtype == "O" or train_merged[col].dtype=="category"]

model = CatBoostRegressor(objective="RMSE", learning_rate=0.079166, iterations=2977,
                          random_state=42, cat_features=cat_cols, eval_metric="RMSE")

train_merged[cat_cols] = train_merged[cat_cols].astype("category")
test_merged[cat_cols] = test_merged[cat_cols].astype("category")

X = train_merged.drop(columns=["degerlendirme_puani"])
y = train_merged["degerlendirme_puani"]

model.fit(X, y)

preds = model.predict(test_merged)

test_df = pd.read_csv('data/test_x.csv')
df_sub = pd.DataFrame({
    'id': test_df['id'],  
    'Degerlendirme Puani': preds 
})

df_sub.to_csv('submissions/submission_preprocess_VALL_ITR2977_LR079166.csv', index=False)

0:	learn: 17.1115505	total: 116ms	remaining: 5m 44s
1:	learn: 16.1511920	total: 246ms	remaining: 6m 5s
2:	learn: 15.2957039	total: 368ms	remaining: 6m 4s
3:	learn: 14.5191484	total: 493ms	remaining: 6m 6s
4:	learn: 13.8264287	total: 622ms	remaining: 6m 9s
5:	learn: 13.2043150	total: 731ms	remaining: 6m 1s
6:	learn: 12.6280962	total: 856ms	remaining: 6m 3s
7:	learn: 12.1034199	total: 988ms	remaining: 6m 6s
8:	learn: 11.6411933	total: 1.09s	remaining: 5m 58s
9:	learn: 11.2320887	total: 1.29s	remaining: 6m 23s
10:	learn: 10.8603057	total: 1.39s	remaining: 6m 16s
11:	learn: 10.5367642	total: 1.53s	remaining: 6m 17s
12:	learn: 10.2521904	total: 1.66s	remaining: 6m 17s
13:	learn: 9.9890577	total: 1.79s	remaining: 6m 18s
14:	learn: 9.7528813	total: 1.91s	remaining: 6m 17s
15:	learn: 9.5422870	total: 2.02s	remaining: 6m 13s
16:	learn: 9.3532697	total: 2.14s	remaining: 6m 13s
17:	learn: 9.1769324	total: 2.25s	remaining: 6m 9s
18:	learn: 9.0346179	total: 2.38s	remaining: 6m 10s
19:	learn: 8.8984

In [97]:
cat_cols = [col for col in train_merged.columns if train_merged[col].dtype == "O" or train_merged[col].dtype=="category"]

model = CatBoostRegressor(objective="RMSE", learning_rate=0.03547, iterations=4777,
                          random_state=42, cat_features=cat_cols, eval_metric="RMSE")

train_merged[cat_cols] = train_merged[cat_cols].astype("category")
test_merged[cat_cols] = test_merged[cat_cols].astype("category")

X = train_merged.drop(columns=["degerlendirme_puani"])
y = train_merged["degerlendirme_puani"]

model.fit(X, y)

preds = model.predict(test_merged)

test_df = pd.read_csv('data/test_x.csv')
df_sub = pd.DataFrame({
    'id': test_df['id'],  
    'Degerlendirme Puani': preds 
})

df_sub.to_csv('submissions/submission_preprocess_VALL_ITR4777_LR03547.csv', index=False)

0:	learn: 17.6831726	total: 95.1ms	remaining: 7m 34s
1:	learn: 17.2246223	total: 192ms	remaining: 7m 37s
2:	learn: 16.7907687	total: 286ms	remaining: 7m 35s
3:	learn: 16.3720143	total: 399ms	remaining: 7m 56s
4:	learn: 15.9877876	total: 521ms	remaining: 8m 17s
5:	learn: 15.6058418	total: 630ms	remaining: 8m 20s
6:	learn: 15.2330316	total: 742ms	remaining: 8m 25s
7:	learn: 14.8937542	total: 874ms	remaining: 8m 41s
8:	learn: 14.5527689	total: 1.02s	remaining: 9m 3s
9:	learn: 14.2303568	total: 1.2s	remaining: 9m 34s
10:	learn: 13.9246189	total: 1.38s	remaining: 9m 55s
11:	learn: 13.6286658	total: 1.49s	remaining: 9m 49s
12:	learn: 13.3496563	total: 1.61s	remaining: 9m 51s
13:	learn: 13.0840671	total: 1.74s	remaining: 9m 52s
14:	learn: 12.8300464	total: 1.86s	remaining: 9m 50s
15:	learn: 12.5815489	total: 1.98s	remaining: 9m 48s
16:	learn: 12.3477899	total: 2.11s	remaining: 9m 50s
17:	learn: 12.1290360	total: 2.24s	remaining: 9m 51s
18:	learn: 11.9153697	total: 2.35s	remaining: 9m 49s
19:	

In [99]:
cat_cols = [col for col in train_merged.columns if train_merged[col].dtype == "O" or train_merged[col].dtype=="category"]

train_merged[cat_cols] = train_merged[cat_cols].astype("category")
test_merged[cat_cols] = test_merged[cat_cols].astype("category")

X = train_merged.loc[train_merged["basvuru_yili"].isin([2016,2019])==False].drop(columns=["degerlendirme_puani"])
y = train_merged.loc[train_merged["basvuru_yili"].isin([2016,2019])==False]["degerlendirme_puani"]

model = CatBoostRegressor(objective="RMSE", eval_metric="RMSE", learning_rate=0.094673, iterations=3000,
                          random_state=42, cat_features=cat_cols)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =42)

model.fit(X_train, y_train, eval_set=(X_test, y_test))

y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

print(f"train rmse: {root_mean_squared_error(y_train, y_pred_train)}")
print(f"test rmse: {root_mean_squared_error(y_test, y_pred)}")

0:	learn: 14.4214218	test: 14.5300884	best: 14.5300884 (0)	total: 53.7ms	remaining: 2m 41s
1:	learn: 13.6282002	test: 13.7396307	best: 13.7396307 (1)	total: 120ms	remaining: 3m
2:	learn: 12.9598689	test: 13.0591659	best: 13.0591659 (2)	total: 184ms	remaining: 3m 4s
3:	learn: 12.3310725	test: 12.4277654	best: 12.4277654 (3)	total: 248ms	remaining: 3m 5s
4:	learn: 11.7926259	test: 11.8903272	best: 11.8903272 (4)	total: 300ms	remaining: 2m 59s
5:	learn: 11.3345772	test: 11.4388138	best: 11.4388138 (5)	total: 351ms	remaining: 2m 55s
6:	learn: 10.9140622	test: 11.0269403	best: 11.0269403 (6)	total: 405ms	remaining: 2m 53s
7:	learn: 10.5448306	test: 10.6591481	best: 10.6591481 (7)	total: 453ms	remaining: 2m 49s
8:	learn: 10.2442274	test: 10.3557517	best: 10.3557517 (8)	total: 519ms	remaining: 2m 52s
9:	learn: 9.9840887	test: 10.0935145	best: 10.0935145 (9)	total: 577ms	remaining: 2m 52s
10:	learn: 9.7527280	test: 9.8603020	best: 9.8603020 (10)	total: 624ms	remaining: 2m 49s
11:	learn: 9.5128

In [100]:
cat_cols = [col for col in train_merged.columns if train_merged[col].dtype == "O" or train_merged[col].dtype=="category"]

train_merged[cat_cols] = train_merged[cat_cols].astype("category")
test_merged[cat_cols] = test_merged[cat_cols].astype("category")

X = train_merged.drop(columns=["degerlendirme_puani","gonulluluk_tfidf","girisimcilik_tfidf"])
y = train_merged["degerlendirme_puani"]

model = CatBoostRegressor(objective="RMSE", eval_metric="RMSE", learning_rate=0.094673, iterations=3000,
                          random_state=42, cat_features=cat_cols)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =42)

model.fit(X_train, y_train, eval_set=(X_test, y_test))

y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

print(f"train rmse: {root_mean_squared_error(y_train, y_pred_train)}")
print(f"test rmse: {root_mean_squared_error(y_test, y_pred)}")

0:	learn: 16.8895340	test: 16.8256164	best: 16.8256164 (0)	total: 68.8ms	remaining: 3m 26s
1:	learn: 15.7852795	test: 15.7203671	best: 15.7203671 (1)	total: 139ms	remaining: 3m 28s
2:	learn: 14.7930924	test: 14.7293306	best: 14.7293306 (2)	total: 200ms	remaining: 3m 20s
3:	learn: 13.9179011	test: 13.8564279	best: 13.8564279 (3)	total: 268ms	remaining: 3m 20s
4:	learn: 13.1375015	test: 13.0852896	best: 13.0852896 (4)	total: 342ms	remaining: 3m 24s
5:	learn: 12.4594762	test: 12.4085958	best: 12.4085958 (5)	total: 409ms	remaining: 3m 24s
6:	learn: 11.8652875	test: 11.8212620	best: 11.8212620 (6)	total: 480ms	remaining: 3m 25s
7:	learn: 11.3539247	test: 11.3102419	best: 11.3102419 (7)	total: 540ms	remaining: 3m 22s
8:	learn: 10.8926316	test: 10.8656962	best: 10.8656962 (8)	total: 611ms	remaining: 3m 23s
9:	learn: 10.5010741	test: 10.4603306	best: 10.4603306 (9)	total: 682ms	remaining: 3m 23s
10:	learn: 10.1594887	test: 10.1168040	best: 10.1168040 (10)	total: 744ms	remaining: 3m 22s
11:	lea

In [101]:
cat_cols = [col for col in train_merged.columns if train_merged[col].dtype == "O" or train_merged[col].dtype=="category"]

train_merged[cat_cols] = train_merged[cat_cols].astype("category")
test_merged[cat_cols] = test_merged[cat_cols].astype("category")

X = train_merged.drop(columns=["degerlendirme_puani","gonulluluk_tfidf","girisimcilik_tfidf"])
y = train_merged["degerlendirme_puani"]

model = CatBoostRegressor(objective="RMSE", eval_metric="RMSE", learning_rate=0.079166, iterations=3000,
                          random_state=42, cat_features=cat_cols)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =42)

model.fit(X_train, y_train, eval_set=(X_test, y_test))

y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

print(f"train rmse: {root_mean_squared_error(y_train, y_pred_train)}")
print(f"test rmse: {root_mean_squared_error(y_test, y_pred)}")

0:	learn: 17.0963659	test: 17.0314283	best: 17.0314283 (0)	total: 67.8ms	remaining: 3m 23s
1:	learn: 16.1507458	test: 16.0851499	best: 16.0851499 (1)	total: 143ms	remaining: 3m 35s
2:	learn: 15.2826354	test: 15.2183575	best: 15.2183575 (2)	total: 203ms	remaining: 3m 22s
3:	learn: 14.4995905	test: 14.4393818	best: 14.4393818 (3)	total: 267ms	remaining: 3m 19s
4:	learn: 13.7860049	test: 13.7343291	best: 13.7343291 (4)	total: 339ms	remaining: 3m 23s
5:	learn: 13.1487433	test: 13.0986750	best: 13.0986750 (5)	total: 413ms	remaining: 3m 26s
6:	learn: 12.5866270	test: 12.5441555	best: 12.5441555 (6)	total: 494ms	remaining: 3m 31s
7:	learn: 12.0711301	test: 12.0265762	best: 12.0265762 (7)	total: 595ms	remaining: 3m 42s
8:	learn: 11.6122813	test: 11.5699531	best: 11.5699531 (8)	total: 660ms	remaining: 3m 39s
9:	learn: 11.2095845	test: 11.1654298	best: 11.1654298 (9)	total: 721ms	remaining: 3m 35s
10:	learn: 10.8411056	test: 10.7973412	best: 10.7973412 (10)	total: 783ms	remaining: 3m 32s
11:	lea

In [102]:
cat_cols = [col for col in train_merged.columns if train_merged[col].dtype == "O" or train_merged[col].dtype=="category"]

model = CatBoostRegressor(objective="RMSE", learning_rate=0.079166, iterations=2891,
                          random_state=42, cat_features=cat_cols, eval_metric="RMSE")

train_merged[cat_cols] = train_merged[cat_cols].astype("category")
test_merged[cat_cols] = test_merged[cat_cols].astype("category")

X = train_merged.drop(columns=["degerlendirme_puani","gonulluluk_tfidf","girisimcilik_tfidf"])
y = train_merged["degerlendirme_puani"]

model.fit(X, y)

preds = model.predict(test_merged.drop(columns=["gonulluluk_tfidf","girisimcilik_tfidf"]))

test_df = pd.read_csv('data/test_x.csv')
df_sub = pd.DataFrame({
    'id': test_df['id'],  
    'Degerlendirme Puani': preds 
})

df_sub.to_csv('submissions/submission_preprocess_vALL_NON_TFIDF_ITR2891_LR079166.csv', index=False)

0:	learn: 17.1021413	total: 71.2ms	remaining: 3m 25s
1:	learn: 16.1368685	total: 146ms	remaining: 3m 31s
2:	learn: 15.2810397	total: 210ms	remaining: 3m 21s
3:	learn: 14.5000267	total: 285ms	remaining: 3m 25s
4:	learn: 13.8056994	total: 362ms	remaining: 3m 28s
5:	learn: 13.1599537	total: 427ms	remaining: 3m 25s
6:	learn: 12.5883593	total: 489ms	remaining: 3m 21s
7:	learn: 12.0609117	total: 558ms	remaining: 3m 20s
8:	learn: 11.5983317	total: 627ms	remaining: 3m 20s
9:	learn: 11.1813784	total: 705ms	remaining: 3m 23s
10:	learn: 10.8175160	total: 780ms	remaining: 3m 24s
11:	learn: 10.4941148	total: 850ms	remaining: 3m 24s
12:	learn: 10.2083831	total: 925ms	remaining: 3m 24s
13:	learn: 9.9464224	total: 994ms	remaining: 3m 24s
14:	learn: 9.7039447	total: 1.05s	remaining: 3m 22s
15:	learn: 9.5021361	total: 1.12s	remaining: 3m 20s
16:	learn: 9.3079957	total: 1.19s	remaining: 3m 20s
17:	learn: 9.1366732	total: 1.25s	remaining: 3m 19s
18:	learn: 8.9815416	total: 1.31s	remaining: 3m 18s
19:	lear

In [103]:
cat_cols = [col for col in train_merged.columns if train_merged[col].dtype == "O" or train_merged[col].dtype=="category"]

train_merged[cat_cols] = train_merged[cat_cols].astype("category")
test_merged[cat_cols] = test_merged[cat_cols].astype("category")

X = train_merged.drop(columns=["degerlendirme_puani","gonulluluk_tfidf","girisimcilik_tfidf"])
y = train_merged["degerlendirme_puani"]

model = CatBoostRegressor(objective="RMSE", eval_metric="RMSE", learning_rate=0.03547, iterations=5000,
                          random_state=42, cat_features=cat_cols)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =42)

model.fit(X_train, y_train, eval_set=(X_test, y_test))

y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

print(f"train rmse: {root_mean_squared_error(y_train, y_pred_train)}")
print(f"test rmse: {root_mean_squared_error(y_test, y_pred)}")

0:	learn: 17.6842884	test: 17.6163276	best: 17.6163276 (0)	total: 85ms	remaining: 7m 5s
1:	learn: 17.2326830	test: 17.1646444	best: 17.1646444 (1)	total: 157ms	remaining: 6m 31s
2:	learn: 16.7940787	test: 16.7265488	best: 16.7265488 (2)	total: 213ms	remaining: 5m 54s
3:	learn: 16.3749875	test: 16.3095467	best: 16.3095467 (3)	total: 275ms	remaining: 5m 43s
4:	learn: 15.9708547	test: 15.9079692	best: 15.9079692 (4)	total: 347ms	remaining: 5m 47s
5:	learn: 15.5877811	test: 15.5243271	best: 15.5243271 (5)	total: 410ms	remaining: 5m 41s
6:	learn: 15.2265336	test: 15.1632964	best: 15.1632964 (6)	total: 476ms	remaining: 5m 39s
7:	learn: 14.8719401	test: 14.8092069	best: 14.8092069 (7)	total: 537ms	remaining: 5m 35s
8:	learn: 14.5360226	test: 14.4749461	best: 14.4749461 (8)	total: 604ms	remaining: 5m 34s
9:	learn: 14.2221447	test: 14.1596608	best: 14.1596608 (9)	total: 663ms	remaining: 5m 30s
10:	learn: 13.9169120	test: 13.8563279	best: 13.8563279 (10)	total: 724ms	remaining: 5m 28s
11:	learn:

In [104]:
cat_cols = [col for col in train_merged.columns if train_merged[col].dtype == "O" or train_merged[col].dtype=="category"]

train_merged[cat_cols] = train_merged[cat_cols].astype("category")
test_merged[cat_cols] = test_merged[cat_cols].astype("category")

X = train_merged.drop(columns=["degerlendirme_puani","gonulluluk_tfidf","girisimcilik_tfidf"])
y = train_merged["degerlendirme_puani"]

model = CatBoostRegressor(objective="RMSE", eval_metric="RMSE", learning_rate=0.03547, iterations=7500,
                          random_state=42, cat_features=cat_cols)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =42)

model.fit(X_train, y_train, eval_set=(X_test, y_test))

y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

print(f"train rmse: {root_mean_squared_error(y_train, y_pred_train)}")
print(f"test rmse: {root_mean_squared_error(y_test, y_pred)}")

0:	learn: 17.6842884	test: 17.6163276	best: 17.6163276 (0)	total: 81.9ms	remaining: 10m 13s
1:	learn: 17.2326830	test: 17.1646444	best: 17.1646444 (1)	total: 168ms	remaining: 10m 30s
2:	learn: 16.7940787	test: 16.7265488	best: 16.7265488 (2)	total: 237ms	remaining: 9m 53s
3:	learn: 16.3749875	test: 16.3095467	best: 16.3095467 (3)	total: 314ms	remaining: 9m 47s
4:	learn: 15.9708547	test: 15.9079692	best: 15.9079692 (4)	total: 395ms	remaining: 9m 52s
5:	learn: 15.5877811	test: 15.5243271	best: 15.5243271 (5)	total: 466ms	remaining: 9m 42s
6:	learn: 15.2265336	test: 15.1632964	best: 15.1632964 (6)	total: 529ms	remaining: 9m 26s
7:	learn: 14.8719401	test: 14.8092069	best: 14.8092069 (7)	total: 591ms	remaining: 9m 13s
8:	learn: 14.5360226	test: 14.4749461	best: 14.4749461 (8)	total: 653ms	remaining: 9m 3s
9:	learn: 14.2221447	test: 14.1596608	best: 14.1596608 (9)	total: 757ms	remaining: 9m 26s
10:	learn: 13.9169120	test: 13.8563279	best: 13.8563279 (10)	total: 874ms	remaining: 9m 55s
11:	le

In [None]:
Shrink model to first 2891 iterations.
train rmse: 5.689909275669965
test rmse: 6.290500741895375

In [105]:
cat_cols = [col for col in train_merged.columns if train_merged[col].dtype == "O" or train_merged[col].dtype=="category"]

model = CatBoostRegressor(objective="RMSE", learning_rate=0.03547, iterations=3862,
                          random_state=42, cat_features=cat_cols, eval_metric="RMSE")

train_merged[cat_cols] = train_merged[cat_cols].astype("category")
test_merged[cat_cols] = test_merged[cat_cols].astype("category")

X = train_merged.drop(columns=["degerlendirme_puani","gonulluluk_tfidf","girisimcilik_tfidf"])
y = train_merged["degerlendirme_puani"]

model.fit(X, y)

preds = model.predict(test_merged.drop(columns=["gonulluluk_tfidf","girisimcilik_tfidf"]))

test_df = pd.read_csv('data/test_x.csv')
df_sub = pd.DataFrame({
    'id': test_df['id'],  
    'Degerlendirme Puani': preds 
})

df_sub.to_csv('submissions/submission_preprocess_vALL_NON_TFIDF_ITR3862_LR03547.csv', index=False)

0:	learn: 17.6790009	total: 88.8ms	remaining: 5m 42s
1:	learn: 17.2180887	total: 174ms	remaining: 5m 35s
2:	learn: 16.7842574	total: 254ms	remaining: 5m 26s
3:	learn: 16.3697165	total: 405ms	remaining: 6m 30s
4:	learn: 15.9749089	total: 499ms	remaining: 6m 25s
5:	learn: 15.5941770	total: 572ms	remaining: 6m 7s
6:	learn: 15.2299124	total: 653ms	remaining: 5m 59s
7:	learn: 14.8825358	total: 736ms	remaining: 5m 54s
8:	learn: 14.5517982	total: 816ms	remaining: 5m 49s
9:	learn: 14.2321477	total: 898ms	remaining: 5m 45s
10:	learn: 13.9218442	total: 979ms	remaining: 5m 42s
11:	learn: 13.6370943	total: 1.05s	remaining: 5m 38s
12:	learn: 13.3515136	total: 1.14s	remaining: 5m 36s
13:	learn: 13.0841878	total: 1.22s	remaining: 5m 34s
14:	learn: 12.8277577	total: 1.3s	remaining: 5m 34s
15:	learn: 12.5888368	total: 1.38s	remaining: 5m 31s
16:	learn: 12.3598227	total: 1.46s	remaining: 5m 30s
17:	learn: 12.1352641	total: 1.54s	remaining: 5m 29s
18:	learn: 11.9261447	total: 1.62s	remaining: 5m 28s
19:	

In [106]:
cat_cols = [col for col in train_merged.columns if train_merged[col].dtype == "O" or train_merged[col].dtype=="category"]

model = CatBoostRegressor(objective="RMSE", learning_rate=0.03547, iterations=4999,
                          random_state=42, cat_features=cat_cols, eval_metric="RMSE")

train_merged[cat_cols] = train_merged[cat_cols].astype("category")
test_merged[cat_cols] = test_merged[cat_cols].astype("category")

X = train_merged.drop(columns=["degerlendirme_puani","gonulluluk_tfidf","girisimcilik_tfidf"])
y = train_merged["degerlendirme_puani"]

model.fit(X, y)

preds = model.predict(test_merged.drop(columns=["gonulluluk_tfidf","girisimcilik_tfidf"]))

test_df = pd.read_csv('data/test_x.csv')
df_sub = pd.DataFrame({
    'id': test_df['id'],  
    'Degerlendirme Puani': preds 
})

df_sub.to_csv('submissions/submission_preprocess_vALL_NON_TFIDF_ITR4999_LR03547.csv', index=False)

0:	learn: 17.6790009	total: 71.1ms	remaining: 5m 55s
1:	learn: 17.2180887	total: 150ms	remaining: 6m 15s
2:	learn: 16.7842574	total: 220ms	remaining: 6m 7s
3:	learn: 16.3697165	total: 306ms	remaining: 6m 22s
4:	learn: 15.9749089	total: 388ms	remaining: 6m 27s
5:	learn: 15.5941770	total: 458ms	remaining: 6m 21s
6:	learn: 15.2299124	total: 538ms	remaining: 6m 23s
7:	learn: 14.8825358	total: 619ms	remaining: 6m 26s
8:	learn: 14.5517982	total: 695ms	remaining: 6m 25s
9:	learn: 14.2321477	total: 772ms	remaining: 6m 25s
10:	learn: 13.9218442	total: 853ms	remaining: 6m 26s
11:	learn: 13.6370943	total: 923ms	remaining: 6m 23s
12:	learn: 13.3515136	total: 1s	remaining: 6m 24s
13:	learn: 13.0841878	total: 1.08s	remaining: 6m 25s
14:	learn: 12.8277577	total: 1.17s	remaining: 6m 27s
15:	learn: 12.5888368	total: 1.25s	remaining: 6m 28s
16:	learn: 12.3598227	total: 1.33s	remaining: 6m 31s
17:	learn: 12.1352641	total: 1.43s	remaining: 6m 34s
18:	learn: 11.9261447	total: 1.51s	remaining: 6m 35s
19:	le

KeyboardInterrupt: 

In [27]:
train_merged = pd.read_csv("train_preprocessed_all_final.csv")
test_merged = pd.read_csv("test_preprocessed_all_final.csv")

  train_merged = pd.read_csv("train_preprocessed_all_final.csv")


In [28]:
train_merged["index"] = train_merged.groupby("basvuru_yili")["degerlendirme_puani"].cumcount()
test_merged = test_merged.reset_index()

In [29]:
train_merged = train_merged.fillna("diger")

In [30]:
test_merged = pd.read_csv("test_preprocessed_all_final.csv")
test_merged = test_merged.reset_index()
test_merged = test_merged.fillna("diger")

In [33]:
train_merged, test_merged = test_train_categorical_fixer(train_merged, test_merged)

In [34]:
test_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11049 entries, 0 to 11048
Data columns (total 77 columns):
 #   Column                                                                   Non-Null Count  Dtype  
---  ------                                                                   --------------  -----  
 0   index                                                                    11049 non-null  int64  
 1   basvuru_yili                                                             11049 non-null  int64  
 2   cinsiyet                                                                 11049 non-null  object 
 3   dogum_yeri                                                               11049 non-null  object 
 4   ikametgah_sehri                                                          11049 non-null  object 
 5   universite_adi                                                           11049 non-null  object 
 6   universite_turu                                                       

In [22]:
cat_cols = [col for col in train_merged.columns if train_merged[col].dtype == "O" or train_merged[col].dtype=="category"]

model = CatBoostRegressor(objective="RMSE", iterations=3000,
                          random_state=42, cat_features=cat_cols, eval_metric="RMSE")

train_merged[cat_cols] = train_merged[cat_cols].astype("category")
test_merged[cat_cols] = test_merged[cat_cols].astype("category")

X = train_merged.drop(columns=["degerlendirme_puani","gonulluluk_tfidf","girisimcilik_tfidf"])
y = train_merged["degerlendirme_puani"]

model.fit(X, y)

preds = model.predict(test_merged.drop(columns=["gonulluluk_tfidf","girisimcilik_tfidf"]))

test_df = pd.read_csv('data/test_x.csv')
df_sub = pd.DataFrame({
    'id': test_df['id'],  
    'Degerlendirme Puani': preds 
})

df_sub.to_csv('submissions/submission_preprocess_vALL_NON_TFIDF_with_ID.csv', index=False)

Learning rate set to 0.032407
0:	learn: 17.7151209	total: 131ms	remaining: 6m 32s
1:	learn: 17.2939556	total: 210ms	remaining: 5m 14s
2:	learn: 16.8927194	total: 317ms	remaining: 5m 16s
3:	learn: 16.5012109	total: 390ms	remaining: 4m 52s
4:	learn: 16.1282360	total: 463ms	remaining: 4m 37s
5:	learn: 15.7728398	total: 525ms	remaining: 4m 22s
6:	learn: 15.4344859	total: 591ms	remaining: 4m 12s
7:	learn: 15.0994346	total: 648ms	remaining: 4m 2s
8:	learn: 14.7854368	total: 717ms	remaining: 3m 58s
9:	learn: 14.4777078	total: 783ms	remaining: 3m 54s
10:	learn: 14.1958896	total: 848ms	remaining: 3m 50s
11:	learn: 13.9226963	total: 916ms	remaining: 3m 48s
12:	learn: 13.6543989	total: 984ms	remaining: 3m 46s
13:	learn: 13.3928904	total: 1.04s	remaining: 3m 41s
14:	learn: 13.1593063	total: 1.1s	remaining: 3m 39s
15:	learn: 12.9250598	total: 1.16s	remaining: 3m 36s
16:	learn: 12.6983250	total: 1.22s	remaining: 3m 34s
17:	learn: 12.4821935	total: 1.29s	remaining: 3m 33s
18:	learn: 12.2791818	total:

CatBoostError: features data: pandas.DataFrame column 'bolum' has dtype 'category' but is not in  cat_features list

In [35]:
preds = model.predict(test_merged[X.columns])

test_df = pd.read_csv('data/test_x.csv')
df_sub = pd.DataFrame({
    'id': test_df['id'],  
    'Degerlendirme Puani': preds 
})

df_sub.to_csv('submissions/submission_preprocess_vALL_NON_TFIDF_with_ID.csv', index=False)