In [1]:
from sklearn.preprocessing import LabelBinarizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn import ensemble
from sklearn import multioutput
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler,RobustScaler
import joblib

## Warnings

In [2]:
# For output settings
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 700)

# For warnings
warnings.filterwarnings("ignore")

## Load Dataset

In [3]:
train_df = pd.read_parquet("train_final.parquet")
test = pd.read_parquet("test_final.parquet")

## Target

In [4]:
def do_Target_spareted(dataframe):
    new_spareted_cabin = dataframe["target"].str.split(pat = ",", expand = True)
    
    dataframe.drop("target", axis=1, inplace=True)
    
    new_spareted_cabin.rename(columns={0 : 'first_menu',
                                       1 : 'second_menu',
                                       2 : 'third_menu'}, inplace=True)
    
    return pd.concat([dataframe, new_spareted_cabin], axis=1)

In [5]:
train_df = do_Target_spareted(train_df)
def binarize_column(column):
    lb = LabelBinarizer()
    transformed_data = lb.fit_transform(column)
    if column.name == "second_menu":
        transformed_data = [np.insert(row, 4, 0) for row in transformed_data]
    elif column.name == "third_menu":
        transformed_data = [np.insert(row, 2, 0) for row in transformed_data]
    return pd.Series([row.tolist() for row in transformed_data])

train_df["first_menu"] = binarize_column(train_df["first_menu"])
train_df["second_menu"] = binarize_column(train_df["second_menu"])
train_df["third_menu"] = binarize_column(train_df["third_menu"])

train_df['target'] = train_df[['first_menu', 'second_menu', 'third_menu']].apply(
    lambda row: [1 if any(x) else 0 for x in zip(*row)], axis=1
)

train_df.drop(["first_menu","second_menu","third_menu"], inplace=True, axis=1)

In [6]:
train_df = train_df.drop_duplicates(subset='id', keep='first')
test = test.drop_duplicates(subset='id', keep='first')

In [7]:
X = train_df.drop("target", axis=1)
y = train_df["target"]

numpy_dizi = y.to_numpy()
numpy_dizi = [np.array(row) for row in numpy_dizi]

In [8]:
print(train_df['devicebrand'].unique())
a = train_df['devicebrand'].unique()

['Apple' 'samsung' 'Redmi' 'HUAWEI' 'POCO' 'OPPO' 'Sony' 'vivo' 'asus'
 'gm' 'Alcatel' 'xiaomi' 'reeder' 'HONOR' 'OMIX' 'Lenovo' 'realme' 'Meizu'
 'TCL' 'GM' 'Vestel' 'TECNO' 'lge' 'Casper' 'CASPER' 'motorola' 'OnePlus'
 'htc' 'Turk_Telekom' 'Nokia' 'Ulefone' 'google' 'HIKING' 'Reeder'
 'Trident' 'ZTE' 'generalmobile' 'blackberry' 'Elephone' 'Hytera'
 'Blackview' 'DOOGEE' 'SuperD' 'OUKITEL' 'Infinix' 'Vodafone' 'Realme'
 'UMIDIGI' 'nubia' 'Huawei' 'meizu' 'Nothing' 'DIJITSU' 'HTC' 'HiKING'
 'TURKCELL' 'Fairphone' 'iBRIT' 'KAAN' 'Cat' 'HOMETECH' 'WIKO' 'Gigaset'
 'blackshark']


In [9]:
def dataprep1(df):
    for i in ['gm','GM','generalmobile']:
        df['devicebrand'] = df['devicebrand'].str.replace(i, 'GENERAL_MOBILE', regex=True)
    df['devicebrand'] = df['devicebrand'].str.replace('Reeder', 'reeder', regex=True)
    df['devicebrand'] = df['devicebrand'].str.replace('Casper', 'CASPER', regex=True)
    df['devicebrand'] = df['devicebrand'].str.replace('Realme', 'realme', regex=True)
    df['devicebrand'] = df['devicebrand'].str.replace('htc', 'HTC', regex=True)
    df['devicebrand'] = df['devicebrand'].str.replace('HikING', 'HIKING', regex=True)
    df['devicebrand'] = df['devicebrand'].str.replace('Huawei', 'HUAWEI', regex=True)
    df['devicebrand'] = df['devicebrand'].str.replace('Meizu', 'meizu', regex=True)
    
    
    df['carrier'] = df['carrier'].apply(lambda x: 'VODAFONE' if 'VODAFONE' in x else x)
    df['carrier'] = df['carrier'].apply(lambda x: 'VODAFONE' if 'VF' in x else x)
    df['carrier'] = df['carrier'].apply(lambda x: 'TURKCELL' if 'LIFECELL' in x else x)
    df['carrier'] = df['carrier'].apply(lambda x: 'TURK_TELEKOM' if 'TURK TELEKO' in x else x)
    df['carrier'] = df['carrier'].apply(lambda x: 'TURK_TELEKOM' if 'TÜRK TELEKO' in x else x)
    df['carrier'] = df['carrier'].apply(lambda x: 'TURK_TELEKOM' if 'TURKTELEKOM' in x else x)
    df['carrier'] = df['carrier'].str.replace('^KCELL ', 'AKCELL', regex=True)
    df['carrier'] = df['carrier'].str.replace('^中国', 'CHINATEL', regex=True)
    df['carrier'] = df['carrier'].str.replace('ドコモ', 'JAPENTEL', regex=True)  
    df['carrier'] = df['carrier'].str.replace('^Z 4.5G+', 'ZAIN', regex=True)  


    yurtdisi = ['ALMADAR','AIRTEL','AZERCELL','BAKCELL','O2','BEE','A1 ','3_AT','IRANCELL',
                'AYYILDIZ','BH','NL','ORANGE','MOLDCELL','ZAIN','YETTEL','VERIZONE','TELEKOM',
                'TELENOR','TELE2','TELIA','MAGTI','STC','BOUYGUES','HORMUUD','JIO','LIDL','KSA',
                'FREEDOM','BUDGET','XFINITY','CHINA_TELECOM','MTN','1&1','BASE','CLARO','GEOCELL',
                'MEGAFONE','GSMOBILE','ETISALAT','TIM','MAXIS','PROXIMUS', 'SUNRISE', 'WINDTRE', 
                'VODACOM', 'LYCAMOBILE','LIBYANA','TIGO', 'ASIACELL', 'SFR','CUBACELL','AKCELL',
                'SALT','T-MOBILE', 'CHINATEL', 'JAPENTEL','ROBI','AWCC','KYIVSTAR','GLOBE','TDC',
                'DIGICEL','DIGITEL','DIGITEC','VOLNA','HANDYVERTRAG',] 
    sirket = ['TURKCELL','BIMCELL','AVEA','VODAFONE','TURK_TELEKOM','TEKNOSA','PTTCELL','KKTCELL','PRIMETEL']
    
    birlesim = yurtdisi+sirket
    
    for anahtarKelime in birlesim:
        df['carrier'] = df['carrier'].apply(lambda x: anahtarKelime if anahtarKelime in x else x)
    
    for anahtarKelime in ['HAYAT','FENER','TRABZON','61','UNKNOWN',' ','nknown']:  
        df['carrier'] = df['carrier'].apply(lambda x: 'UNKNOWN' if anahtarKelime in x else x)
    print(df['carrier'].unique())
    
    unknown = [row for row in df['carrier'].unique() if row not in birlesim]
    df['carrier'] = df['carrier'].apply(lambda x: 'UNKNOWN' if x in unknown else x if x not in unknown else x)
    ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
    df[['carrier', 'devicebrand']] = ordinal_encoder.fit_transform(df[['carrier', 'devicebrand']])
    # df['total_nseconds'] = df['n_seconds_1'] + df['n_seconds_2'] + df['n_seconds_3']

    return df

## TRAIN

In [10]:
train_df = dataprep1(train_df)

['VODAFONE' 'TURKCELL' 'TELEKOM' 'UNKNOWN' 'BIMCELL' 'PTTCELL' 'TEKNOSA'
 'O2' 'KKTCELL' 'CUMHURIYET' 'CAPA' 'IAM' 'VERIZON' 'STC' 'XFINITY'
 'GSMOBILE' 'TELIA' '1&1' 'BASE' 'CLARO' 'BOUYGUES' 'OZLEM' 'GEOCELL' 'NL'
 'ERTANZULAL' 'KOREK' 'SUNRISE' 'ZAIN' 'MEDIONMOBILE' 'METIN' 'AYYILDIZ'
 'EVATIS' 'AVEA' 'HYPNOGAJA' 'GAYETIYICEKIYOR' 'ETISALAT' 'ORANGE' 'SALT'
 'ELUX' 'KSA' 'VIRGIN' 'CARRIER' 'BEE' 'ILIAD' 'BURAK' 'FREE' 'TAMER'
 'AZERCELL' 'SFR' 'T-MOBILE' 'CUBACEL' 'KCELL' 'ASIACELL' 'TIGO' 'DU'
 'PREMIUMSIM' 'TALKMORE' 'TELE2' 'LIBYANA' 'TELENOR' 'SEVOCELL'
 'LYCAMOBILE' 'VODACOM' 'K' 'WINDTRE' 'NAGIHAN' 'MAXIS' 'AIRALO' 'PLAY'
 'OOREDOO' 'FIDO' 'KONUR' 'UZTELECOM' 'ALTAY' 'PRIMETEL' '3' 'KAYIHAN'
 'VESTELCELL' 'YETTEL' '3AT' 'DJEZZY' 'MAHMUTEFE' 'KUTAY' 'OPTUS'
 'MOVISTAR' 'ELISA' 'TIM' 'MEGAFON' 'SIMDISCOUNT.DE' 'CEM' 'PROXIMUS'
 'MELIHA' 'SMART' 'THREE' 'SINGTEL' 'MCI' 'BILAL' 'AKCELL' 'BLAU' 'AT&T'
 'ALPCIGIM' 'VIVACOM' 'MAGTI' 'BANGLALINK' 'FASTWEB' 'ALMADAR' 'IRANCELL'
 'AIRTE

In [11]:
X = train_df.drop("target", axis=1)
y = train_df["target"]
X = X.drop(["id"], axis=1)

## TEST

In [12]:
test = dataprep1(test)

['VODAFONE' 'TURKCELL' 'TELEKOM' 'UNKNOWN' 'BIMCELL' 'KKTCELL' 'STC'
 'PTTCELL' 'TEKNOSA' 'OOREDOO' 'T-MOBILE' 'AVEA' 'FREE' 'AT&T' 'HAKAN'
 'O2' 'KOREK' 'AYYILDIZ' 'WILLKOMMEN' 'ELUX' 'ASIACELL' 'MEDIONMOBILE'
 'ETISALAT' 'MOBILCOM-DEBITEL' 'AKCELL' 'SWISSCOM' 'MOBILIS' 'YETTEL'
 'LYCAMOBILE' 'ALMADAR' 'SFR' 'COMVIQ' 'LONESTAR' 'TELE2' 'GSMOBILE' 'BEE'
 'IPAD' 'ORHUN' 'GEOCELL' 'SUNRISE' 'CARRIER' '1&1' 'NL' 'KPN' 'WINSIM'
 'ORANGE' 'CHINATEL' 'GULER' 'FLOW' 'VIRGIN' 'CUMHURIYET' 'ZAIN' 'ワイモバイル'
 'GIFFGAFF' 'PROXIMUS' 'ROGERS' 'VOO' 'ANKARAGUCU' 'YASIN' 'TDC' 'OPTUS'
 'DROETKER' 'MINT' 'UMITDOST' 'DENT' 'UCELL' 'VERIZON' 'NESLIHAN'
 'MEGAFON' 'WINDTRE' 'DU' 'FIKRETBLR' 'UZMOBILE' 'ATATURK' 'FTHKR' '3'
 'TOUCH' 'FASTLINK' 'TELCEL' 'PREMIUMSIM' 'MOVISTAR' '3AT' 'AIRTEL' 'TIM'
 'BATELCO' 'KONUR' 'TR' 'TELENOR' 'YOTA' 'EE' 'FRAENK' 'CHATR' 'COSMOTE'
 'BOUYGTEL' 'IR-MCI' 'DREI' 'BT' 'TRUE-H' 'BOUYGUES' 'TELEMACH' 'BUDGET'
 'SYMA' '938' 'IRANCELL']


In [13]:
test.drop('id',axis=1,inplace=True)

In [14]:
# reg = multioutput.MultiOutputRegressor(ensemble.GradientBoostingRegressor())
# reg.fit(X, numpy_dizi)
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor

lgb_model = lgb.LGBMRegressor(learning_rate=0.05, n_estimators=500, max_depth=7, num_leaves=35)
model = MultiOutputRegressor(lgb_model)

model = model.fit(X, numpy_dizi)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13243
[LightGBM] [Info] Number of data points in the train set: 80478, number of used features: 56
[LightGBM] [Info] Start training from score 0.239270
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13243
[LightGBM] [Info] Number of data points in the train set: 80478, number of used features: 56
[LightGBM] [Info] Start training from score 0.797758
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13243
[LightGBM] [Info] Number of data points in the train set: 80478, number of used features: 56
[LightGBM] [Info] Start training from score 0.061247
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13243
[LightGBM] [Info] Number of data points in the train set: 80478, number of used features: 56
[LightGBM] [Info] Start training from score 0.409553
You can set `force_col_wise=true` to remove 

In [15]:
y_pred = model.predict(test)

In [16]:
def sample_sub(ypred):
    

    sample = pd.read_csv("csv_sample.csv")

    submission = pd.DataFrame({"id": sample["id"],
                                "target": ypred})
    # submission['target'] = submission['target'].apply(lambda x: ''.join(map(str, eval(x))))
                                
    submission.to_csv("control.csv", index=False)

In [17]:
for row in y_pred:
    list_with_index = [(value, index) for index, value in enumerate(row)]
    sorted_list = sorted(list_with_index, key=lambda x: x[0])
    three_values = [x[1] for x in sorted_list[-3:]]
    row[three_values]=1
    row[~np.isin(np.arange(len(row)), three_values)] = 0
y_pred = y_pred.astype(int)
y_pred = [row.tolist() for row in y_pred]
print(y_pred)

[[0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0, 1, 0], [0, 1, 0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 0, 0, 1, 0, 1, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 1, 1, 0, 0, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 1, 0, 1, 0], [0, 1, 0, 1, 0, 0, 0, 1, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 0, 1, 1, 0, 0, 0], [1, 1, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [1, 1, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [1, 1, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 0, 0, 1, 0, 0, 1], [0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 1, 0, 1, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 1, 0, 1, 0], [1, 1, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [1, 1, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0, 1, 0], [0, 1, 0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 1, 

In [18]:
sample_sub(pd.Series(y_pred))

In [19]:
df = pd.read_csv('control.csv')
df['target'] = df['target'].apply(lambda x: ''.join(map(str, eval(x))))

In [20]:
df.to_csv('0control.csv',index=False)

In [21]:
from sklearn.metrics import accuracy_score
y_test = pd.read_csv("csv\st.csv")
y_pred = pd.read_csv('0control.csv')
accuracy = accuracy_score(y_test["target"], y_pred["target"])

print("Accuracy:", accuracy)

Accuracy: 0.8816394813885403
