In [26]:
from sklearn.preprocessing import LabelBinarizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn import ensemble
from sklearn import multioutput
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler,RobustScaler
import joblib
import config as cnf
import Utils as utils
from sklearn.neighbors import LocalOutlierFactor

## Warnings

In [27]:
# For output settings
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 700)

# For warnings
warnings.filterwarnings("ignore")

## Load Dataset

In [28]:
train_df = pd.read_parquet(cnf.trainpath)
test = pd.read_parquet(cnf.testpath)

## Target

In [29]:
def do_Target_spareted(dataframe):
    new_spareted_cabin = dataframe["target"].str.split(pat = ",", expand = True)
    
    dataframe.drop("target", axis=1, inplace=True)
    
    new_spareted_cabin.rename(columns={0 : 'first_menu',
                                       1 : 'second_menu',
                                       2 : 'third_menu'}, inplace=True)
    
    return pd.concat([dataframe, new_spareted_cabin], axis=1)

In [30]:
train_df = do_Target_spareted(train_df)

In [31]:
train_df.groupby('first_menu')['n_seconds_1'].mean()

first_menu
menu1    316.009387
menu2    493.134845
menu3    418.947496
menu4    341.351390
menu5    231.593111
menu6    430.725137
menu7    410.893435
menu8    425.093297
menu9    393.796716
Name: n_seconds_1, dtype: float64

In [32]:
def binarize_column(column):
    lb = LabelBinarizer()
    transformed_data = lb.fit_transform(column)
    if column.name == "second_menu":
        transformed_data = [np.insert(row, 4, 0) for row in transformed_data]
    elif column.name == "third_menu":
        transformed_data = [np.insert(row, 2, 0) for row in transformed_data]
    return pd.Series([row.tolist() for row in transformed_data])

train_df["first_menu"] = binarize_column(train_df["first_menu"])
train_df["second_menu"] = binarize_column(train_df["second_menu"])
train_df["third_menu"] = binarize_column(train_df["third_menu"])

train_df['target'] = train_df[['first_menu', 'second_menu', 'third_menu']].apply(
    lambda row: [1 if any(x) else 0 for x in zip(*row)], axis=1
)

train_df.drop(["first_menu","second_menu","third_menu"], inplace=True, axis=1)

In [33]:
train_df = train_df.drop_duplicates(subset='id', keep='first')
test = test.drop_duplicates(subset='id', keep='first')

In [34]:
# cat_cols, num_cols, cat_but_car = utils.grab_col_names(train_df.drop('target',axis=1))

In [35]:
# clf = LocalOutlierFactor(n_neighbors=20)
# clf.fit_predict(train_df[num_cols])
# df_scores = clf.negative_outlier_factor_

# print(df_scores)

# scores = pd.DataFrame(np.sort(df_scores))
# scores.plot(stacked=True, xlim=[0, 50], style='.-')
# plt.show()


In [36]:
# th = np.sort(df_scores)[3]

# lof_drop_index = train_df[df_scores < th].index

# train_df.drop(lof_drop_index, inplace=True)

# print(lof_drop_index)

In [37]:
# train_df.drop(utils.grab_outliers(train_df,'n_seconds_3',index=True),axis=0,inplace=True)

In [38]:
def dataprep1(df):
    df['avgnseconds23']=(df['n_seconds_2']+df['n_seconds_3'])/2
    # df['nsecondsmonth']=(df['n_seconds_1']*df['month'])
    
    for i in ['gm','GM','generalmobile']:
        df['devicebrand'] = df['devicebrand'].str.replace(i, 'GENERAL_MOBILE', regex=True)
    df['devicebrand'] = df['devicebrand'].str.replace('Reeder', 'reeder', regex=True)
    df['devicebrand'] = df['devicebrand'].str.replace('Casper', 'CASPER', regex=True)
    df['devicebrand'] = df['devicebrand'].str.replace('Realme', 'realme', regex=True)
    df['devicebrand'] = df['devicebrand'].str.replace('htc', 'HTC', regex=True)
    df['devicebrand'] = df['devicebrand'].str.replace('HikING', 'HIKING', regex=True)
    df['devicebrand'] = df['devicebrand'].str.replace('Huawei', 'HUAWEI', regex=True)
    df['devicebrand'] = df['devicebrand'].str.replace('Meizu', 'meizu', regex=True)
    
    
    df['carrier'] = df['carrier'].apply(lambda x: 'VODAFONE' if 'VODAFONE' in x else x)
    df['carrier'] = df['carrier'].apply(lambda x: 'VODAFONE' if 'VF' in x else x)
    df['carrier'] = df['carrier'].apply(lambda x: 'TURKCELL' if 'LIFECELL' in x else x)
    df['carrier'] = df['carrier'].apply(lambda x: 'TURK_TELEKOM' if 'TURK TELEKO' in x else x)
    df['carrier'] = df['carrier'].apply(lambda x: 'TURK_TELEKOM' if 'TÜRK TELEKO' in x else x)
    df['carrier'] = df['carrier'].apply(lambda x: 'TURK_TELEKOM' if 'TURKTELEKOM' in x else x)
    df['carrier'] = df['carrier'].str.replace('^KCELL ', 'AKCELL', regex=True)
    df['carrier'] = df['carrier'].str.replace('^中国', 'CHINATEL', regex=True)
    df['carrier'] = df['carrier'].str.replace('ドコモ', 'JAPENTEL', regex=True)  
    df['carrier'] = df['carrier'].str.replace('^Z 4.5G+', 'ZAIN', regex=True)  


    yurtdisi = ['ALMADAR','AIRTEL','AZERCELL','BAKCELL','O2','BEE','A1 ','3_AT','IRANCELL',
                'AYYILDIZ','BH','NL','ORANGE','MOLDCELL','ZAIN','YETTEL','VERIZONE','TELEKOM',
                'TELENOR','TELE2','TELIA','MAGTI','STC','BOUYGUES','HORMUUD','JIO','LIDL','KSA',
                'FREEDOM','BUDGET','XFINITY','CHINA_TELECOM','MTN','1&1','BASE','CLARO','GEOCELL',
                'MEGAFONE','GSMOBILE','ETISALAT','TIM','MAXIS','PROXIMUS', 'SUNRISE', 'WINDTRE', 
                'VODACOM', 'LYCAMOBILE','LIBYANA','TIGO', 'ASIACELL', 'SFR','CUBACELL','AKCELL',
                'SALT','T-MOBILE', 'CHINATEL', 'JAPENTEL','ROBI','AWCC','KYIVSTAR','GLOBE','TDC',
                'DIGICEL','DIGITEL','DIGITEC','VOLNA','HANDYVERTRAG',] 
    sirket = ['TURKCELL','BIMCELL','AVEA','VODAFONE','TURK_TELEKOM','TEKNOSA','PTTCELL','KKTCELL','PRIMETEL']
    
    birlesim = yurtdisi+sirket
    
    for anahtarKelime in birlesim:
        df['carrier'] = df['carrier'].apply(lambda x: anahtarKelime if anahtarKelime in x else x)
    
    for anahtarKelime in ['HAYAT','FENER','TRABZON','61','UNKNOWN',' ','nknown']:  
        df['carrier'] = df['carrier'].apply(lambda x: 'UNKNOWN' if anahtarKelime in x else x)
    
    unknown = [row for row in df['carrier'].unique() if row not in birlesim]
    df['carrier'] = df['carrier'].apply(lambda x: 'UNKNOWN' if x in unknown else x if x not in unknown else x)
    ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
    # df['is_ios'] = df['devicebrand'].apply(lambda x: 1 if x.lower() == 'apple' else 0)
    df[['carrier', 'devicebrand']] = ordinal_encoder.fit_transform(df[['carrier', 'devicebrand']])
    # df['total_nseconds'] = df['n_seconds_1'] + df['n_seconds_2'] + df['n_seconds_3']

    return df

## TRAIN

In [39]:
train_df = dataprep1(train_df)

In [40]:
X = train_df.drop("target", axis=1)
y = train_df["target"]
X = X.drop(["id"], axis=1)

numpy_dizi = y.to_numpy()
numpy_dizi = [np.array(row) for row in numpy_dizi]

## TEST

In [41]:
test = dataprep1(test)

In [42]:
test.drop('id',axis=1,inplace=True)

In [43]:
# reg = multioutput.MultiOutputRegressor(ensemble.GradientBoostingRegressor())
# reg.fit(X, numpy_dizi)
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor

lgb_model = lgb.LGBMRegressor(learning_rate=0.05, n_estimators=500, max_depth=7, num_leaves=35)
model = MultiOutputRegressor(lgb_model)

model = model.fit(X, numpy_dizi)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13498
[LightGBM] [Info] Number of data points in the train set: 80478, number of used features: 57
[LightGBM] [Info] Start training from score 0.239270
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13498
[LightGBM] [Info] Number of data points in the train set: 80478, number of used features: 57
[LightGBM] [Info] Start training from score 0.797758
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13498
[LightGBM] [Info] Number of data points in the train set: 80478, number of used features: 57
[LightGBM] [Info] Start training from score 0.061247
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13498
[LightGBM] [Info] Number of data points in the train set: 80478, number of used features: 57
[LightGBM] [Info] Start training from score 0.409553
You can set `force_col_wise=true` to remove 

In [44]:
y_pred = model.predict(test)

In [45]:
def sample_sub(ypred):
    

    sample = pd.read_csv("csv_sample.csv")

    submission = pd.DataFrame({"id": sample["id"],
                                "target": ypred})
    # submission['target'] = submission['target'].apply(lambda x: ''.join(map(str, eval(x))))
                                
    submission.to_csv("34.csv", index=False)

In [46]:
for row in y_pred:
    list_with_index = [(value, index) for index, value in enumerate(row)]
    sorted_list = sorted(list_with_index, key=lambda x: x[0])
    three_values = [x[1] for x in sorted_list[-3:]]
    row[three_values]=1
    row[~np.isin(np.arange(len(row)), three_values)] = 0
y_pred = y_pred.astype(int)
y_pred = [row.tolist() for row in y_pred]
print(y_pred)

[[0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0, 1, 0], [0, 1, 0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 1, 1, 0, 0, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 1, 0, 1, 0], [0, 1, 0, 0, 0, 1, 0, 1, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 1, 1, 0, 0, 0, 0], [0, 1, 0, 0, 1, 1, 0, 0, 0], [1, 1, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [1, 1, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [1, 1, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 0, 1, 1, 0, 0, 0], [1, 1, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 1, 0, 1, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 1, 0, 1, 0], [1, 1, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 1, 0, 1, 0, 0, 0], [1, 1, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0, 1, 0], [0, 1, 0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 1, 

In [47]:
sample_sub(pd.Series(y_pred))

In [48]:
df = pd.read_csv('34.csv')
df['target'] = df['target'].apply(lambda x: ''.join(map(str, eval(x))))

In [49]:
df.to_csv('034.csv',index=False)

In [50]:
from sklearn.metrics import accuracy_score
y_test = pd.read_csv("034.csv")
y_pred = pd.read_csv('0control.csv')
accuracy = accuracy_score(y_test["target"], y_pred["target"])

print("Accuracy:", accuracy)

Accuracy: 1.0
