In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import LabelBinarizer

In [3]:
# !pip install pyarrow

Collecting pyarrow
  Using cached pyarrow-13.0.0-cp310-cp310-win_amd64.whl (24.3 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-13.0.0



[notice] A new release of pip is available: 23.0.1 -> 23.2.1
[notice] To update, run: C:\Users\aydin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [4]:
# !pip install fastparquet

Collecting fastparquet
  Using cached fastparquet-2023.8.0-cp310-cp310-win_amd64.whl (715 kB)
Collecting cramjam>=2.3
  Using cached cramjam-2.7.0-cp310-none-win_amd64.whl (1.3 MB)
Collecting fsspec
  Using cached fsspec-2023.9.2-py3-none-any.whl (173 kB)
Installing collected packages: fsspec, cramjam, fastparquet
Successfully installed cramjam-2.7.0 fastparquet-2023.8.0 fsspec-2023.9.2



[notice] A new release of pip is available: 23.0.1 -> 23.2.1
[notice] To update, run: C:\Users\aydin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
data = pd.read_parquet("train_final.parquet")

In [4]:
data.head()

Unnamed: 0,id,month,n_seconds_1,n_seconds_2,n_seconds_3,carrier,devicebrand,feature_0,feature_1,feature_2,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,target
0,5beefd4d2bf4a4767e0df8108,10,5245.571,981.182,205.948,VODAFONE TR,Apple,-1.197737,1.11336,-1.123334,...,-2.613336,-2.032903,2.645719,-1.023478,1.658986,-1.559406,-2.161336,30,58,"menu2, menu4, menu5"
1,867285b116c063d5a8482f5be,10,5184.876,557.65,487.587,TURKCELL,samsung,-2.336352,2.567766,-0.494908,...,-0.983938,-1.453756,-0.021547,-0.19577,2.775513,-0.31898,-4.291473,21,45,"menu7, menu8, menu4"
2,c82a7cbd2e00d9b66c06bcadc,10,3835.618,3275.128,43.806,TURK TELEKOM,Redmi,-2.561455,2.061736,-0.184511,...,-1.668703,-3.599403,1.673868,0.63179,1.293131,-2.230909,-2.383524,19,61,"menu2, menu8, menu4"
3,f2d2b25073ccc298eced86897,10,3532.544,154.509,64.724,TURKCELL,samsung,-2.529918,3.35805,-0.851366,...,-1.861418,-1.219658,1.863495,0.21317,1.02971,-1.142185,-4.466191,2,41,"menu6, menu2, menu1"
4,7818c92a58af0f2cb7c361738,10,3344.192,787.896,715.115,VODAFONE TR,samsung,-2.922361,2.096124,0.060796,...,-0.142903,-1.875545,1.024499,-0.186423,-0.061626,-1.462175,-2.371206,23,85,"menu6, menu2, menu8"


In [22]:
def check_df(dataframe, head=5, non_numeric=True):
    """
    Veri setinin genel özellikleirni yazdirir.
    non_numeric değişken veri setinde kategorik değişken varsa false yapılmalıdır.

    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### DESCRIBE #####################")
    print(dataframe.describe())
    if non_numeric:
        print("##################### Quantiles #####################")
        print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)


def num_summary(dataframe, numerical_col, plot=False):
    """
        Numerik kolonlar input olarak verilmelidir.
        Sadece ekrana cikti veren herhangi bir degeri return etmeyen bir fonksiyondur.
        For dongusuyle calistiginda grafiklerde bozulma olmamaktadir.
    """
    quantiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    print(dataframe[numerical_col].describe(quantiles).T)

    if plot:
        dataframe[numerical_col].hist(bins=20)
        plt.xlabel(numerical_col)
        plt.title(numerical_col)
        plt.show(block=True)


def grab_col_names(dataframe, cat_th=10, car_th=20):
    """
    Veri setindeki kategorik, numerik ve kategorik fakat kardinal değişkenlerin isimlerini verir.
    Not: Kategorik değişkenlerin içerisine numerik görünümlü kategorik değişkenleri de dahil eder.

    Parameters
    ------
        dataframe: dataframe
                Değişken isimleri alınmak istenilen dataframe
        cat_th: int, optional
                numerik fakat kategorik olan değişkenler için sınıf eşik değeri
        car_th: int, optinal
                kategorik fakat kardinal değişkenler için sınıf eşik değeri

    Returns
    ------
        cat_cols: list
                Kategorik değişken listesi
        num_cols: list
                Numerik değişken listesi

    Examples
    ------
        import seaborn as sns
        df = sns.load_dataset("iris")
        print(grab_col_names(df))


    Notes
    ------
        cat_cols + num_cols = toplam değişken sayısı
        num_but_cat cat_cols'un içerisinde.
    """
    # cat_cols, cat_but_car
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and
                   dataframe[col].dtypes != "O"]
    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and
                   dataframe[col].dtypes == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # num_cols
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car

def missing_values_table(dataframe, na_name=False):
    na_columns = [col for col in dataframe.columns if dataframe[col].isnull().sum() > 0]

    n_miss = dataframe[na_columns].isnull().sum().sort_values(ascending=False)
    ratio = (dataframe[na_columns].isnull().sum() / dataframe.shape[0] * 100).sort_values(ascending=False)
    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    if na_name:
        return na_columns

In [28]:
def cat_summary(dataframe, col_name, plot=False):
    print(pd.DataFrame({col_name: dataframe[col_name].value_counts(),
                        "Ratio": 100 * dataframe[col_name].value_counts() / len(dataframe)}))
    print("##########################################")
    if plot:
        sns.countplot(x=dataframe[col_name], data=dataframe)
        plt.show()

In [5]:
def do_Target_spareted(dataframe):
    new_spareted_cabin = dataframe["target"].str.split(pat = ",", expand = True)
    
    dataframe.drop("target", axis=1, inplace=True)
    
    new_spareted_cabin.rename(columns={0 : 'first_menu',
                                       1 : 'second_menu',
                                       2 : 'third_menu'}, inplace=True)
    
    return pd.concat([dataframe, new_spareted_cabin], axis=1)

In [None]:
check_df(data)

In [18]:
missing_values_table(data)

Empty DataFrame
Columns: [n_miss, ratio]
Index: []


In [13]:
data["carrier"].nunique()

556

In [8]:
data["carrier"].unique()

array(['VODAFONE TR', 'TURKCELL', 'TURK TELEKOM', 'TELEKOM.DE', 'Unknown',
       'BIMCELL', 'PTTCELL', 'TEKNOSACELL', 'O2-DE', 'KKTCELL',
       'CUMHURIYET', 'TR TURKCELL', 'VODAFONE', 'CAPA', 'IAM', 'VERIZON',
       'LIFECELL TR', 'STC KSA RIBF2022', 'SULEYMAN YAGIZ ', '3 AT',
       'HAYATEVESIGAR', 'XFINITY MOBILE', 'GSMOBILE', 'FENERCELL',
       'O2 - DE+', 'TELIA', 'VODAFONE UK', '1&1', 'BASE', 'CLARO PERU',
       'F-BOUYGUES TELECOM', 'VODAFONE RO', 'OZLEM', 'GEOCELL',
       'TRABZONCELL', 'KPN NL', 'ERTANZULAL', 'O2 - DE', 'KOREK',
       'SUNRISE', 'Z 4.5G+', 'MEDIONMOBILE', 'METIN', 'DIVERSEY KIMYA',
       'AYYILDIZ.DE', 'EVATIS', ' ', 'ZAIN IQ', 'TELEKOM.RO', 'AVEA',
       'FENERCELL-AVEA', 'HYPNOGAJA', 'IBRAHIM KUTSI ', 'VODAFONE.DE',
       'GAYETIYICEKIYOR', 'UAE 51 ETISALAT', 'VODA AU', 'OSMAN SARIKAYA ',
       'ETISALAT', 'ORANGE F', 'VODAFONE IT', 'SALT', 'ELUX',
       'MUTLU YILLAR', 'WTTC KSA', 'VIRGIN', 'CARRIER', 'BEELINE',
       'ILIAD', 'BURAK', 'UZB UC

In [23]:
cat_cols, num_cols, cat_but_car = grab_col_names(data)

Observations: 94049
Variables: 58
cat_cols: 1
num_cols: 53
cat_but_car: 4
num_but_cat: 1


In [29]:
for col in cat_cols:
    cat_summary(data, col)

    month      Ratio
12  44433  47.244521
11  34485  36.667057
10  15131  16.088422
##########################################


In [None]:
for col in cat_but_car:
    cat_summary(data, col)

In [20]:
cat_cols

['month']

In [6]:
new_data = do_Target_spareted(data)

In [7]:
new_data.head()

Unnamed: 0,id,month,n_seconds_1,n_seconds_2,n_seconds_3,carrier,devicebrand,feature_0,feature_1,feature_2,...,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,first_menu,second_menu,third_menu
0,5beefd4d2bf4a4767e0df8108,10,5245.571,981.182,205.948,VODAFONE TR,Apple,-1.197737,1.11336,-1.123334,...,2.645719,-1.023478,1.658986,-1.559406,-2.161336,30,58,menu2,menu4,menu5
1,867285b116c063d5a8482f5be,10,5184.876,557.65,487.587,TURKCELL,samsung,-2.336352,2.567766,-0.494908,...,-0.021547,-0.19577,2.775513,-0.31898,-4.291473,21,45,menu7,menu8,menu4
2,c82a7cbd2e00d9b66c06bcadc,10,3835.618,3275.128,43.806,TURK TELEKOM,Redmi,-2.561455,2.061736,-0.184511,...,1.673868,0.63179,1.293131,-2.230909,-2.383524,19,61,menu2,menu8,menu4
3,f2d2b25073ccc298eced86897,10,3532.544,154.509,64.724,TURKCELL,samsung,-2.529918,3.35805,-0.851366,...,1.863495,0.21317,1.02971,-1.142185,-4.466191,2,41,menu6,menu2,menu1
4,7818c92a58af0f2cb7c361738,10,3344.192,787.896,715.115,VODAFONE TR,samsung,-2.922361,2.096124,0.060796,...,1.024499,-0.186423,-0.061626,-1.462175,-2.371206,23,85,menu6,menu2,menu8


In [None]:
type(data["target"])

In [8]:
type(pd.array(data["target"]))

pandas.core.arrays.numpy_.PandasArray

In [8]:
lb = LabelBinarizer()
transformed_data_first = lb.fit_transform(new_data["first_menu"])
transformed_data_second = lb.fit_transform(new_data["second_menu"])
transformed_data_third = lb.fit_transform(new_data["third_menu"])

In [104]:
new_data["third_menu"].unique()

array([' menu5', ' menu4', ' menu1', ' menu8', ' menu9', ' menu6',
       ' menu2', ' menu7'], dtype=object)

In [105]:
new_data["second_menu"].unique()

array([' menu4', ' menu8', ' menu2', ' menu1', ' menu3', ' menu7',
       ' menu6', ' menu9'], dtype=object)

In [106]:
transformed_data_first.shape

(94049, 9)

In [9]:
transformed_data_first[0]

array([0, 1, 0, 0, 0, 0, 0, 0, 0])

In [107]:
transformed_data_second[0]

array([0, 0, 0, 1, 0, 0, 0, 0])

In [10]:
transformed_data_second = [np.insert(row, 4, 0) for row in transformed_data_second]
transformed_data_third = [np.insert(row, 2, 0) for row in transformed_data_third]

In [11]:
new_data.shape

(94049, 60)

In [12]:
new_data_k = new_data

In [13]:
new_data.head()

Unnamed: 0,id,month,n_seconds_1,n_seconds_2,n_seconds_3,carrier,devicebrand,feature_0,feature_1,feature_2,...,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,first_menu,second_menu,third_menu
0,5beefd4d2bf4a4767e0df8108,10,5245.571,981.182,205.948,VODAFONE TR,Apple,-1.197737,1.11336,-1.123334,...,2.645719,-1.023478,1.658986,-1.559406,-2.161336,30,58,menu2,menu4,menu5
1,867285b116c063d5a8482f5be,10,5184.876,557.65,487.587,TURKCELL,samsung,-2.336352,2.567766,-0.494908,...,-0.021547,-0.19577,2.775513,-0.31898,-4.291473,21,45,menu7,menu8,menu4
2,c82a7cbd2e00d9b66c06bcadc,10,3835.618,3275.128,43.806,TURK TELEKOM,Redmi,-2.561455,2.061736,-0.184511,...,1.673868,0.63179,1.293131,-2.230909,-2.383524,19,61,menu2,menu8,menu4
3,f2d2b25073ccc298eced86897,10,3532.544,154.509,64.724,TURKCELL,samsung,-2.529918,3.35805,-0.851366,...,1.863495,0.21317,1.02971,-1.142185,-4.466191,2,41,menu6,menu2,menu1
4,7818c92a58af0f2cb7c361738,10,3344.192,787.896,715.115,VODAFONE TR,samsung,-2.922361,2.096124,0.060796,...,1.024499,-0.186423,-0.061626,-1.462175,-2.371206,23,85,menu6,menu2,menu8


In [10]:
transformed_data_first[0:5]

array([[0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [11]:
transformed_data_third[0:5]

[array([0, 0, 0, 0, 1, 0, 0, 0, 0]),
 array([0, 0, 0, 1, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 1, 0, 0, 0, 0, 0]),
 array([1, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 1, 0])]

In [14]:
transformed_data_list = [row.tolist() for row in transformed_data_first]

# Listeyi Pandas Serisi'ne dönüştürme
transformed_data_series = pd.Series(transformed_data_list)

In [15]:
transformed_data_list_second = [row.tolist() for row in transformed_data_second]

# Listeyi Pandas Serisi'ne dönüştürme
transformed_data_series_second = pd.Series(transformed_data_list_second)

In [16]:
transformed_data_list_third = [row.tolist() for row in transformed_data_third]

# Listeyi Pandas Serisi'ne dönüştürme
transformed_data_series_third = pd.Series(transformed_data_list_third)

In [81]:
transformed_data_series

0        [0, 1, 0, 0, 0, 0, 0, 0, 0]
1        [0, 0, 0, 0, 0, 0, 1, 0, 0]
2        [0, 1, 0, 0, 0, 0, 0, 0, 0]
3        [0, 0, 0, 0, 0, 1, 0, 0, 0]
4        [0, 0, 0, 0, 0, 1, 0, 0, 0]
                    ...             
94044    [0, 1, 0, 0, 0, 0, 0, 0, 0]
94045    [0, 0, 0, 0, 0, 0, 0, 0, 1]
94046    [0, 0, 0, 0, 0, 1, 0, 0, 0]
94047    [0, 0, 0, 0, 0, 1, 0, 0, 0]
94048    [0, 0, 0, 0, 0, 1, 0, 0, 0]
Length: 94049, dtype: object

In [57]:
new_data_k["first_menu"].shape

(94049,)

In [58]:
transformed_data_first.shape

(94049, 9)

In [98]:
transformed_data_second.shape

(94049, 8)

In [99]:
transformed_data_third.shape

(94049, 8)

In [17]:
new_data_k["first_menu"] = transformed_data_series
new_data_k["second_menu"] = transformed_data_series_second
new_data_k["third_menu"] = transformed_data_series_third

In [18]:
new_data_k.shape

(94049, 60)

In [19]:
new_data_k.head()

Unnamed: 0,id,month,n_seconds_1,n_seconds_2,n_seconds_3,carrier,devicebrand,feature_0,feature_1,feature_2,...,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,first_menu,second_menu,third_menu
0,5beefd4d2bf4a4767e0df8108,10,5245.571,981.182,205.948,VODAFONE TR,Apple,-1.197737,1.11336,-1.123334,...,2.645719,-1.023478,1.658986,-1.559406,-2.161336,30,58,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1, 0, 0, 0, 0]"
1,867285b116c063d5a8482f5be,10,5184.876,557.65,487.587,TURKCELL,samsung,-2.336352,2.567766,-0.494908,...,-0.021547,-0.19577,2.775513,-0.31898,-4.291473,21,45,"[0, 0, 0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0]"
2,c82a7cbd2e00d9b66c06bcadc,10,3835.618,3275.128,43.806,TURK TELEKOM,Redmi,-2.561455,2.061736,-0.184511,...,1.673868,0.63179,1.293131,-2.230909,-2.383524,19,61,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0]"
3,f2d2b25073ccc298eced86897,10,3532.544,154.509,64.724,TURKCELL,samsung,-2.529918,3.35805,-0.851366,...,1.863495,0.21317,1.02971,-1.142185,-4.466191,2,41,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0]"
4,7818c92a58af0f2cb7c361738,10,3344.192,787.896,715.115,VODAFONE TR,samsung,-2.922361,2.096124,0.060796,...,1.024499,-0.186423,-0.061626,-1.462175,-2.371206,23,85,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0]"


In [29]:
new_data_k['target'] = new_data_k.apply(lambda row: [1 if any(x) else 0 for x in zip(*row[['first_menu', 'second_menu', 'third_menu']])], axis=1)

In [30]:
new_data_k.head()

Unnamed: 0,id,month,n_seconds_1,n_seconds_2,n_seconds_3,carrier,devicebrand,feature_0,feature_1,feature_2,...,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,first_menu,second_menu,third_menu,Target
0,5beefd4d2bf4a4767e0df8108,10,5245.571,981.182,205.948,VODAFONE TR,Apple,-1.197737,1.11336,-1.123334,...,-1.023478,1.658986,-1.559406,-2.161336,30,58,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 1, 0, 1, 1, 0, 0, 0, 0]"
1,867285b116c063d5a8482f5be,10,5184.876,557.65,487.587,TURKCELL,samsung,-2.336352,2.567766,-0.494908,...,-0.19577,2.775513,-0.31898,-4.291473,21,45,"[0, 0, 0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 0, 0, 1, 0, 0, 1, 1, 0]"
2,c82a7cbd2e00d9b66c06bcadc,10,3835.618,3275.128,43.806,TURK TELEKOM,Redmi,-2.561455,2.061736,-0.184511,...,0.63179,1.293131,-2.230909,-2.383524,19,61,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 1, 0, 1, 0, 0, 0, 1, 0]"
3,f2d2b25073ccc298eced86897,10,3532.544,154.509,64.724,TURKCELL,samsung,-2.529918,3.35805,-0.851366,...,0.21317,1.02971,-1.142185,-4.466191,2,41,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 0, 0, 0, 1, 0, 0, 0]"
4,7818c92a58af0f2cb7c361738,10,3344.192,787.896,715.115,VODAFONE TR,samsung,-2.922361,2.096124,0.060796,...,-0.186423,-0.061626,-1.462175,-2.371206,23,85,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0]","[0, 1, 0, 0, 0, 1, 0, 1, 0]"


In [31]:
new_data_k.drop("first_menu", inplace=True, axis=1)
new_data_k.drop("second_menu", inplace=True, axis=1)
new_data_k.drop("third_menu", inplace=True, axis=1)

In [32]:
new_data_k.head()

Unnamed: 0,id,month,n_seconds_1,n_seconds_2,n_seconds_3,carrier,devicebrand,feature_0,feature_1,feature_2,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,Target
0,5beefd4d2bf4a4767e0df8108,10,5245.571,981.182,205.948,VODAFONE TR,Apple,-1.197737,1.11336,-1.123334,...,-2.613336,-2.032903,2.645719,-1.023478,1.658986,-1.559406,-2.161336,30,58,"[0, 1, 0, 1, 1, 0, 0, 0, 0]"
1,867285b116c063d5a8482f5be,10,5184.876,557.65,487.587,TURKCELL,samsung,-2.336352,2.567766,-0.494908,...,-0.983938,-1.453756,-0.021547,-0.19577,2.775513,-0.31898,-4.291473,21,45,"[0, 0, 0, 1, 0, 0, 1, 1, 0]"
2,c82a7cbd2e00d9b66c06bcadc,10,3835.618,3275.128,43.806,TURK TELEKOM,Redmi,-2.561455,2.061736,-0.184511,...,-1.668703,-3.599403,1.673868,0.63179,1.293131,-2.230909,-2.383524,19,61,"[0, 1, 0, 1, 0, 0, 0, 1, 0]"
3,f2d2b25073ccc298eced86897,10,3532.544,154.509,64.724,TURKCELL,samsung,-2.529918,3.35805,-0.851366,...,-1.861418,-1.219658,1.863495,0.21317,1.02971,-1.142185,-4.466191,2,41,"[1, 1, 0, 0, 0, 1, 0, 0, 0]"
4,7818c92a58af0f2cb7c361738,10,3344.192,787.896,715.115,VODAFONE TR,samsung,-2.922361,2.096124,0.060796,...,-0.142903,-1.875545,1.024499,-0.186423,-0.061626,-1.462175,-2.371206,23,85,"[0, 1, 0, 0, 0, 1, 0, 1, 0]"


In [26]:
# new_data_k.drop("Target", inplace=True, axis=1)

In [42]:
test = pd.read_parquet("test_final.parquet")

In [43]:
test.head()

Unnamed: 0,id,month,n_seconds_1,n_seconds_2,n_seconds_3,carrier,devicebrand,feature_0,feature_1,feature_2,...,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49
0,2e6105f5911256f4f6c4813ed,1,6893.544,246.854,242.636,VODAFONE TR,samsung,-1.723524,3.216489,-1.138474,...,-1.094519,-1.217407,-4.280456,1.51224,-2.306445,2.066388,0.844927,-1.026193,18,58
1,c56ad71dae0a5dbd3e7d36adc,1,4481.065,740.209,263.86,TURKCELL,Apple,-0.417275,2.024433,0.102952,...,1.806486,-3.477517,-2.064966,1.499805,1.284697,0.189269,-1.563224,-1.901654,3,35
2,4d02ea175f6581f0c6385311f,1,4340.702,2742.163,318.7,TURKCELL,samsung,-2.943294,2.769536,0.734942,...,1.75908,-2.038839,-2.067219,2.141083,0.055355,0.084739,-1.009925,-2.058473,7,50
3,3412d27a86c286ba078fa935c,1,4129.666,181.397,155.423,TURK TELEKOM,Apple,-2.346902,2.684752,0.168206,...,2.171847,-0.92504,-1.484278,0.666036,0.911519,0.616167,0.092304,-1.874706,22,47
4,0203b561f6f7e10eafa46eefa,1,3903.944,126.133,100.06,TURKCELL,POCO,-1.745354,2.355863,0.318961,...,-0.373413,-0.015773,-2.961445,1.301413,1.37509,-0.107355,0.92439,-1.606419,29,52


In [34]:
from sklearn import multioutput

In [39]:
from sklearn import ensemble
# .GradientBoostingRegressor

In [41]:
X = new_data_k.drop("target", axis=1)
y = new_data_k["target"]

In [46]:
numpy_dizi = y.to_numpy()

In [53]:
numpy_dizi = [np.array(row) for row in numpy_dizi]

In [57]:
X = X.drop(["id", "carrier", "devicebrand"], axis=1)

In [58]:
reg = multioutput.MultiOutputRegressor(ensemble.GradientBoostingRegressor())
reg.fit(X, numpy_dizi)

y_pred = reg.predict(test)

KeyboardInterrupt: 

In [20]:
new_data_k.dtypes

id              object
month            int64
n_seconds_1    float64
n_seconds_2    float64
n_seconds_3    float64
carrier         object
devicebrand     object
feature_0      float64
feature_1      float64
feature_2      float64
feature_3      float64
feature_4      float64
feature_5      float64
feature_6      float64
feature_7      float64
feature_8      float64
feature_9      float64
feature_10     float64
feature_11     float64
feature_12     float64
feature_13     float64
feature_14     float64
feature_15     float64
feature_16     float64
feature_17     float64
feature_18     float64
feature_19     float64
feature_20     float64
feature_21     float64
feature_22     float64
feature_23     float64
feature_24     float64
feature_25     float64
feature_26     float64
feature_27     float64
feature_28     float64
feature_29     float64
feature_30     float64
feature_31     float64
feature_32     float64
feature_33     float64
feature_34     float64
feature_35     float64
feature_36 

In [None]:
new_data["first_menu"]

In [120]:
secilen_sutunlar = new_data[['first_menu', 'first_menu', 'third_menu']]

# Seçilen sütunları toplu olarak işleyerek yeni bir sütun oluşturun
new_data['target'] = secilen_sutunlar.apply(lambda row: ', '.join(map(str, row)), axis=1)

In [122]:
new_data.drop("target", inplace=True, axis=1)

In [123]:
new_data.head()

Unnamed: 0,id,month,n_seconds_1,n_seconds_2,n_seconds_3,carrier,devicebrand,feature_0,feature_1,feature_2,...,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,first_menu,second_menu,third_menu
0,5beefd4d2bf4a4767e0df8108,10,5245.571,981.182,205.948,VODAFONE TR,Apple,-1.197737,1.11336,-1.123334,...,2.645719,-1.023478,1.658986,-1.559406,-2.161336,30,58,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1, 0, 0, 0, 0]"
1,867285b116c063d5a8482f5be,10,5184.876,557.65,487.587,TURKCELL,samsung,-2.336352,2.567766,-0.494908,...,-0.021547,-0.19577,2.775513,-0.31898,-4.291473,21,45,"[0, 0, 0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0]"
2,c82a7cbd2e00d9b66c06bcadc,10,3835.618,3275.128,43.806,TURK TELEKOM,Redmi,-2.561455,2.061736,-0.184511,...,1.673868,0.63179,1.293131,-2.230909,-2.383524,19,61,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0]"
3,f2d2b25073ccc298eced86897,10,3532.544,154.509,64.724,TURKCELL,samsung,-2.529918,3.35805,-0.851366,...,1.863495,0.21317,1.02971,-1.142185,-4.466191,2,41,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0]"
4,7818c92a58af0f2cb7c361738,10,3344.192,787.896,715.115,VODAFONE TR,samsung,-2.922361,2.096124,0.060796,...,1.024499,-0.186423,-0.061626,-1.462175,-2.371206,23,85,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0]"


In [22]:
new_data["first_menu"] = transformed_data_first
new_data["first_menu"] = transformed_data_second
new_data["third_menu"] = transformed_data_third

In [23]:
new_data["target"] = (new_data["first_menu"] | new_data["second_menu"] | new_data["third_menu"])

In [24]:
new_data.head()

Unnamed: 0,id,month,n_seconds_1,n_seconds_2,n_seconds_3,carrier,devicebrand,feature_0,feature_1,feature_2,...,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,first_menu,second_menu,third_menu,target
0,5beefd4d2bf4a4767e0df8108,10,5245.571,981.182,205.948,VODAFONE TR,Apple,-1.197737,1.11336,-1.123334,...,-1.023478,1.658986,-1.559406,-2.161336,30,58,0,0,0,0
1,867285b116c063d5a8482f5be,10,5184.876,557.65,487.587,TURKCELL,samsung,-2.336352,2.567766,-0.494908,...,-0.19577,2.775513,-0.31898,-4.291473,21,45,0,0,0,0
2,c82a7cbd2e00d9b66c06bcadc,10,3835.618,3275.128,43.806,TURK TELEKOM,Redmi,-2.561455,2.061736,-0.184511,...,0.63179,1.293131,-2.230909,-2.383524,19,61,0,0,0,0
3,f2d2b25073ccc298eced86897,10,3532.544,154.509,64.724,TURKCELL,samsung,-2.529918,3.35805,-0.851366,...,0.21317,1.02971,-1.142185,-4.466191,2,41,0,0,1,1
4,7818c92a58af0f2cb7c361738,10,3344.192,787.896,715.115,VODAFONE TR,samsung,-2.922361,2.096124,0.060796,...,-0.186423,-0.061626,-1.462175,-2.371206,23,85,0,0,0,0


In [None]:
num_cols

In [24]:
cat_but_car

['id', 'carrier', 'devicebrand', 'target']

In [25]:
data["id"].nunique()

80478

In [26]:
data["devicebrand"].nunique()

64

In [7]:
data["devicebrand"].unique()

array(['Apple', 'samsung', 'Redmi', 'HUAWEI', 'POCO', 'OPPO', 'Sony',
       'vivo', 'asus', 'gm', 'Alcatel', 'xiaomi', 'reeder', 'HONOR',
       'OMIX', 'Lenovo', 'realme', 'Meizu', 'TCL', 'GM', 'Vestel',
       'TECNO', 'lge', 'Casper', 'CASPER', 'motorola', 'OnePlus', 'htc',
       'Turk_Telekom', 'Nokia', 'Ulefone', 'google', 'HIKING', 'Reeder',
       'Trident', 'ZTE', 'generalmobile', 'blackberry', 'Elephone',
       'Hytera', 'Blackview', 'DOOGEE', 'SuperD', 'OUKITEL', 'Infinix',
       'Vodafone', 'Realme', 'UMIDIGI', 'nubia', 'Huawei', 'meizu',
       'Nothing', 'DIJITSU', 'HTC', 'HiKING', 'TURKCELL', 'Fairphone',
       'iBRIT', 'KAAN', 'Cat', 'HOMETECH', 'WIKO', 'Gigaset',
       'blackshark'], dtype=object)

In [27]:
data["target"].nunique()

112