In [1]:
import numpy as np
import pandas as pd

In [2]:
X = pd.read_csv('data/orange_small_churn_data.train.csv')
y = pd.read_csv('data/orange_small_churn_labels.train.csv', header=None)
X.shape

(40000, 230)

In [3]:
y = y[0].ravel()
y.reshape((40000,))
y

array([-1, -1, -1, ..., -1, -1, -1])

In [4]:
X_dropped = X.dropna(axis=1, thresh=20_000, how="any") 

In [5]:
print(X.shape[1] - X_dropped.shape[1], 'columns were dropped')

161 columns were dropped


In [6]:
cat_cols = X_dropped.select_dtypes(exclude=["number","bool_"]).columns
num_cols = list(set(X_dropped.columns) - set(cat_cols))

In [7]:
X_cat_dummy = pd.get_dummies(X_dropped[cat_cols], sparse=False, dummy_na=True)
X_cat_dummy.shape #слишком большая размерность у one-hot энкодера, будем использовать другой способ кодирования

(40000, 36909)

In [8]:
X_dropped[cat_cols] = X_dropped[cat_cols].astype(str)
X_dropped[cat_cols] = X_dropped[cat_cols].astype('category')
X_cat_code = X_dropped[cat_cols].apply(lambda x: x.cat.codes)
X_cat_code.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


(40000, 28)

In [9]:
from sklearn.preprocessing import StandardScaler


X_scaled = pd.DataFrame(StandardScaler().fit_transform(X[num_cols]), columns=num_cols)
X_scaled.fillna(0, inplace=True)
X_scaled.apply(np.float32);

In [10]:
X_processed = pd.concat([X_scaled, X_cat_code], axis=1)
X_processed.shape

(40000, 69)

In [11]:
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score


print(cross_val_score(RidgeClassifier(), X_processed, y, scoring='roc_auc').mean())
print(cross_val_score(RandomForestClassifier(), X_processed, y, scoring='roc_auc').mean())
print(cross_val_score(GradientBoostingClassifier(), X_processed, y, scoring='roc_auc').mean())

0.6393346120786225
0.6724877434127284
0.7299076371034927


In [12]:
print(cross_val_score(RidgeClassifier(), X_processed, y, scoring='f1_macro').mean())
print(cross_val_score(RandomForestClassifier(), X_processed, y, scoring='f1_macro').mean())
print(cross_val_score(GradientBoostingClassifier(), X_processed, y, scoring='f1_macro').mean())

0.4806813457237973
0.4823802936684614
0.4944593113408584
