In [1]:
import pandas as pd
import numpy as np
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from tqdm import tqdm

pd.options.display.max_columns = 9999
pd.options.display.max_rows = 9999

## Read data

In [2]:
wals = pd.read_csv('data/wals_language_withstatus.csv')

## Preprocessing

In [3]:
# Filter columns
wals = wals.drop(['iso_code', 'glottocode', 'Name', 'latitude', 'longitude', 'genus', 'family', 'macroarea', 'countrycodes'], axis=1)

In [4]:
# Filter features < 10
count_features = wals.shape[0] - wals.isnull().sum()
count_features = count_features[count_features > 10]

In [5]:
wals = wals[count_features.index]

In [6]:
# Filter registers < 10
#count_langs = wals.shape[1] - wals.isnull().sum(axis=1)
#count_langs = count_langs[count_langs > 10]

In [7]:
#wals = wals.iloc[count_langs.index].reset_index(drop=True)

In [8]:
wals_target = wals['Status from Glotto']
wals = wals.drop('Status from Glotto', axis=1)

In [9]:
def change_row(row):
    row_wo_nan = row.drop('nan')
    if row['nan']:
        return pd.Series(np.array([np.nan for i in range(len(row_wo_nan))]), index=row_wo_nan.index)
    else:
        return row_wo_nan


In [None]:
cols_null = {}
for col in tqdm(wals.drop('wals_code', axis=1).columns):
    cols_null[col] = len(wals[col].value_counts())
    if cols_null[col] > 2:
        wals[col] = wals[col].astype(str)
        wals[col] = wals[col].fillna('nan')
        wals_dummies = pd.get_dummies(wals[col])
        wals_dummies = wals_dummies.apply(lambda row: change_row(row), axis=1)
        wals_dummies.columns = col + wals_dummies.columns

        wals = wals.drop(col, axis=1)
        wals = pd.concat((wals, wals_dummies), axis=1)
    else:
        wals[col] = wals[col].replace({c:ix for ix, c in enumerate(wals[col].value_counts().index)})
        wals[col] = wals[col].replace('nan', np.nan)
wals.fillna(-1)

100%|██████████| 188/188 [03:24<00:00,  1.12s/it]


In [None]:
wals.fillna(-1, inplace=True)
wals = wals.loc[wals.wals_code != -1]

In [None]:
wals_melt = pd.melt(wals, id_vars=['wals_code'], var_name='col_name', value_name='feature')
wals_ratings = wals_melt[wals_melt.feature != -1]
wals_ratings_null = wals_melt[wals_melt.feature == -1]

In [93]:
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(wals_ratings, reader)
algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2964  0.2953  0.2922  0.2947  0.2954  0.2948  0.0014  
MAE (testset)     0.1847  0.1836  0.1822  0.1832  0.1843  0.1836  0.0009  
Fit time          13.67   13.86   13.76   13.68   13.68   13.73   0.07    
Test time         0.76    0.56    0.74    0.56    0.57    0.64    0.09    


{'test_rmse': array([0.29637894, 0.2952556 , 0.29216696, 0.29470912, 0.29538527]),
 'test_mae': array([0.1847497 , 0.18356322, 0.18223561, 0.18316787, 0.1842871 ]),
 'fit_time': (13.6653470993042,
  13.863105297088623,
  13.764633178710938,
  13.681365966796875,
  13.679088115692139),
 'test_time': (0.7569980621337891,
  0.5628390312194824,
  0.743659257888794,
  0.5637426376342773,
  0.5658504962921143)}

In [15]:
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(wals_ratings, reader)
algo = SVD(n_factors=200)
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2910  0.2926  0.2933  0.2935  0.2923  0.2925  0.0009  
MAE (testset)     0.1820  0.1829  0.1834  0.1837  0.1821  0.1828  0.0007  
Fit time          23.83   23.98   23.98   24.06   24.04   23.98   0.08    
Test time         0.54    0.66    0.66    0.54    0.66    0.61    0.06    


{'test_rmse': array([0.29098244, 0.29256216, 0.29332159, 0.29353345, 0.29228499]),
 'test_mae': array([0.1819853 , 0.18285044, 0.1833571 , 0.18366228, 0.1821244 ]),
 'fit_time': (23.828102827072144,
  23.978682041168213,
  23.980900764465332,
  24.05958580970764,
  24.037503480911255),
 'test_time': (0.5368878841400146,
  0.6589217185974121,
  0.6553318500518799,
  0.5382833480834961,
  0.6560063362121582)}

In [16]:
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(wals_ratings, reader)
algo = SVD(n_factors=300)
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2903  0.2906  0.2914  0.2900  0.2913  0.2907  0.0005  
MAE (testset)     0.1810  0.1815  0.1824  0.1817  0.1819  0.1817  0.0005  
Fit time          38.41   38.77   38.67   39.56   38.67   38.82   0.39    
Test time         0.55    0.67    0.68    0.55    0.68    0.63    0.06    


{'test_rmse': array([0.29031376, 0.29056225, 0.2913779 , 0.29001904, 0.29127553]),
 'test_mae': array([0.18099809, 0.18145941, 0.18236461, 0.18169389, 0.18189709]),
 'fit_time': (38.40881538391113,
  38.77242612838745,
  38.66944980621338,
  39.56117534637451,
  38.671761989593506),
 'test_time': (0.5510618686676025,
  0.669342041015625,
  0.6794090270996094,
  0.5486891269683838,
  0.676823616027832)}

In [17]:
reader = Reader(rating_scale=(0, 1))

trainset = Dataset.load_from_df(wals_ratings, reader).build_full_trainset()
#testset = Dataset.load_from_df(testset, reader).build_full_trainset()

In [19]:
algo = SVD(n_factors=300)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f3c7230bcc0>

In [20]:
predictions = []
for ix, row in wals_ratings_null.iterrows():
    predictions.append(algo.predict(row.wals_code, row.col_name).est)

In [21]:
wals_ratings_null['feature'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [22]:
temp = pd.pivot_table(pd.concat((wals_ratings_null, wals_ratings), axis=0), values='feature', index=['wals_code'],
                         columns=['col_name'], aggfunc=np.sum)
temp = temp.reset_index()

In [23]:
wals_final = pd.DataFrame()
wals_final['wals_code'] = temp['wals_code']
for col in list(cols_null.keys()):
    group_col = [temp_col for temp_col in temp.columns if temp_col.startswith(col)]
    wals_final[col] = temp[group_col].idxmax(axis=1).str.replace(col, '')

In [24]:
wals_final = pd.concat((wals_final, wals_target), axis=1)
wals_final = wals_final[wals_final['wals_code'].notnull()].reset_index(drop=True)

In [25]:
wals_final.to_csv('wals_without_null.csv', index=False)

In [None]:
wals_final['Status from Glotto'].value_counts()

In [None]:
wals_final = wals_final.replace({'safe': 0,
                                'definitely endangered': 1,
                                'vulnerable': 1,
                                'critically endangered': 1,
                                'severely endangered': 1,
                                'extinct': 1})

## Model

In [117]:
train = wals_final.drop('wals_code', axis=1)

In [129]:
wals = wals.replace({'safe': 0,
                                'definitely endangered': 1,
                                'vulnerable': 1,
                                'critically endangered': 1,
                                'severely endangered': 1,
                                'extinct': 1})

In [130]:
train = wals.drop('wals_code', axis=1)

In [131]:
from sklearn import preprocessing

cat_cols = [col for col in train.select_dtypes(include=['object']).columns]
for col in cat_cols:
    train[col] = train[col].fillna(train[col].mode()[0])
    le = preprocessing.LabelEncoder()
    le.fit(train[col])
    train[col] = le.transform(train[col]) 

In [97]:
def print_results(trn_aucs, val_aucs):
    train_auc, train_intconf = np.mean(trn_aucs), 2 * np.std(trn_aucs)
    val_auc, val_intconf = np.mean(val_aucs), 2 * np.std(val_aucs)
    
    train_gini, val_gini = (train_auc - 0.5) * 2, (val_auc - 0.5) * 2
    train_gini_intconf, val_gini_intconf = train_intconf * 2, val_intconf * 2
    
    print(f'Train AUC: {100*train_auc:.2f} +/- {100*train_intconf:.2f} | '
          f'Val AUC: {100*val_auc:.2f} +/- {100*val_intconf:.2f} | '
          f'Train Gini: {100*train_gini:.2f} +/- {100*train_gini_intconf:.2f} | '
          f'Val Gini: {100*val_gini:.2f} +/- {100*val_gini_intconf:.2f}')  
    return np.mean(trn_aucs), np.mean(val_aucs)

In [108]:
from sklearn.metrics import roc_auc_score

def test_lgbm(lgbm, X, y, kfolds, cat_features, params, num_rounds=1000):
    trn_aucs, val_aucs, trn_f1, val_f1 = [], [], [], []
    y_pred = np.zeros(len(X))
    models = []
    evals_result = []
    for trn_idx, val_idx in kfolds.split(X, y):
        eval_result = {}
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
        dtrn = lgbm.Dataset(X_trn, y_trn)
        dval = lgbm.Dataset(X_val, y_val)
        bst = lgbm.train(params, dtrn, num_rounds, [dval],
                  early_stopping_rounds=30, #feval=lgb_f1_score, 
                         evals_result=eval_result,
                  verbose_eval=False)
        evals_result.append(eval_result)
        y_trn_pred = bst.predict(X_trn)
        y_val_pred = bst.predict(X_val)
        trn_aucs.append(roc_auc_score(y_trn, y_trn_pred))
        val_aucs.append(roc_auc_score(y_val, y_val_pred))
        
        print(f'No. estimators: {bst.best_iteration} | '
              f'Train AUC: {100*trn_aucs[-1]:.2f} | '
              f'Val AUC: {100*val_aucs[-1]:.2f} | '
              f'Train Gini: {(100*trn_aucs[-1]-50)*2:.2f} | '
              f'Val Gini: {(100*val_aucs[-1]-50)*2:.2f} | ')
        
        y_pred[val_idx] = y_val_pred
        
        models.append(bst)
        
    print()
    print_results(trn_aucs, val_aucs)
    print()
    return y_pred, models, evals_result

## Compare

In [110]:
%%time
from sklearn.model_selection import StratifiedKFold
import lightgbm

X_train = train.drop('Status from Glotto', axis=1)
y_train = train['Status from Glotto']
skfolds = StratifiedKFold(n_splits=8, random_state=42, shuffle=True)
params = {'objective':'binary', 'gpu_device_id': '1', 'max_depth':7}
y_pred, models, evals = test_lgbm(lightgbm, X_train, y_train, skfolds, [], params)

No. estimators: 39 | Train AUC: 95.73 | Val AUC: 69.26 | Train Gini: 91.46 | Val Gini: 38.53 | 
No. estimators: 25 | Train AUC: 92.52 | Val AUC: 70.58 | Train Gini: 85.04 | Val Gini: 41.16 | 
No. estimators: 14 | Train AUC: 90.53 | Val AUC: 58.37 | Train Gini: 81.05 | Val Gini: 16.73 | 
No. estimators: 31 | Train AUC: 93.20 | Val AUC: 68.49 | Train Gini: 86.39 | Val Gini: 36.99 | 
No. estimators: 20 | Train AUC: 91.85 | Val AUC: 63.47 | Train Gini: 83.69 | Val Gini: 26.95 | 
No. estimators: 5 | Train AUC: 82.97 | Val AUC: 60.89 | Train Gini: 65.95 | Val Gini: 21.78 | 
No. estimators: 20 | Train AUC: 92.32 | Val AUC: 65.68 | Train Gini: 84.64 | Val Gini: 31.36 | 
No. estimators: 7 | Train AUC: 84.75 | Val AUC: 58.19 | Train Gini: 69.51 | Val Gini: 16.38 | 

Train AUC: 90.48 +/- 8.17 | Val AUC: 64.37 +/- 9.18 | Train Gini: 80.97 +/- 16.34 | Val Gini: 28.73 +/- 18.35

CPU times: user 17.2 s, sys: 24 ms, total: 17.2 s
Wall time: 1.48 s


In [132]:
%%time
from sklearn.model_selection import StratifiedKFold
import lightgbm

X_train = train.drop('Status from Glotto', axis=1)
y_train = train['Status from Glotto']
skfolds = StratifiedKFold(n_splits=8, random_state=42, shuffle=True)
params = {'objective':'binary', 'gpu_device_id': '1', 'max_depth':7}
y_pred, models, evals = test_lgbm(lightgbm, X_train, y_train, skfolds, [], params)

No. estimators: 102 | Train AUC: 90.14 | Val AUC: 75.17 | Train Gini: 80.28 | Val Gini: 50.35 | 
No. estimators: 72 | Train AUC: 88.03 | Val AUC: 74.12 | Train Gini: 76.06 | Val Gini: 48.23 | 
No. estimators: 123 | Train AUC: 90.62 | Val AUC: 74.45 | Train Gini: 81.25 | Val Gini: 48.91 | 
No. estimators: 93 | Train AUC: 89.59 | Val AUC: 74.05 | Train Gini: 79.18 | Val Gini: 48.10 | 
No. estimators: 104 | Train AUC: 90.39 | Val AUC: 73.40 | Train Gini: 80.78 | Val Gini: 46.81 | 
No. estimators: 68 | Train AUC: 87.58 | Val AUC: 74.96 | Train Gini: 75.17 | Val Gini: 49.93 | 
No. estimators: 17 | Train AUC: 81.07 | Val AUC: 67.11 | Train Gini: 62.14 | Val Gini: 34.22 | 
No. estimators: 107 | Train AUC: 90.26 | Val AUC: 75.91 | Train Gini: 80.53 | Val Gini: 51.82 | 

Train AUC: 88.46 +/- 5.97 | Val AUC: 73.65 +/- 5.15 | Train Gini: 76.92 +/- 11.94 | Val Gini: 47.29 +/- 10.29

CPU times: user 20.5 s, sys: 8 ms, total: 20.5 s
Wall time: 1.77 s
