In [184]:
import pandas as pd
import numpy
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

pd.options.display.max_columns = 9999
pd.options.display.max_rows = 9999

## Read data

In [213]:
wals = pd.read_csv('data/language.csv')

In [214]:
# Filter columns
wals = wals.drop(['iso_code', 'glottocode', 'Name', 'latitude', 'longitude', 'genus', 'family', 'macroarea', 'countrycodes'], axis=1)

In [215]:
# Filter features < 10
count_features = wals.shape[0] - wals.isnull().sum()
count_features = count_features[count_features > 10]

In [216]:
wals = wals[count_features.index]

In [218]:
# Filter registers < 10
count_langs = wals.shape[1] - wals.isnull().sum(axis=1)
count_langs = count_langs[count_langs > 10]

In [219]:
wals = wals.iloc[count_langs.index]

In [221]:
def change_row(row):
    row_wo_nan = row.drop('nan')
    if row['nan']:
        return pd.Series(np.array([np.nan for i in range(len(row_wo_nan))]), index=row_wo_nan.index)
    else:
        return row_wo_nan


In [222]:
for col in wals.drop('wals_code', axis=1):
    number_levels = len(wals[col].value_counts())
    print(f'name col: {col} with {number_levels} levels')
    if number_levels > 2:
        wals[col] = wals[col].astype(str)
        wals[col] = wals[col].fillna('nan')
        wals_dummies = pd.get_dummies(wals[col])
        wals_dummies = wals_dummies.apply(lambda row: change_row(row), axis=1)
        wals_dummies.columns = col + ' ' + wals_dummies.columns

        wals = wals.drop(col, axis=1)
        wals = pd.concat((wals, wals_dummies), axis=1)
    else:
        wals[col] = wals[col].replace({c:ix for ix, c in enumerate(wals[col].value_counts().index)})
        wals[col] = wals[col].replace('nan', np.nan)


name col: 1A Consonant Inventories with 5 levels
name col: 2A Vowel Quality Inventories with 3 levels
name col: 3A Consonant-Vowel Ratio with 5 levels
name col: 4A Voicing in Plosives and Fricatives with 4 levels
name col: 5A Voicing and Gaps in Plosive Systems with 5 levels
name col: 6A Uvular Consonants with 4 levels
name col: 7A Glottalized Consonants with 8 levels
name col: 8A Lateral Consonants with 5 levels
name col: 9A The Velar Nasal with 3 levels
name col: 10A Vowel Nasalization with 2 levels
name col: 11A Front Rounded Vowels with 4 levels
name col: 12A Syllable Structure with 3 levels
name col: 13A Tone with 3 levels
name col: 14A Fixed Stress Locations with 6 levels
name col: 15A Weight-Sensitive Stress with 8 levels
name col: 16A Weight Factors in Weight-Sensitive Stress Systems with 7 levels
name col: 17A Rhythm Types with 5 levels
name col: 18A Absence of Common Consonants with 6 levels
name col: 19A Presence of Uncommon Consonants with 7 levels
name col: 20A Fusion of S

name col: 143F Postverbal Negative Morphemes with 4 levels
name col: 90B Prenominal relative clauses with 5 levels
name col: 144Y The Position of Negative Morphemes in Object-Initial Languages with 11 levels
name col: 90C Postnominal relative clauses with 4 levels
name col: 144P NegSOV Order with 4 levels
name col: 144J SVNegO Order with 7 levels
name col: 144N Obligatory Double Negation in SOV languages with 18 levels
name col: 144S SOVNeg Order with 11 levels
name col: 144X Verb-Initial with Clause-Final Negative with 4 levels
name col: 144A Position of Negative Word With Respect to Subject, Object, and Verb with 21 levels
name col: 90E Correlative relative clauses with 7 levels
name col: 144V Verb-Initial with Preverbal Negative with 9 levels
name col: 144I SNegVO Order with 8 levels
name col: 144R SONegV Order with 8 levels
name col: 143B Obligatory Double Negation with 16 levels
name col: 144M Multiple Negative Constructions in SOV Languages with 27 levels
name col: 144U Double ne

In [237]:
wals_ratings = pd.melt(wals, id_vars=['wals_code'], var_name='col_name', value_name='feature')
wals_ratings = wals_ratings[wals_ratings.feature.notnull()]

In [244]:
from surprise import Reader

reader = Reader(rating_scale=(0, 1))

data = Dataset.load_from_df(wals_ratings, reader)

algo = SVD()

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2926  0.2937  0.2941  0.2934  0.2942  0.2936  0.0006  
MAE (testset)     0.1822  0.1825  0.1835  0.1823  0.1826  0.1826  0.0004  
Fit time          14.52   14.05   14.06   14.04   13.99   14.13   0.20    
Test time         0.69    0.67    0.67    0.54    0.67    0.65    0.05    


{'test_rmse': array([0.29258349, 0.29365539, 0.29414687, 0.29341671, 0.29419403]),
 'test_mae': array([0.18224849, 0.18247465, 0.18346782, 0.18225784, 0.18259204]),
 'fit_time': (14.522541999816895,
  14.052666425704956,
  14.063578844070435,
  14.044833421707153,
  13.988182544708252),
 'test_time': (0.685605525970459,
  0.667564868927002,
  0.6748476028442383,
  0.5437302589416504,
  0.6700916290283203)}

In [261]:
from sklearn.model_selection import train_test_split
trainset, testset = train_test_split(wals_ratings, test_size=0.25)

In [262]:
reader = Reader(rating_scale=(0, 1))

trainset = Dataset.load_from_df(trainset, reader).build_full_trainset()
#testset = Dataset.load_from_df(testset, reader).build_full_trainset()

In [263]:
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f4f86955e48>

In [281]:
predictions = []
for ix, row in testset.iterrows():
    predictions.append(algo.predict(row.wals_code, row.col_name, r_ui=0.5).est)

In [280]:
predictions

[0.04792653803050184,
 0.12984608591372337,
 0.027774090483966507,
 0.053771625350025776,
 0.5374172672546305,
 0,
 0.12234333356061922,
 0.23336042763778608,
 0.3089421315748955,
 0.570533263661142,
 0.057750886063059115,
 0.5775732544009067,
 0.15841565555872905,
 0.04099447190016514,
 0.460816264665876,
 0.035792596003010324,
 0.04604818678437561,
 0.9106004965566812,
 0.1198742977553553,
 0.23067924054499644,
 0.014011775529497505,
 0.40149630719010876,
 0,
 0.13274678950357988,
 0.183976428649834,
 0,
 0.4259359181739706,
 0.3104175269707341,
 0.009150410924656072,
 0.04635581727216276,
 0.09145859081418717,
 0.199811515248411,
 0.008495820049066793,
 0.059454406755066636,
 0.2817926033221114,
 0.25842005454242045,
 0.5657688670998307,
 0.04292124503811534,
 0.18781040763785972,
 0.20108596082931696,
 0.1682447947397705,
 0,
 0.463213149827834,
 0.02842588083892128,
 0.7638795822092463,
 0.26484768394419933,
 0.12324214986311555,
 0.1522189722173446,
 0.20471820943345453,
 0.07964

In [282]:
predictions

[0.04792653803050184,
 0.12984608591372337,
 0.027774090483966507,
 0.053771625350025776,
 0.5374172672546305,
 0,
 0.12234333356061922,
 0.23336042763778608,
 0.3089421315748955,
 0.570533263661142,
 0.057750886063059115,
 0.5775732544009067,
 0.15841565555872905,
 0.04099447190016514,
 0.460816264665876,
 0.035792596003010324,
 0.04604818678437561,
 0.9106004965566812,
 0.1198742977553553,
 0.23067924054499644,
 0.014011775529497505,
 0.40149630719010876,
 0,
 0.13274678950357988,
 0.183976428649834,
 0,
 0.4259359181739706,
 0.3104175269707341,
 0.009150410924656072,
 0.04635581727216276,
 0.09145859081418717,
 0.199811515248411,
 0.008495820049066793,
 0.059454406755066636,
 0.2817926033221114,
 0.25842005454242045,
 0.5657688670998307,
 0.04292124503811534,
 0.18781040763785972,
 0.20108596082931696,
 0.1682447947397705,
 0,
 0.463213149827834,
 0.02842588083892128,
 0.7638795822092463,
 0.26484768394419933,
 0.12324214986311555,
 0.1522189722173446,
 0.20471820943345453,
 0.07964