In [None]:
from sklearn import *
import numpy as np
import pandas as pd
import random
import glob

# Root of the competition data. It already ends with a slash, so the glob
# patterns built below contain a double slash.
sdir = '/kaggle/input/AI4Code/'

# Paths matched under train/ and test/. glob is called without recursive=True,
# so '**' matches like a single '*'.
train = glob.glob(f'{sdir}/train/**')  # original author's note: 139256
test = glob.glob(f'{sdir}/test/**')  # original author's note: 4 / 20K code comp

# Sample submission; original author's note on its columns: id, cell_order
sub = pd.read_csv(f'{sdir}sample_submission.csv')

In [None]:
%%time
def getDF(files):
    """Load notebook files into one long DataFrame with one row per cell.

    Every file must contain a mapping with 'cell_type' and 'source' entries,
    both keyed by cell id.

    Parameters
    ----------
    files : iterable of str
        Paths of the notebook files to read.

    Returns
    -------
    pandas.DataFrame
        Columns 'cell_id', 'rank' (position of the cell inside its file),
        'cell_type', 'source' and 'id' (file name up to its first dot).
        Per-file indexes are kept, so index values repeat across notebooks.
        An empty frame with the same columns is returned when `files` is empty.
    """
    import ast
    import json
    import os

    columns = ['cell_id', 'rank', 'cell_type', 'source']
    dfs = []
    for f in files:
        # Context manager so the handle is closed even if parsing fails.
        with open(f, 'r', encoding='utf-8') as fh:
            text = fh.read()
        # The previous eval() executed whatever the file contained and broke on
        # the JSON literals null/true/false. Parse as JSON first and fall back
        # to a safe literal parser for Python-literal dicts.
        try:
            d = json.loads(text)
        except ValueError:
            d = ast.literal_eval(text)
        rows = [[c, r, d['cell_type'][c], d['source'][c]]
                for r, c in enumerate(d['cell_type'])]
        df = pd.DataFrame(rows, columns=columns)
        df['id'] = os.path.basename(f).split('.')[0]
        dfs.append(df)
    if not dfs:
        # pd.concat raises on an empty list; hand back an empty, typed-by-name frame.
        return pd.DataFrame(columns=columns + ['id'])
    return pd.concat(dfs)

# Replace the two lists of file paths with the parsed per-cell tables
# (the names `train` and `test` are reused for the DataFrames).
train, test = getDF(train), getDF(test)
print(train.shape, test.shape)

In [None]:
def cleanstr(s):
    """Turn a cell's source text into a single line of space-separated tokens.

    Line breaks, tabs, HTML ``<br/>`` tags and a handful of punctuation marks
    are replaced by spaces, then runs of spaces are collapsed to one space.

    The original patterns only matched the escaped two-character forms
    (a backslash followed by ``n``, ``t`` or ``/``). Real newline,
    carriage-return and tab characters and an unescaped ``<br/>`` are now
    treated the same way, so words on different lines no longer end up glued
    into one token.

    Parameters
    ----------
    s : str
        Raw cell source.

    Returns
    -------
    str
        The cleaned text. Leading/trailing single spaces are not stripped.
    """
    # '<br\/>' has to be handled before the bare '\/' pattern that it contains.
    separators = (
        '<br\\/>', '<br/>',      # HTML line break, escaped and plain
        '\\n', '\\/', '\\t',     # escaped two-character sequences
        '\r', '\n', '\t',        # real whitespace characters
        '#', '*', ',', '.', '](',
    )
    for sep in separators:
        s = s.replace(sep, ' ')
    # A single replace('  ', ' ') only halves a run of spaces; repeat until
    # none is left so splitting on ' ' does not yield empty tokens mid-text.
    while '  ' in s:
        s = s.replace('  ', ' ')
    return s

def _with_relative_position(cells):
    """Add per-notebook `total` and the relative position `target` to `cells`.

    `total` is the `rank` of the last row of each notebook `id`; `target` is
    `rank / total`. A notebook with a single cell has total == 0, so its
    target is 0/0 = NaN.
    """
    last_rows = cells.drop_duplicates(subset=['id'], keep='last')
    totals = last_rows[['id', 'rank']].rename(columns={'rank': 'total'})
    merged = pd.merge(cells, totals, how='left', on='id')
    merged['target'] = merged['rank'] / merged['total']
    return merged


print(train['cell_type'].value_counts())
train = _with_relative_position(train)
# Only markdown cells are used for training. The explicit copy makes the
# column assignment below write to an independent frame instead of a filtered
# selection (avoids SettingWithCopyWarning).
train = train[train.cell_type == 'markdown'].copy()
train['source'] = train['source'].map(cleanstr)

print(test['cell_type'].value_counts())
test = _with_relative_position(test)
# Applied to every test row, so code cells' source is rewritten too; the
# original author marked this as acceptable.
test['source'] = test['source'].map(cleanstr)

In [None]:
%%time

def fdeffec(c, s, exp=None):
    """Build per-word statistics of the values in `s` for the texts in `c`.

    Each text is lower-cased and split on single spaces; every distinct word
    of a text is updated once with that text's value ``s[i]``.

    Per word the table stores:
      * 'b' - the last 10 values seen for the word (zero-padded on the left),
      * 'c' - number of texts containing the word, finally divided by
              ``len(c) * 0.5``,
      * 'd' - a 10-slot history of a smoothed value, each new slot being
              ``(previous_slot + value * 0.9) / 2``,
      * 'e' - sum of the values, finally divided by ``count + 1e-10``
              (i.e. approximately the mean value).

    Parameters
    ----------
    c : sequence
        Texts; each element is converted with ``str``.
    s : sequence of numbers
        One value per text, indexed in parallel with `c`.
    exp : dict, optional
        Existing table to update in place. A new dict is created when
        omitted. Note that the final normalisation runs over every entry of
        `exp`, including entries carried over from an earlier call.

    Returns
    -------
    dict
        The word table (the same object as `exp` when one was passed).
        With empty `c` the table is returned unchanged.
    """
    window = 10         # length of the 'b' and 'd' histories
    count_scale = 0.5   # 'c' is expressed relative to len(c) * count_scale
    decay = 0.9         # weight of the new value in the 'd' smoothing
    eps = 1e-10         # keeps the 'e' division away from zero

    # A literal {} default would be shared by every call, so a second call
    # (e.g. re-running the notebook cell) would continue from stale,
    # already-normalised entries.
    if exp is None:
        exp = {}
    if len(c) == 0:
        # Nothing to add; also avoids dividing counts by len(c) * scale == 0.
        return exp

    for i in range(len(c)):
        value = s[i]
        for w in set(str(c[i]).lower().split(' ')):
            # Explicit membership test instead of a bare except, which would
            # also have swallowed unrelated errors.
            if w in exp:
                stats = exp[w]
                stats['b'] = stats['b'][1:] + [value]
                stats['c'] += 1
                stats['d'] = stats['d'][1:] + [(stats['d'][window - 1] + (value * decay)) / 2]
                stats['e'] += value
            else:
                blank = [0. for _ in range(window)]
                exp[w] = {
                    'b': blank[1:] + [value],
                    'c': 1,
                    'd': blank[1:] + [value * decay / 2],
                    'e': value,
                }

    for w in exp:
        # 'e' must be normalised first: it needs the raw count in 'c'.
        exp[w]['e'] /= exp[w]['c'] + eps
        exp[w]['c'] /= len(c) * count_scale

    return exp

# Word statistics built from the (markdown-only) training rows.
# A fresh dict is passed explicitly so that every run of this cell starts from
# an empty table, independent of how fdeffec handles its default `exp`.
exp = fdeffec(train['source'].values, train['target'].values, {})
# Disabled experiment kept from the original notebook:
#exp = fdeffec(test[test.cell_type=='markdown']['source'].values, np.zeros(len(test[test.cell_type=='markdown'])), exp)

In [None]:
%%time
def features(df):
    """Return a copy of `df` with length and word-statistic feature columns.

    Reads the module-level word table `exp` (word -> dict with keys
    'b', 'c', 'd', 'e').

    Added columns:
      * 'len'    - number of characters in 'source',
      * 'wlen'   - number of space-separated tokens,
      * 'b_mean' - mean over tokens of the mean of the word's 'b' history
                   (0 for unknown words),
      * 'c_mean' - mean over tokens of the word's 'c' value (0 if unknown),
      * 'd_mean' - mean over tokens of the mean of the word's 'd' history
                   (0 if unknown),
      * 'e_mean' - mean over tokens of the word's 'e' value (0.5 if unknown).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'source' column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched.
    """
    # Work on a copy: callers pass filtered selections, and writing columns
    # into those triggers SettingWithCopyWarning and mutates the caller's data.
    df = df.copy()

    # Tokenise once instead of once per feature column.
    tokens = df['source'].map(lambda x: str(x).lower().split(' '))

    def _token_mean(words, key, unknown):
        # np.mean of a scalar is the scalar itself, so one code path serves
        # both the list-valued ('b', 'd') and the scalar ('c', 'e') entries.
        return np.mean([np.mean(exp[w][key]) if w in exp else unknown for w in words])

    df['len'] = df['source'].map(len)
    # Lower-casing never adds or removes spaces, so the token count is the
    # same as for the raw text.
    df['wlen'] = tokens.map(len)

    df['b_mean'] = tokens.map(lambda ws: _token_mean(ws, 'b', 0))
    df['c_mean'] = tokens.map(lambda ws: _token_mean(ws, 'c', 0))
    df['d_mean'] = tokens.map(lambda ws: _token_mean(ws, 'd', 0))
    df['e_mean'] = tokens.map(lambda ws: _token_mean(ws, 'e', 0.5))
    return df

train = features(train)

# Features are computed for the markdown rows of test only; the code rows are
# appended afterwards without feature columns (they show up as NaN there).
test_markdown = features(test[test.cell_type == 'markdown'])
test_code = test[test.cell_type == 'code']
test = pd.concat((test_markdown, test_code))

In [None]:
# Model inputs: every column of train that is not an identifier, the raw text
# or the label itself.
col = [c for c in train if c not in ['cell_id', 'rank', 'cell_type', 'source', 'id', 'target']]

# 80/20 hold-out split, stratified on the notebook's `total` column.
# NOTE(review): the word statistics in `exp` were computed from the targets of
# all rows of `train`, including the rows held out here, so the score printed
# below is likely optimistic.
x1, x2, y1, y2 = model_selection.train_test_split(
    train[col], train.target,
    test_size=0.2, random_state=2, stratify=train[['total']], shuffle=True,
)

model = ensemble.ExtraTreesRegressor(n_estimators=2000, max_depth=14, n_jobs=-1, random_state=2)
model.fit(x1, y1)
preds = model.predict(x2)
score = metrics.r2_score(y2, preds)
print(score)

# Refit on all training rows before predicting the test set.
model.fit(train[col], train.target)

# Predictions are produced for every test row; code rows have NaN features,
# hence the fillna(0). Their predictions are replaced further down.
test['target'] = model.predict(test.fillna(0)[col])

# Number of code cells per notebook.
df = test[test.cell_type == 'code']
df = df.groupby(by=['id'])['rank'].count().reset_index().rename(columns={'rank': 'code_count'})
test = pd.merge(test, df, how='left', on='id')

# Markdown rows keep the model's prediction; all other rows get
# rank / code_count. Vectorised replacement for the former row-wise
# apply(axis=1), with the same result.
is_markdown = test['cell_type'] == 'markdown'
test['target'] = test['target'].where(is_markdown, test['rank'] / test['code_count'])

In [None]:
# One submission row per notebook: its cell ids joined by spaces, ordered by
# ascending `target`.
# A single groupby pass replaces re-filtering the whole frame once per
# notebook id. sort=False keeps notebooks in order of first appearance, which
# is the order test['id'].unique() produced before.
rows = []
for nb_id, cells in test.groupby('id', sort=False):
    ordered_ids = cells.sort_values('target')['cell_id']
    rows.append([nb_id, ' '.join(ordered_ids)])

sub = pd.DataFrame(rows, columns=['id', 'cell_order'])
sub.to_csv('submission.csv', index=False)