In [1]:
import re

import numpy as np
import pandas as pd
import sklearn
import sklearn.tree
from scipy import interp
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import scale

In [2]:
# Load the labelled training set. The file is tab-separated and has no header
# row, so pandas labels the columns with consecutive integers.
df = pd.read_csv(
    'PMChallenge2018/challenge1-train.txt',
    sep='\t',
    header=None,
)
print(df.shape)

(9946, 108)


In [3]:
# Preview the first five rows of the raw training frame.
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,107
0,45.437887,four,seven,seven,88,62,37,eight plus seven plus eight plus eight plus on...,34,79,...,26,81,82,88,2,92,63,78,27,18
1,6.804464,nine,two,eight,82,14,12,eight plus eight plus zero plus six plus six p...,34,57,...,31,88,6,35,36,77,91,48,22,75
2,18.096237,two,one,three,7,38,72,zero plus seven plus seven plus nine plus four...,87,75,...,40,20,57,56,54,21,94,85,56,4
3,81.498121,six,zero,one,20,88,53,four plus one plus one plus four plus nine plu...,80,20,...,97,8,99,11,94,22,0,82,71,69
4,99.978597,zero,nine,three,76,99,96,zero plus nine plus two plus nine plus four pl...,98,0,...,85,10,98,40,6,27,51,99,44,8


In [4]:
# Lookup table from the spelled-out tokens used in the raw file to the
# characters of an arithmetic expression (digits and the '+' operator).
rep_dict = {
    'one': '1',
    'two': '2',
    'three': '3',
    'four': '4',
    'five': '5',
    'six': '6',
    'seven': '7',
    'eight': '8',
    'nine': '9',
    'zero': '0',
    'plus': '+',
}

# Columns 1-3 hold one spelled-out digit per cell (see the preview above);
# map each word to its digit character and cast the column to a numeric dtype.
for col in (1, 2, 3):
    digit_chars = df[col].map(rep_dict)
    df[col] = pd.to_numeric(digit_chars)
    
def procRow(string, rep_dict):
    """Return the total of a spelled-out sum such as ``'eight plus seven'``.

    Every key of ``rep_dict`` found in ``string`` is replaced by its mapped
    value. The result must be a chain of non-negative integers joined by
    ``+``; the integers are then added up.

    The text comes straight from a data file, so it is validated and summed
    explicitly rather than handed to ``eval``, which would execute whatever
    expression the file happened to contain.

    Parameters
    ----------
    string : str or float
        The spelled-out expression. Any float is treated as a missing cell
        (presumably pandas' NaN placeholder for an empty field).
    rep_dict : dict
        Maps each word to its replacement text, e.g. ``{'one': '1',
        'plus': '+'}``.

    Returns
    -------
    int
        The sum of the terms, or ``0`` for a missing cell.

    Raises
    ------
    ValueError
        If the substituted text is not a ``+``-separated list of integers.
    """
    # isinstance (not an exact type comparison) so float subclasses such as
    # numpy.float64 are also recognised as missing values.
    if isinstance(string, float):
        return 0

    pattern = re.compile("|".join(re.escape(k) for k in rep_dict), re.M)
    expression = pattern.sub(lambda m: rep_dict[m.group(0)], string)

    # Accept only "<int> + <int> + ..." so nothing but plain addition is done.
    if not re.match(r'\s*\d+(?:\s*\+\s*\d+)*\s*\Z', expression):
        raise ValueError(
            'cannot evaluate {!r}: expected integers joined by "+", got {!r}'
            .format(string, expression)
        )
    return sum(int(term) for term in expression.split('+'))

# Column 7 holds a spelled-out sum per cell; replace each cell by its numeric
# total and cast the column to a numeric dtype.
df[7] = pd.to_numeric(df[7].apply(procRow, args=(rep_dict,)))

In [5]:
# 2 fold cross validation
# Estimate the mean absolute (L1) error of a gradient-boosted regressor with
# shuffled K-fold cross validation, then refit the model on all labelled rows.
n = 2
SEED = 0  # fixed so the fold assignment and the boosting are repeatable
cv = KFold(n_splits=n, shuffle=True, random_state=SEED)
regressor = GradientBoostingRegressor(random_state=SEED)

l1_errors = []

# Column 0 is the regression target; every remaining column is a feature.
X = np.array(df)[:,1:]
y = np.array(df[0])
for train, test in cv.split(X, y):
    y_hat = regressor.fit(X[train], y[train]).predict(X[test])
    l1_error = np.mean(np.abs(y[test] - y_hat))
    l1_errors.append(l1_error)

print(l1_errors)
print('Average L1 error in {} folds cross valudation: {}'.format(n, np.average(l1_errors)))

# The loop leaves `regressor` fitted on the last fold's training split only.
# Refit on every labelled row so later predictions use all available data.
# (Assigning the result keeps the estimator repr out of the cell output.)
regressor = regressor.fit(X, y)

[0.79760147452200125, 0.81922229072143704]
Average L1 error in 2 folds cross valudation: 0.8084118826217191


In [6]:
# Load the unlabelled test set (tab-separated, no header row) and preview it.
df2 = pd.read_csv(
    'PMChallenge2018/challenge1-unlabeledtestfile.txt',
    sep='\t',
    header=None,
)
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,107
0,0,two,two,four,99,75,89,one plus zero plus one,45,10,...,8,9,65,69,46,17,93,19,32,85
1,0,four,four,eight,1,40,77,one plus five plus two plus nine plus five plu...,0,56,...,13,34,90,41,24,71,83,61,50,91
2,0,four,zero,seven,49,65,71,three plus nine plus one plus eight plus seven...,14,96,...,80,18,55,32,59,93,14,79,25,82
3,0,five,one,six,54,89,32,zero plus five plus nine plus five plus five p...,7,39,...,89,99,71,82,55,25,65,90,13,40
4,0,seven,two,two,28,44,36,one plus eight plus three plus four plus zero ...,79,13,...,1,69,34,74,47,88,63,8,18,12


In [7]:
# Convert the word-encoded columns of the test frame to numbers: a single
# spelled-out digit in columns 1-3 and a spelled-out sum in column 7.
for col in (1, 2, 3):
    df2[col] = pd.to_numeric(df2[col].map(rep_dict))
spelled_sums = df2[7].apply(lambda cell: procRow(cell, rep_dict))
df2[7] = pd.to_numeric(spelled_sums)

# Column 0 is dropped so that only the feature columns reach the model.
test_features = np.array(df2)[:, 1:]
y_pred = regressor.predict(test_features)
print(y_pred.shape)

(10054,)


In [8]:
# Write the predictions to the score file, one value per line.
# The `with` block closes the file even if a write raises part-way through.
# NOTE(review): each line ends in an explicit '\r\n'. In text mode on Windows
# the '\n' is itself translated to '\r\n', which would yield '\r\r\n' --
# confirm which platform/line ending the scorer expects.
with open("PMChallenge2018/challenge1-score.txt", 'w') as f:
    for r in y_pred:
        f.write(str(r) + '\r\n')