In [18]:
import numpy as np
np.random.seed(123)
import pandas as pd
import xgboost as xgb
from datetime import datetime
from sklearn.metrics import mean_absolute_error
from sklearn.cross_validation import KFold
from scipy.stats import skew, boxcox
from sklearn.preprocessing import StandardScaler
import itertools
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split

In [2]:
shift = 200

In [3]:
COMB_FEATURE = 'cat80,cat87,cat57,cat12,cat79,cat10,cat7,cat89,cat2,cat72,' \
               'cat81,cat11,cat1,cat13,cat9,cat3,cat16,cat90,cat23,cat36,' \
               'cat73,cat103,cat40,cat28,cat111,cat6,cat76,cat50,cat5,' \
               'cat4,cat14,cat38,cat24,cat82,cat25'.split(',')

In [4]:
def encode(charcode):
    r = 0
    ln = len(str(charcode))
    for i in range(ln):
        r+= (ord(str(charcode)[i])) - ord('A')+1 *26 **(ln-i-1)
    return r

In [5]:
def mungeskewed(train, test, numeric_feats):
    ntrain = train.shape[0]
    test['loss'] = 0
    train_test = pd.concat((train, test)).reset_index(drop=True)
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewed_feats[abs(skewed_feats) > 0.25]
    skewed_feats = skewed_feats.index

    for feats in skewed_feats:
        train_test[feats] = train_test[feats] + 1
        train_test[feats], lam = boxcox(train_test[feats])
    return train_test, ntrain

In [73]:
train = pd.read_csv('C:\\hudsondata\\Machine Learning\\Kaggle\AllState\\train.csv',nrows=100000)
test = pd.read_csv('C:\\hudsondata\\Machine Learning\\Kaggle\AllState\\test.csv')
id_test = test['id'].values

In [74]:
numeric_feats = [x for x in train.columns[1:-1] if 'cont' in x]
categorical_feats = [x for x in train.columns[1:-1] if 'cat' in x]
train_test, ntrain = mungeskewed(train, test, numeric_feats)

In [75]:
for column in list(train.select_dtypes(include = ['object']).columns):
    if train[column].nunique() != test[column].nunique():
        set_train = set(train[column].unique())
        set_test = set(test[column].unique())
        remove_train = set_train - set_test
        remove_test = set_test - set_train
        remove = remove_train.union(remove_test)
        
        def filter_cat(x):
            if x in remove:
                return np.nan
            return x
        
        train_test[column] = train_test[column].apply(lambda x: filter_cat(x),1)

In [76]:
train_test['cont1'] = np.sqrt(minmax_scale(train_test['cont1']))
train_test['cont4'] = np.sqrt(minmax_scale(train_test['cont4']))
train_test['cont5'] = np.sqrt(minmax_scale(train_test['cont5']))
train_test['cont8'] = np.sqrt(minmax_scale(train_test['cont8']))
train_test['cont10'] = np.sqrt(minmax_scale(train_test['cont10']))
train_test['cont11'] = np.sqrt(minmax_scale(train_test['cont11']))
train_test['cont12'] = np.sqrt(minmax_scale(train_test['cont12']))

In [77]:
train_test['cont6'] = np.log(minmax_scale(train_test['cont6'])+ 0000.1)
train_test['cont7'] = np.log(minmax_scale(train_test['cont7'])+ 0000.1)
train_test['cont9'] = np.log(minmax_scale(train_test['cont9'])+ 0000.1)
train_test['cont13'] = np.log(minmax_scale(train_test['cont13'])+ 0000.1)
train_test["cont14"] = (np.maximum(train_test["cont14"] - 0.179722, 0) / 0.665122) ** 0.25

In [78]:
for col in categorical_feats:
    train_test[col] = train_test[col].apply(encode)

In [12]:
for comb in itertools.combinations(COMB_FEATURE,2):
    feat = comb[0] + "_" + comb[1]
    train_test[feat] = train_test[comb[0]] + train_test[comb[1]]
    train_test[feat] = train_test[feat].apply(encode)

In [79]:
ss = StandardScaler()

In [80]:
train_test[numeric_feats] = ss.fit_transform(train_test[numeric_feats].values)

In [81]:
train = train_test.iloc[:ntrain,:].copy()
test = train_test.iloc[ntrain:,:].copy()

In [82]:
ids = pd.read_csv('C:\\hudsondata\\Machine Learning\\Kaggle\AllState\\test.csv')['id']

In [83]:
train_y = np.log(train['loss'] + shift)
train_x = train.drop(['loss','id'], axis=1)
test_x = test.drop(['loss','id'], axis=1)

In [92]:
model = ElasticNet(alpha=.0001, selection = 'random',random_state = 0,max_iter = 10000

In [89]:
X_train, X_test, y_train,y_test = train_test_split(train_x, train_y, test_size = 0.2, random_state = 0)

In [93]:
model.fit(X_train,y_train)

ElasticNet(alpha=0.0001, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=10000, normalize=False, positive=False, precompute=False,
      random_state=0, selection='random', tol=0.0001, warm_start=False)

In [94]:
mean_absolute_error(np.exp(y_test), np.exp(model.predict(X_test)))

1288.1239113218116

In [96]:
from sklearn.linear_model import Ridge

In [106]:
model2 = Ridge(alpha =.0001, random_state=0)

In [107]:
model2.fit(X_train,y_train)

Ridge(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=0, solver='auto', tol=0.001)

In [108]:
mean_absolute_error(np.exp(y_test), np.exp(model2.predict(X_test)))

1287.9467545282296

In [109]:
from sklearn.linear_model import LinearRegression

In [112]:
lr = LinearRegression()

In [113]:
lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [114]:
mean_absolute_error(np.exp(y_test), np.exp(lr.predict(X_test)))

1287.9446530309183