In [758]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score

import datetime

%matplotlib inline

# Get data

In [759]:
cat_data = pd.read_csv('markedcats_13_12.csv')
cat_price=pd.read_csv('clear_price190k.csv')

with open("fancy14_12.txt") as file:
    lines = [line.strip() for line in file]
fancy = list(map(int, lines))

In [760]:
cat_data.head()

Unnamed: 0,num,isGestating,isReady,cooldownIndex,nextActionAt,siringWithId,birthTime,matronId,sireId,generation,genes,breed,prim_col,sec_col,pat_col,pat_type,eye_type,eye_col,mouth
0,0,False,True,0,0,0,1511415679,0,0,0,0xffffffffffffffffffffffffffffffffffffffffffff...,ragdoll,unknown,unknown,unknown,totesbasic,unknown,unknown,soserious
1,1,False,True,0,0,0,1511417999,0,0,0,0x5ad2b318e6724ce4b9290146531884721ad18c63298a...,sphynx,orangesoda,granitegrey,lemonade,jaguar,simple,sizzurp,pouty
2,2,False,True,0,0,0,1511417999,0,0,0,0x5a50b310c470c614b90f294a321086318c618c63294e...,himalayan,orangesoda,kittencream,blurple,jaguar,simple,chestnut,soserious
3,3,False,True,0,0,0,1511417999,0,0,0,0x4ad09294e5738ce42129294a5210c43190e18c63298e...,himalayan,orangesoda,granitegrey,chocolate,jaguar,simple,strawberry,pouty
4,4,False,True,2,4675193,0,1511417999,0,0,0,0x5ad2b214e5085ce7210f0140531084739ac18c63394a...,ragamaffin,orangesoda,granitegrey,coffee,jaguar,thicccbrowz,chestnut,soserious


In [761]:
cat_price.shape

(131510, 3)

In [762]:
#remove unformatted data
cat_data=cat_data.drop(cat_data.index[0])

In [763]:
cat_data.shape

(284964, 19)

#### Mark or remove fancy

In [764]:
# check for fancy in later cells
cat_data['fancy']=cat_data['num'].apply(lambda x: 1 if x in fancy else 0)

cat_data[cat_data['fancy']==1].shape

(5438, 20)

In [765]:
#cat_data = cat_data[~cat_data['num'].isin(fancy)]

In [766]:
cat_data.shape

(284964, 20)

## Del or format features

#### Del gen 0

In [767]:
cat_data = cat_data[cat_data.generation != 0]
#cat_data = cat_data[cat_data.generation != 1]
#cat_data = cat_data[cat_data.generation != 2]

In [768]:
cat_data.shape

(280723, 20)

#### Del unnecessary features

In [769]:
cat_data=cat_data.drop('nextActionAt',axis=1)
cat_data=cat_data.drop('genes',axis=1)
#cat_data=cat_data.drop('siringWithId',axis=1)
cat_data=cat_data.drop('matronId',axis=1) #будут присутствовать у всех generation>0
cat_data=cat_data.drop('sireId',axis=1) #будут присутствовать у всех generation>0
#cat_data=cat_data.drop('birthTime',axis=1) #возможно стоит перевести в datetime и оставить день-месяц


In [770]:
cat_data.shape

(280723, 16)

#### Format true/false

In [771]:
cat_data.isGestating = cat_data.isGestating.astype(int)
cat_data.isReady = cat_data.isReady.astype(int)

In [772]:
# siringwith 
cat_data['siringWithId']=cat_data['siringWithId'].apply(lambda x: 1 if x!=0 else 0)

In [773]:
cat_data['birthTime'] = cat_data['birthTime'].apply(datetime.datetime.fromtimestamp)

## Ohe

#### label ohe

In [774]:
#check fancy
cat_df=pd.concat([cat_data[['num', 'isGestating', 'isReady','siringWithId','cooldownIndex', 'generation','fancy','birthTime']], pd.get_dummies(cat_data[['breed',
       'prim_col', 'sec_col', 'pat_col', 'pat_type', 'eye_type', 'mouth',
       'eye_col']])], axis=1)

#### ohe

In [775]:
temp=pd.get_dummies(cat_data['cooldownIndex'],prefix='cd')
cat_df=pd.concat([cat_df,temp],axis=1)
cat_df=cat_df.drop('cooldownIndex',axis=1)

In [776]:
temp2=pd.get_dummies(cat_data['generation'],prefix='gen')
cat_df=pd.concat([cat_df,temp2],axis=1)
cat_df=cat_df.drop('generation',axis=1)

## FEATURES

### rare

In [777]:
cat_df['dateBirth']=cat_df['birthTime'].apply(lambda x : x.date())

In [778]:
rare_feats=['eye_type_wingtips','pat_col_wolfgrey','breed_mainecoon','prim_col_oldlace',\
            'pat_col_cerulian','pat_col_violet','eye_type_fabulous','breed_chartreux',\
            'pat_type_jaguar','mouth_gerbil','prim_col_cottoncandy','mouth_dali','mouth_whixtensions',\
            'eye_col_bubblegum','sec_col_peach','eye_type_otaku','pat_type_tigerpunk','mouth_beard']

In [779]:
df_cs=pd.DataFrame(columns = rare_feats)
for i in rare_feats:
    df_c=cat_df.groupby(cat_df.dateBirth)[[i]].sum()
    df_cs[i]= df_c[i].cumsum()/cat_df.shape[0]
df_cs.columns=[s + '_cs' for s in rare_feats]
df_cs['dateBirth']=df_cs.index

In [780]:
merged_rare = pd.merge(left=cat_df,right=df_cs, how='left', left_on='dateBirth', right_on='dateBirth')

### common

In [781]:
common_feats=['pat_type_totesbasic', 'sec_col_granitegrey', 'sec_col_kittencream', \
              'pat_type_luckystripe', 'eye_type_thicccbrowz', 'eye_type_crazy', 'mouth_pouty',\
             'breed_himalayan', 'eye_col_sizzurp', 'breed_ragamaffin', 'mouth_happygokitty', 'mouth_soserious',\
              'eye_col_strawberry','prim_col_orangesoda','prim_col_aquamarine','mouth_saycheese','breed_munchkin',\
              'eye_type_raisedbrow','eye_type_simple','eye_col_topaz','breed_sphynx','pat_col_chocolate']

In [782]:
df_cs2=pd.DataFrame(columns = common_feats)
for i in common_feats:
    df_c2=cat_df.groupby(cat_df.dateBirth)[[i]].sum()
    df_cs2[i]= df_c2[i].cumsum()/cat_df.shape[0]
df_cs2.columns=[s + '_cs' for s in common_feats]
df_cs2['dateBirth']=df_cs2.index

In [783]:
merged_rare_common = pd.merge(left=merged_rare,right=df_cs2, how='left', left_on='dateBirth', right_on='dateBirth')

In [784]:
merged_rare_common.shape

(280723, 225)

In [785]:
cat_df=merged_rare_common

## Merge kitties features with kitties prices

In [786]:
cat_price.shape

(131510, 3)

In [787]:
#cat_price= cat_price.drop('Unnamed: 0', axis=1)

In [788]:
df= pd.merge(cat_df, cat_price, left_on = 'num', right_on = 'cat_id')
df.shape

(124827, 228)

## New date features

In [789]:
df['dateSold'] = df['timeStamp'].apply(datetime.datetime.fromtimestamp)

In [790]:
df['hourSold']=df['dateSold'].apply(lambda x: x.hour)

In [791]:
#list(set(df.hourSold))

In [792]:
df['weekday']=df['dateSold'].apply(lambda x: x.isoweekday())

In [793]:
df['weekend']=df['weekday'].apply(lambda x: 1 if x in [6,7] else 0)

In [794]:
df['evening']=df['hourSold'].apply(lambda x: 0 if x in range(3,18) else 1)

In [795]:
df['age']=df['dateSold'].apply(lambda x : x.date())-df['birthTime'].apply(lambda x : x.date())

In [796]:
df['age']=df['age']/np.timedelta64(1, 'D')
df['age']=df['age'].apply(lambda x: int(x))

#### ohe

In [797]:
temp=pd.get_dummies(df['hourSold'],prefix='h')
df=pd.concat([df,temp],axis=1)
df=df.drop('hourSold',axis=1)
temp2=pd.get_dummies(df['weekday'],prefix='wd')
df=pd.concat([df,temp2],axis=1)
df=df.drop('weekday',axis=1)

### Median price by day

In [798]:
df['date']=df['dateSold'].apply(lambda x : x.date())
df_medSold=df.groupby(df.dateSold.dt.date)[['soldPrice']].median()
df_medSold['date'] = df_medSold.index
df_medSold=df_medSold.rename(columns = {'soldPrice':'medPrice'})

### Mean price by day

In [799]:
df_meanSold=df.groupby(df.dateSold.dt.date)[['soldPrice']].mean()
df_meanSold['date'] = df_meanSold.index
df_meanSold=df_meanSold.rename(columns = {'soldPrice':'meanPrice'})

In [800]:
df_medSold

Unnamed: 0_level_0,medPrice,date
dateSold,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-11-24,8.16618,2017-11-24
2017-11-25,4.265795,2017-11-25
2017-11-26,2.289915,2017-11-26
2017-11-27,2.378905,2017-11-27
2017-11-28,2.407653,2017-11-28
2017-11-29,4.841811,2017-11-29
2017-11-30,4.492083,2017-11-30
2017-12-01,2.56038,2017-12-01
2017-12-02,2.327668,2017-12-02
2017-12-03,8.635878,2017-12-03


## Merge medianPrice, meanPrice with data

In [801]:
merged_left = pd.merge(left=df,right=df_medSold, how='left', left_on='date', right_on='date')
merged = pd.merge(left=merged_left,right=df_meanSold, how='left', left_on='date', right_on='date')

In [802]:
#merged['medDif']=merged['soldPrice']-merged['medPrice']

In [803]:
df=merged
df.shape

(124827, 266)

In [804]:
df = df.drop('num',axis=1)
df = df.drop('cat_id',axis=1)
df = df.drop('timeStamp',axis=1)
df = df.drop('dateSold',axis=1)

In [805]:
#df = df.drop('medPrice',axis=1)
df = df.drop('meanPrice',axis=1)
df = df.drop('date',axis=1)
df = df.drop('dateBirth',axis=1)

In [806]:
#df = df.drop('medDif',axis=1)
df = df.drop('birthTime',axis=1)

In [807]:
df['age']=np.log1p(df['age']+1)
df['medPrice']=np.log1p(df['medPrice'])

# OUTLIERS

In [808]:
%%time
out_train = df[['soldPrice']]
outliers_lst = list()
# Для каждого признака найдем слишком высокое или низкое значение
for feature in out_train.keys():
    
    # TODO: Считаем Q1 (25 процентов) 
    Q1 = np.percentile(out_train[feature],15)
    
    # TODO: Считаем Q1 (75 процентов)
    Q3 = np.percentile(out_train[feature],85)
    
    # TODO: Интерквартильный размах
    step = 1.5*(Q3-Q1)
    
    # Отображаем выбросы
    #print ("Возможные выбросы у признака '{}':".format(feature))
    #display(out_train[~((out_train[feature] >= Q1 - step) & (out_train[feature] <= Q3 + step))])
    
    for i in out_train[~((out_train[feature] >= Q1 - step) & (out_train[feature] <= Q3 + step))].index:
        outliers_lst.append(i)
    print(Q1,Q3)



7.76278392909 55.7212612389
Wall time: 43 ms


In [809]:
%%time
# OPTIONAL: Выбираем выбросы от которых хотим избавиться
outliers  = []


outliers_dct = {x:outliers_lst.count(x) for x in outliers_lst}
for i in outliers_dct.keys():
    if outliers_dct[i]>0:
        outliers.append(i)
#print (outliers)
print (len(outliers))

7314
Wall time: 991 ms


In [810]:
good_data = df.drop(df.index[outliers]).reset_index(drop = True)

In [811]:
good_data.shape

(117513, 258)

### rename

In [812]:
df= good_data

# Model

In [813]:
y_train = np.log1p(df.soldPrice) #skrewed price distribution

In [814]:
X_train = df.drop('soldPrice', axis =1)

In [815]:
from sklearn.linear_model import Ridge

train_X, valid_X, train_y, valid_y = train_test_split(X_train, y_train, test_size = 0.1, random_state = 42)
model = Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='sag', tol=0.001)
model.fit(train_X,train_y)

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='sag', tol=0.001)

In [816]:
model = GridSearchCV(Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='sag', tol=0.001), cv=3,
                   param_grid={"alpha": [0.3, 0.4, 0.5]})

In [817]:
model.fit(train_X,train_y)

GridSearchCV(cv=3, error_score='raise',
       estimator=Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='sag', tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [0.3, 0.4, 0.5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [818]:
pred = model.predict(valid_X)

In [819]:
valid = np.expm1(valid_y)
pred= np.expm1(pred)

In [820]:
from sklearn.metrics import mean_squared_error
from math import sqrt
rms = sqrt(mean_squared_error(valid, pred))

In [821]:
rms 

16.286373325989857

In [607]:
for x,y in zip(valid,pred):
    print(x,y)

8.59208966073 4.95712883326
4.99404 5.76556008408
7.55551118056 12.8655395755
16.2237728993 18.2852884665
28.0321376769 14.2606432176
6.97882764641 3.17084280079
12.99003659 16.9109897673
10.3468179056 16.089377064
9.52256 16.6191713601
82.3837109896 32.7515734195
12.0608160937 11.4563690932
26.7155613966 22.034060757
8.22045864444 5.64042432125
5.72212262787 7.51140676185
44.9758711163 71.5855729709
13.3321491098 18.1328219968
36.3096944583 30.0236540784
19.5849810221 26.3025420882
12.6032438802 13.7838149747
60.9067438472 55.8965187412
95.6225702488 106.537199026
15.5864794222 25.0732230085
11.778747588 20.5208380932
37.2539958064 33.6847059814
17.3796791037 20.13200209
9.61432 5.31222578827
30.8755273531 21.7758186355
27.9793571677 13.4202513109
10.0128195174 6.78094679556
10.8983858937 24.4633880395
9.32980329051 22.7518855038
2.34010808625 2.49535258511
14.1900574653 19.3882408133
15.1945772385 28.8386910607
9.16367348148 5.81553446612
5.28576690495 4.06468850181
1818.82906813 101

In [136]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
 
lr = LinearRegression()
#select 5 the most informative features
rfe = RFE(lr, 20) 
selector = rfe.fit(train_X,train_y)

In [137]:
names = list(train_X.columns)
print (sorted(zip(map(lambda x: round(x, 4), selector.ranking_), names)))


[(1, 'gen_14'), (1, 'gen_15'), (1, 'gen_16'), (1, 'gen_17'), (1, 'gen_18'), (1, 'gen_21'), (1, 'gen_22'), (1, 'gen_23'), (1, 'gen_24'), (1, 'gen_25'), (1, 'gen_28'), (1, 'gen_32'), (1, 'wd_1'), (1, 'wd_2'), (1, 'wd_3'), (1, 'wd_4'), (1, 'wd_5'), (1, 'wd_6'), (1, 'wd_7'), (1, 'weekend'), (2, 'gen_20'), (3, 'gen_19'), (4, 'gen_13'), (5, 'gen_12'), (6, 'gen_11'), (7, 'gen_10'), (8, 'gen_9'), (9, 'gen_8'), (10, 'gen_7'), (11, 'gen_26'), (12, 'gen_6'), (13, 'gen_5'), (14, 'gen_4'), (15, 'gen_27'), (16, 'gen_3'), (17, 'gen_1'), (18, 'gen_2'), (19, 'gen_34'), (20, 'gen_35'), (21, 'gen_31'), (22, 'gen_39'), (23, 'gen_38'), (24, 'gen_45'), (25, 'gen_49'), (26, 'gen_50'), (27, 'h_0'), (28, 'gen_58'), (29, 'h_23'), (30, 'h_20'), (31, 'h_21'), (32, 'h_1'), (33, 'h_18'), (34, 'evening'), (35, 'h_19'), (36, 'h_22'), (37, 'h_2'), (38, 'gen_30'), (39, 'gen_42'), (40, 'gen_41'), (41, 'h_7'), (42, 'h_8'), (43, 'h_17'), (44, 'h_16'), (45, 'h_14'), (46, 'h_9'), (47, 'h_15'), (48, 'h_10'), (49, 'h_13'), (5

In [71]:
from sklearn.linear_model import RandomizedLasso

Y = np.log1p(df.soldPrice) 
X = df.drop('soldPrice', axis =1)
names = [X.columns]
 
rlasso = RandomizedLasso(alpha=0.025)
rlasso.fit(X, Y)
 
print ("Features sorted by their score:")
print (sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), 
                 names), reverse=True)
)



Features sorted by their score:
[(0.0, Index(['isGestating', 'isReady', 'siringWithId', 'fancy', 'breed_chartreux',
       'breed_cymric', 'breed_himalayan', 'breed_laperm', 'breed_mainecoon',
       'breed_munchkin',
       ...
       'h_22', 'h_23', 'wd_1', 'wd_2', 'wd_3', 'wd_4', 'wd_5', 'wd_6', 'wd_7',
       'medPrice'],
      dtype='object', length=257))]


In [69]:
"""from sklearn.linear_model import (LinearRegression, Ridge, 
                                  Lasso, RandomizedLasso)
from sklearn.feature_selection import RFE, f_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from minepy import MINE
 
np.random.seed(42)

Y = np.log1p(df.soldPrice) #skrewed price distribution
X = df.drop('soldPrice', axis =1)
 
names = [X.columns]
 
ranks = {}
 
def rank_to_dict(ranks, names, order=1):
    minmax = MinMaxScaler()
    ranks = minmax.fit_transform(order*np.array([ranks]).T).T[0]
    ranks = map(lambda x: round(x, 2), ranks)
    return dict(zip(names, ranks ))
 
lr = LinearRegression(normalize=True)
lr.fit(X, Y)
ranks["Linear reg"] = dict(zip(np.abs(lr.coef_), names)
 
ridge = Ridge(alpha=7)
ridge.fit(X, Y)
ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)
 
lasso = Lasso(alpha=.05)
lasso.fit(X, Y)
ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)
 
rlasso = RandomizedLasso(alpha=0.04)
rlasso.fit(X, Y)
ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)
 
#stop the search when 5 features are left (they will get equal scores)
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(X,Y)
ranks["RFE"] = rank_to_dict(map(float, rfe.ranking_), names, order=-1)
 
rf = RandomForestRegressor()
rf.fit(X,Y)
ranks["RF"] = rank_to_dict(rf.feature_importances_, names)
 
f, pval  = f_regression(X, Y, center=True)
ranks["Corr."] = rank_to_dict(f, names)
 
mine = MINE()
mic_scores = []
for i in range(X.shape[1]):
    mine.compute_score(X[:,i], Y)
    m = mine.mic()
    mic_scores.append(m)
    
ranks["MIC"] = rank_to_dict(mic_scores, names) 
 
r = {}
for name in names:
    r[name] = round(np.mean([ranks[method][name] 
                             for method in ranks.keys()]), 2)

methods = sorted(ranks.keys())
ranks["Mean"] = r
methods.append("Mean")
 
print ("\t%s" % "\t".join(methods))
for name in names:
    print ("%s\t%s" % (name, "\t".join(map(str, 
                         [ranks[method][name] for method in methods]))))
""";

# Model for last days

In [822]:
df=merged

In [823]:
df.shape

(124827, 266)

In [824]:
df.head()
df = df.sort_values('date')

In [826]:
df.tail()

Unnamed: 0,num,isGestating,isReady,siringWithId,fancy,birthTime,breed_chartreux,breed_cymric,breed_himalayan,breed_laperm,...,wd_1,wd_2,wd_3,wd_4,wd_5,wd_6,wd_7,date,medPrice,meanPrice
70275,138555,0,1,0,0,2017-12-08 02:11:47,0,1,0,0,...,0,0,0,1,0,0,0,2017-12-14,6.84982,20.360632
111212,241107,0,1,0,0,2017-12-11 14:30:53,0,0,0,0,...,0,0,0,1,0,0,0,2017-12-14,6.84982,20.360632
65804,129805,0,1,0,0,2017-12-07 17:33:01,0,0,0,0,...,0,0,0,1,0,0,0,2017-12-14,6.84982,20.360632
19839,40396,0,1,0,0,2017-12-03 12:52:41,0,0,0,0,...,0,0,0,1,0,0,0,2017-12-14,6.84982,20.360632
124826,284964,0,1,0,0,2017-12-13 09:18:55,0,0,1,0,...,0,0,0,1,0,0,0,2017-12-14,6.84982,20.360632


In [745]:
df = df.drop('num',axis=1)
df = df.drop('cat_id',axis=1)
df = df.drop('timeStamp',axis=1)
df = df.drop('dateSold',axis=1)

In [746]:
#df = df.drop('medPrice',axis=1)
df = df.drop('meanPrice',axis=1)
df = df.drop('date',axis=1)
df = df.drop('dateBirth',axis=1)

In [747]:
#df = df.drop('medDif',axis=1)
df = df.drop('birthTime',axis=1)

In [748]:
df['age']=np.log1p(df['age']+1)
df['medPrice']=np.log1p(df['medPrice'])

In [749]:
#total 124820
df_last = df[100000:]


In [750]:
df_last.shape

(24827, 258)

In [751]:
y_train = np.log1p(df_last.soldPrice) #skrewed price distribution

In [752]:
X_train = df_last.drop('soldPrice', axis =1)

In [753]:
from sklearn.linear_model import Ridge

train_X, valid_X, train_y, valid_y = train_test_split(X_train, y_train, test_size = 0.1, random_state = 42)
model = Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='sag', tol=0.001)
model.fit(train_X,train_y)

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='sag', tol=0.001)

In [754]:
pred = model.predict(valid_X)

In [755]:
valid = np.expm1(valid_y)
pred= np.expm1(pred)

In [756]:
rms = sqrt(mean_squared_error(valid, pred))

In [757]:
rms 

48.65104082545659

In [662]:
for x,y in zip(valid,pred):
    print(x,y)

15.2853578358 17.1367839062
5.52780429167 4.58205368705
4.8949453109 10.0351329229
7.630018 6.60730047129
4.75208550667 6.23261827799
12.224616 12.8598866691
8.661548 17.3132736898
6.68072741722 4.74225308945
0.959314465 14.7849626866
6.41997409424 6.53128935402
173.248471884 100.70975682
5.66621850694 10.8895422549
10.7226453398 13.8593001251
19.5492301514 10.3731695089
6.20605365799 7.12019607244
16.6651067115 32.0432166091
54.1728230361 47.4260204668
54.8327339111 78.8532766537
32.2913903667 13.4415599784
5.35440011667 11.4403287642
20.43045 16.7053321261
22.6681767531 20.8954055769
13.3973385496 12.7431704759
28.3210431815 10.339663687
6.61246634271 5.69408336215
5.98115297889 6.03443007588
42.4513055375 8.41462787613
20.056010107 17.4960550779
7.92715635556 17.0931417319
6.04576020527 7.00158432533
14.3562999347 15.7768446909
14.5524978469 22.4750162899
6.01997325417 5.53412738091
5.18457513287 4.17654898989
5.9081546875 7.5233821788
50.5447084306 39.8165716456
7.95910932437 13.30

# CV

In [664]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

clf = Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='sag', tol=0.001)
train_X, valid_X, train_y, valid_y = train_test_split(X_train, y_train, test_size = 0.1, random_state = 42)

scores = cross_val_score(clf, train_X,train_y, cv=5, scoring='neg_mean_squared_error')


array([-0.30689744, -0.28276569, -0.30320546, -0.28507139, -0.28614697])

In [667]:
from sklearn.model_selection import GridSearchCV
model = GridSearchCV(Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='sag', tol=0.001), cv=3,
                   param_grid={"alpha": [0.3, 0.4, 0.5]})
model.fit(train_X,train_y)

GridSearchCV(cv=3, error_score='raise',
       estimator=Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='sag', tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [0.3, 0.4, 0.5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [672]:
model.best_estimator_

Ridge(alpha=0.3, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='sag', tol=0.001)

In [670]:
pred = model.predict(valid_X)
valid = np.expm1(valid_y)
pred= np.expm1(pred)
valid = np.expm1(valid_y)
pred= np.expm1(pred)
res =sqrt(mean_squared_error(valid, pred))
print (res)

inf


  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
