In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from plotly.graph_objs import * 
import plotly.plotly as py
import plotly.offline as pyo
pyo.offline.init_notebook_mode()


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


Load data

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv",)
test["loss"]=np.nan


Let's see how target distributed.

In [3]:
trace = {'type' : 'histogram',
             'x' : train["loss"],
             'histnorm':'count',
             'name' : 'F_p2',
             'xbins' :{'size' : 1},
             'marker': {'color' : '#d81d1399'}
        }

layout = {
    'title' :'loss distribution',
    'xaxis':{
        'title': 'loss',
        'nticks':100
        },
    'yaxis':{
        'title':'Count'},
    'bargap' : 0.2,
    'bargroupgap' :0.1
}
fig = Figure(data=[trace], layout=layout)
py.iplot(fig, filename='styled histogram of F_P2 value')


Woah there! Look at all those points! Due to browser limitations, the Plotly SVG drawing functions have a hard time graphing more than 500k data points for line charts, or 40k points for other types of charts. Here are some suggestions:
(1) Use the `plotly.graph_objs.Scattergl` trace object to generate a WebGl graph.
(2) Trying using the image API to return an image instead of a graph URL
(3) Use matplotlib
(4) See if you can create your visualization with fewer data points




Highly right skewed!

In [4]:
train["loss"].head()

0    2213.18
1    1283.60
2    3005.09
3     939.85
4    2763.85
Name: loss, dtype: float64

In [5]:
trans_y = np.log(train["loss"])

Let's make log transfromation.

In [6]:
trans_y.head()

0    7.702186
1    7.157424
2    8.008063
3    6.845720
4    7.924380
Name: loss, dtype: float64

In [7]:
trace.update({'x':trans_y})
fig = Figure(data=[trace], layout=layout)

In [8]:
py.iplot(fig, filename='styled histogram of F_P2 value')


Woah there! Look at all those points! Due to browser limitations, the Plotly SVG drawing functions have a hard time graphing more than 500k data points for line charts, or 40k points for other types of charts. Here are some suggestions:
(1) Use the `plotly.graph_objs.Scattergl` trace object to generate a WebGl graph.
(2) Trying using the image API to return an image instead of a graph URL
(3) Use matplotlib
(4) See if you can create your visualization with fewer data points




Now, it is normal like!

In [9]:
np.exp(trans_y).head()

0    2213.18
1    1283.60
2    3005.09
3     939.85
4    2763.85
Name: loss, dtype: float64

update target variable to log transformation.

In [10]:
train.loss = trans_y

See the summary of the train and test data before fit into model

In [29]:
train.describe()

Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
count,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0
mean,294135.982561,0.493861,0.507188,0.498918,0.491812,0.487428,0.490945,0.48497,0.486437,0.485506,0.498066,0.493511,0.49315,0.493138,0.495717,7.685268
std,169336.084867,0.18764,0.207202,0.202105,0.211292,0.209027,0.205273,0.17845,0.19937,0.18166,0.185877,0.209737,0.209427,0.212777,0.222488,0.811868
min,1.0,1.6e-05,0.001149,0.002634,0.176921,0.281143,0.012683,0.069503,0.23688,8e-05,0.0,0.035321,0.036232,0.000228,0.179722,-0.400478
25%,147748.25,0.34609,0.358319,0.336963,0.327354,0.281143,0.336105,0.350175,0.3128,0.35897,0.36458,0.310961,0.311661,0.315758,0.29461,7.093787
50%,294539.5,0.475784,0.555782,0.527991,0.452887,0.422268,0.440945,0.438285,0.44106,0.44145,0.46119,0.457203,0.462286,0.363547,0.407403,7.65708
75%,440680.5,0.623912,0.681761,0.634224,0.652072,0.643315,0.655021,0.591045,0.62358,0.56682,0.61459,0.678924,0.675759,0.689974,0.724623,8.25947
max,587633.0,0.984975,0.862654,0.944251,0.954297,0.983674,0.997162,1.0,0.9802,0.9954,0.99498,0.998742,0.998484,0.988494,0.844848,11.703647


In [12]:
test.describe()

Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
count,125546.0,125546.0,125546.0,125546.0,125546.0,125546.0,125546.0,125546.0,125546.0,125546.0,125546.0,125546.0,125546.0,125546.0,125546.0,0.0
mean,294067.153442,0.494447,0.506939,0.498255,0.492334,0.48764,0.492188,0.485945,0.487401,0.486015,0.498909,0.49436,0.494033,0.495086,0.495586,
std,170098.335649,0.187961,0.206837,0.201746,0.210815,0.209119,0.205574,0.17865,0.199549,0.182134,0.18595,0.210399,0.210148,0.213108,0.222613,
min,4.0,1.6e-05,0.001149,0.002634,0.176921,0.281143,0.012683,0.069503,0.23688,8e-05,0.0,0.035321,0.036232,0.000228,0.178568,
25%,146414.25,0.347403,0.358319,0.336963,0.327354,0.281143,0.336105,0.352087,0.31796,0.35897,0.36458,0.310961,0.318249,0.315758,0.294752,
50%,294306.0,0.475784,0.555782,0.527991,0.452887,0.422268,0.441525,0.438893,0.44106,0.44145,0.46672,0.457203,0.462286,0.363547,0.406104,
75%,441800.0,0.62663,0.681761,0.634224,0.652072,0.643315,0.659261,0.591284,0.62918,0.56889,0.61984,0.678924,0.682413,0.689974,0.724791,
max,587634.0,0.984975,0.862654,0.944251,0.956046,0.983107,0.997162,1.0,0.9828,0.9954,0.99498,0.99783,0.997416,0.988494,0.844814,


In [13]:
train.shape

(188318, 132)

In [14]:
test.shape

(125546, 132)

In [15]:
train.head(3)

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,7.702186
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,7.157424
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,8.008063


In [16]:
test.head(3)

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,4,A,B,A,A,A,A,A,A,B,...,0.466591,0.317681,0.61229,0.34365,0.38016,0.377724,0.369858,0.704052,0.392562,
1,6,A,B,A,B,A,A,A,A,B,...,0.482425,0.44376,0.7133,0.5189,0.60401,0.689039,0.675759,0.453468,0.208045,
2,9,A,B,A,B,B,A,B,A,B,...,0.212308,0.325779,0.29758,0.34365,0.30529,0.24541,0.241676,0.258586,0.297232,


concat train and test data into one data set.

In [17]:
data = pd.concat([train,test], axis = 0, join = 'outer', ignore_index = True)

In [18]:
col = list(data.columns)
col = col[1:]

In [19]:
col.append("id")

In [20]:
data = data[col]

In [21]:
data.head(3)

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss,id
0,A,B,A,B,A,A,A,A,B,A,...,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,7.702186,1
1,A,B,A,A,A,A,A,A,B,B,...,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,7.157424,2
2,A,B,A,A,B,A,A,A,B,B,...,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,8.008063,5


fill na.

In [22]:
data.ix[:,"cat1":"cat116"] = data.ix[:,"cat1":"cat116"].fillna("NA")



.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated



encode categorical variable to dummy variable.

In [23]:
labelencoder = LabelEncoder()

In [24]:
data.ix[:,"cat1":"cat116"] = data.ix[:,"cat1":"cat116"].apply(lambda x: labelencoder.fit_transform(x) ) 

In [25]:
onehotencoder = OneHotEncoder(categorical_features = list(range(116)))

In [26]:
data.shape

(313864, 132)

In [27]:
data2 = onehotencoder.fit_transform(data.ix[:,0:130]).toarray()

In [28]:
data2.shape

(313864, 1190)

In [31]:
data2 = np.hstack((data2,data.ix[:,130:]))

In [32]:
data2.shape

(313864, 1192)

split into train and test set

In [33]:
train = data2[0:188318]

In [34]:
test = data2[188318:]

In [35]:
train.shape

(188318, 1192)

In [36]:
test.shape

(125546, 1192)

In [37]:
test[:,-1] = test[:,-1].astype("int")

In [38]:
from sklearn.linear_model import LinearRegression

In [39]:
train[0,-2]

7.7021856742941663

In [40]:
train[:,-1]

array([  1.00000000e+00,   2.00000000e+00,   5.00000000e+00, ...,
         5.87630000e+05,   5.87632000e+05,   5.87633000e+05])

In [41]:
X = train[:,0:-2]

In [42]:
Y  = train[:,-2]

In [43]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X, Y)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [44]:
test[:,-2]

array([ nan,  nan,  nan, ...,  nan,  nan,  nan])

Predict target "loss"!

In [45]:
test[:,-2] = regressor.predict(test[:,0:-2])

In [46]:
test[:,-2]

array([ 7.28710938,  7.51367188,  9.34765625, ...,  7.90283203,
        6.86328125,  8.22412109])

In [47]:
pred =pd.DataFrame(test[:,-2:],columns = ["loss","id"])

In [48]:
pred = pred[["id","loss"]]

smooth extreme value

In [49]:
def smooth(x,a):
    if x> 7.685268+a*0.811868:
        x = 7.685268+a*0.811868
    elif x< 7.685268-a*0.811868:
        x = 7.685268-a*0.811868
    return x

Smooth method: all oberservation should be 95% range of normal distribution

Z(a=0.05)=1.96

In [50]:
pred["loss"]=pred["loss"].apply(lambda x:smooth(x,1.96))

In [51]:
pred["loss"]= np.exp(pred["loss"])

In [52]:
pred["loss"].max()

10684.285328258205

In [53]:
pred["loss"].min()

443.19361112611062

In [54]:
pred["id"] = pred["id"].astype("int")

In [55]:
pred.describe()

Unnamed: 0,id,loss
count,125546.0,125546.0
mean,294067.153442,2598.112085
std,170098.335649,1860.365441
min,4.0,443.193611
25%,146414.25,1424.007657
50%,294306.0,1991.572263
75%,441800.0,3016.096576
max,587634.0,10684.285328


output to csv file.

In [56]:
pred.to_csv("pred.csv",index= False)